├── .gitignore ├── README.md ├── horovod-images ├── DockerfileAnalyticHorovodGpu ├── DockerfileDriverHorovodGpu ├── DockerfileExecutorHorovodGpu ├── README.md └── mlflow_venv │ └── conda.yaml ├── mlflow_mlprojects ├── mlflow-devEnv-archetype │ ├── Readme.md │ ├── __init__.py │ ├── clean_venvs_mlflow_runs.sh │ ├── create_vEnv_mlproject_command.sh │ ├── create_vEnv_spark_inference.sh │ ├── data │ │ └── train.csv │ ├── doc │ │ └── .keep │ ├── launcher_mlproject_command.py │ ├── launcher_spark_inference.py │ ├── mlflow_runs │ │ └── .keep │ ├── mlproject │ │ ├── MLproject │ │ ├── __init__.py │ │ ├── conda.yaml │ │ └── train.py │ ├── spark_inference │ │ ├── conda.yaml │ │ └── model │ │ │ └── .keep │ ├── utils │ │ └── install_conda.sh │ └── venvs │ │ ├── launcher │ │ └── .keep │ │ ├── mlproject │ │ └── .keep │ │ └── spark_inference │ │ └── .keep └── mlflow-devEnv-custom-model │ ├── __init__.py │ ├── clean_venvs_mlflow_runs.sh │ ├── create_vEnv_mlproject_command.sh │ ├── create_vEnv_spark_inference.sh │ ├── data │ └── train.csv │ ├── doc │ └── .keep │ ├── launcher_mlproject_command.py │ ├── launcher_spark_inference.py │ ├── mlflow_runs │ └── .keep │ ├── mlproject │ ├── MLproject │ ├── __init__.py │ ├── conda.yaml │ └── train.py │ ├── spark_inference │ ├── conda.yaml │ └── model │ │ ├── .keep │ │ ├── MLmodel │ │ ├── conda.yaml │ │ └── python_model.pkl │ ├── utils │ └── install_conda.sh │ └── venvs │ ├── launcher │ └── .keep │ ├── mlproject │ └── .keep │ └── spark_inference │ └── .keep ├── rocket-extensions ├── Readme.md ├── old-extensions │ ├── input-lite-xd │ │ ├── pom.xml │ │ └── src │ │ │ └── main │ │ │ └── scala │ │ │ └── com │ │ │ └── stratio │ │ │ └── sparta │ │ │ ├── GeneratorXDLiteInputStepBatch.scala │ │ │ ├── GeneratorXDLiteInputStepStreaming.scala │ │ │ └── MetadataTestXDLiteInputStepBatch.scala │ ├── output-lite-xd │ │ ├── pom.xml │ │ └── src │ │ │ └── main │ │ │ └── scala │ │ │ └── com │ │ │ └── stratio │ │ │ └── sparta │ │ │ └── LoggerXDLiteOutputStep.scala │ 
├── transformation-lite-xd │ │ ├── pom.xml │ │ └── src │ │ │ ├── main │ │ │ └── scala │ │ │ │ └── com │ │ │ │ └── stratio │ │ │ │ └── sparta │ │ │ │ ├── properties │ │ │ │ └── ValidatePropertiesMap.scala │ │ │ │ └── transformations │ │ │ │ ├── repartition │ │ │ │ ├── RepartitionXDLiteTransformStepBatch.scala │ │ │ │ └── RepartitionXDLiteTransformStepStreaming.scala │ │ │ │ └── tokenizer │ │ │ │ ├── TokenizerTransformStepBatch.scala │ │ │ │ └── TokenizerTransformStepStreaming.scala │ │ │ └── test │ │ │ └── scala │ │ │ └── com │ │ │ └── stratio │ │ │ └── sparta │ │ │ └── transformations │ │ │ └── tokenizer │ │ │ └── TokenizerTransformStepBatchTest.scala │ └── udf │ │ ├── pom.xml │ │ └── src │ │ └── main │ │ └── scala │ │ ├── com │ │ └── stratio │ │ │ └── sparta │ │ │ └── SpartaExampleUDFs.scala │ │ └── org │ │ └── apache │ │ └── spark │ │ └── sql │ │ └── sparta │ │ └── udf │ │ └── GetDenseVectorUDF.scala ├── rocket-1.0.0-SDK │ ├── input-lite-xd │ │ ├── pom.xml │ │ └── src │ │ │ └── main │ │ │ └── scala │ │ │ └── com │ │ │ └── stratio │ │ │ └── sparta │ │ │ ├── GeneratorXDLiteInputStepBatch.scala │ │ │ └── GeneratorXDLiteInputStepStreaming.scala │ ├── output-lite-xd │ │ ├── pom.xml │ │ └── src │ │ │ └── main │ │ │ └── scala │ │ │ └── com │ │ │ └── stratio │ │ │ └── sparta │ │ │ └── LoggerXDLiteOutputStep.scala │ ├── packaged-jars │ │ ├── custom-lite-input-xd-1.0.0-SNAPSHOT.jar │ │ ├── custom-lite-output-xd-1.0.0-SNAPSHOT.jar │ │ └── custom-lite-transformation-xd-1.0.0-SNAPSHOT.jar │ └── transformation-lite-xd │ │ ├── pom.xml │ │ └── src │ │ ├── main │ │ └── scala │ │ │ └── com │ │ │ └── stratio │ │ │ └── sparta │ │ │ ├── properties │ │ │ └── ValidatePropertiesMap.scala │ │ │ └── transformations │ │ │ ├── repartition │ │ │ ├── RepartitionXDLiteTransformStepBatch.scala │ │ │ └── RepartitionXDLiteTransformStepStreaming.scala │ │ │ └── tokenizer │ │ │ ├── TokenizerTransformStepBatch.scala │ │ │ └── TokenizerTransformStepStreaming.scala │ │ └── test │ │ └── scala │ │ └── 
com │ │ └── stratio │ │ └── sparta │ │ └── transformations │ │ └── tokenizer │ │ └── TokenizerTransformStepBatchTest.scala ├── rocket-1.1.0-SDK │ └── input-lite-xd │ │ ├── pom.xml │ │ └── src │ │ └── main │ │ └── scala │ │ └── com │ │ └── stratio │ │ └── sparta │ │ ├── GeneratorXDLiteInputStepBatch.scala │ │ ├── GeneratorXDLiteInputStepStreaming.scala │ │ └── MetadataTestXDLiteInputStepBatch.scala ├── rocket-2.2.0-SDK │ ├── input-lite-xd │ │ ├── pom.xml │ │ └── src │ │ │ └── main │ │ │ └── scala │ │ │ └── com │ │ │ └── stratio │ │ │ └── sparta │ │ │ ├── GeneratorXDLiteInputStepBatch.scala │ │ │ ├── GeneratorXDLiteInputStepStreaming.scala │ │ │ ├── MetadataTestXDLiteInputStepBatch.scala │ │ │ ├── ReportLogTestXDLiteInputStepBatch.scala │ │ │ └── ReportLogTestXDLiteInputStepStreaming.scala │ ├── output-lite-xd │ │ ├── pom.xml │ │ └── src │ │ │ └── main │ │ │ └── scala │ │ │ └── com │ │ │ └── stratio │ │ │ └── sparta │ │ │ └── LoggerXDLiteOutputStep.scala │ ├── packaged-jars │ │ ├── custom-lite-input-xd-2.2.0-SNAPSHOT.jar │ │ ├── custom-lite-output-xd-2.2.0-SNAPSHOT.jar │ │ ├── custom-lite-transformation-xd-2.2.0-SNAPSHOT.jar │ │ └── custom-lite-udf-2.2.0-SNAPSHOT.jar │ ├── transformation-lite-xd │ │ ├── pom.xml │ │ └── src │ │ │ ├── main │ │ │ └── scala │ │ │ │ └── com │ │ │ │ └── stratio │ │ │ │ └── sparta │ │ │ │ ├── properties │ │ │ │ └── ValidatePropertiesMap.scala │ │ │ │ └── transformations │ │ │ │ ├── column │ │ │ │ └── AddColumnXDLiteTransformStepBatch.scala │ │ │ │ ├── repartition │ │ │ │ ├── RepartitionXDLiteTransformStepBatch.scala │ │ │ │ └── RepartitionXDLiteTransformStepStreaming.scala │ │ │ │ └── tokenizer │ │ │ │ ├── TokenizerTransformStepBatch.scala │ │ │ │ └── TokenizerTransformStepStreaming.scala │ │ │ └── test │ │ │ └── scala │ │ │ └── com │ │ │ └── stratio │ │ │ └── sparta │ │ │ └── transformations │ │ │ └── tokenizer │ │ │ └── TokenizerTransformStepBatchTest.scala │ └── udf │ │ ├── pom.xml │ │ └── src │ │ └── main │ │ └── scala │ │ ├── com │ │ 
└── stratio │ │ │ └── sparta │ │ │ └── SpartaExampleUDFs.scala │ │ └── org │ │ └── apache │ │ └── spark │ │ └── sql │ │ └── sparta │ │ └── udf │ │ └── GetDenseVectorUDF.scala ├── rocket-2.3.0-SDK │ ├── input-lite-xd │ │ ├── pom.xml │ │ └── src │ │ │ └── main │ │ │ └── scala │ │ │ └── com │ │ │ └── stratio │ │ │ └── sparta │ │ │ ├── GeneratorXDLiteInputStepBatch.scala │ │ │ ├── GeneratorXDLiteInputStepStreaming.scala │ │ │ ├── MetadataTestXDLiteInputStepBatch.scala │ │ │ ├── ReportLogTestXDLiteInputStepBatch.scala │ │ │ └── ReportLogTestXDLiteInputStepStreaming.scala │ ├── output-lite-xd │ │ ├── pom.xml │ │ └── src │ │ │ └── main │ │ │ └── scala │ │ │ └── com │ │ │ └── stratio │ │ │ └── sparta │ │ │ ├── JdbcWithLineageXDLiteOutputStep.scala │ │ │ └── LoggerXDLiteOutputStep.scala │ ├── transformation-lite-xd │ │ ├── pom.xml │ │ └── src │ │ │ ├── main │ │ │ └── scala │ │ │ │ └── com │ │ │ │ └── stratio │ │ │ │ └── sparta │ │ │ │ ├── properties │ │ │ │ └── ValidatePropertiesMap.scala │ │ │ │ └── transformations │ │ │ │ ├── column │ │ │ │ └── AddColumnXDLiteTransformStepBatch.scala │ │ │ │ ├── repartition │ │ │ │ ├── RepartitionXDLiteTransformStepBatch.scala │ │ │ │ └── RepartitionXDLiteTransformStepStreaming.scala │ │ │ │ └── tokenizer │ │ │ │ ├── TokenizerTransformStepBatch.scala │ │ │ │ └── TokenizerTransformStepStreaming.scala │ │ │ └── test │ │ │ └── scala │ │ │ └── com │ │ │ └── stratio │ │ │ └── sparta │ │ │ └── transformations │ │ │ └── tokenizer │ │ │ └── TokenizerTransformStepBatchTest.scala │ └── udf │ │ ├── pom.xml │ │ └── src │ │ └── main │ │ └── scala │ │ ├── com │ │ └── stratio │ │ │ └── sparta │ │ │ └── SpartaExampleUDFs.scala │ │ └── org │ │ └── apache │ │ └── spark │ │ └── sql │ │ └── sparta │ │ └── udf │ │ └── GetDenseVectorUDF.scala └── rocket-3.0.0-SDK │ ├── Readme.md │ ├── input-lite-xd │ ├── pom.xml │ └── src │ │ └── main │ │ └── scala │ │ └── com │ │ └── stratio │ │ └── sparta │ │ ├── GeneratorXDLiteInputStepBatch.scala │ │ ├── 
GeneratorXDLiteInputStepHybrid.scala │ │ ├── GeneratorXDLiteInputStepStreaming.scala │ │ ├── MetadataTestXDLiteInputStepBatch.scala │ │ ├── ReportLogTestXDLiteInputStepBatch.scala │ │ ├── ReportLogTestXDLiteInputStepStreaming.scala │ │ └── StreamGeneratorXDLiteInputStepHybrid.scala │ ├── lineage-qrs │ ├── pom.xml │ └── src │ │ └── main │ │ └── scala │ │ └── com │ │ └── stratio │ │ └── sparta │ │ └── CustomLineageQrs.scala │ ├── output-lite-xd │ ├── pom.xml │ └── src │ │ └── main │ │ └── scala │ │ └── com │ │ └── stratio │ │ └── sparta │ │ ├── JdbcWithLineageXDLiteOutputStep.scala │ │ └── LoggerXDLiteOutputStep.scala │ ├── packaged-jars │ ├── custom-lite-input-xd-3.0.0-SNAPSHOT.jar │ ├── custom-lite-output-xd-3.0.0-SNAPSHOT.jar │ └── custom-lite-transformation-xd-3.0.0-SNAPSHOT.jar │ ├── transformation-lite-xd │ ├── pom.xml │ └── src │ │ ├── main │ │ └── scala │ │ │ └── com │ │ │ └── stratio │ │ │ └── sparta │ │ │ ├── properties │ │ │ └── ValidatePropertiesMap.scala │ │ │ └── transformations │ │ │ ├── column │ │ │ ├── AddColumnXDLiteTransformStepBatch.scala │ │ │ └── AddColumnXDLiteTransformStepHybrid.scala │ │ │ ├── repartition │ │ │ ├── RepartitionXDLiteTransformStepBatch.scala │ │ │ ├── RepartitionXDLiteTransformStepHybrid.scala │ │ │ └── RepartitionXDLiteTransformStepStreaming.scala │ │ │ └── tokenizer │ │ │ ├── TokenizerTransformStepBatch.scala │ │ │ └── TokenizerTransformStepStreaming.scala │ │ └── test │ │ └── scala │ │ └── com │ │ └── stratio │ │ └── sparta │ │ └── transformations │ │ └── tokenizer │ │ └── TokenizerTransformStepBatchTest.scala │ └── udf │ ├── pom.xml │ └── src │ └── main │ └── scala │ ├── com │ └── stratio │ │ └── sparta │ │ └── SpartaExampleUDFs.scala │ └── org │ └── apache │ └── spark │ └── sql │ └── sparta │ └── udf │ └── GetDenseVectorUDF.scala ├── rocket-python-extensions ├── conda-pack-extensions │ ├── conda.yaml │ └── do_conda_pack.sh ├── private-pypi-repository │ ├── Readme.md │ ├── dist │ │ └── rocket_python_examples-0.1.0.tar.gz │ 
├── rocket_python_examples │ │ ├── __init__.py │ │ └── test.py │ └── setup.py └── pyspark-native-extensions │ └── qa-examples │ ├── Readme.md │ ├── same_module_test │ ├── make_packages.sh │ ├── module_build_1 │ │ ├── do_package.sh │ │ └── my_module │ │ │ ├── __init__.py │ │ │ └── user.py │ ├── module_build_2 │ │ ├── do_package.sh │ │ └── my_module │ │ │ ├── __init__.py │ │ │ └── user.py │ ├── user1_module.zip │ └── user2_module.zip │ └── version_test │ ├── v1 │ ├── make_packages.sh │ ├── test_pyfile_egg_from_hdfs │ │ ├── do_package.sh │ │ ├── setup.py │ │ └── test_pyfile_egg_pkg_from_hdfs │ │ │ ├── __init__.py │ │ │ └── test_pyfile_egg.py │ ├── test_pyfile_egg_from_http │ │ ├── do_package.sh │ │ ├── setup.py │ │ └── test_pyfile_egg_pkg_from_http │ │ │ ├── __init__.py │ │ │ └── test_pyfile_egg.py │ ├── test_pyfile_egg_pkg_from_hdfs-0.1.0-py3.7.egg │ ├── test_pyfile_egg_pkg_from_http-0.1.0-py3.7.egg │ ├── test_pyfile_from_hdfs.py │ ├── test_pyfile_from_http.py │ ├── test_pyfile_zip_from_hdfs │ │ ├── do_package.sh │ │ └── test_pyfile_zip_pkg_from_hdfs │ │ │ ├── __init__.py │ │ │ └── test_pyfile_zip.py │ ├── test_pyfile_zip_from_http │ │ ├── do_package.sh │ │ └── test_pyfile_zip_pkg_from_http │ │ │ ├── __init__.py │ │ │ └── test_pyfile_zip.py │ ├── test_pyfile_zip_pkg_from_hdfs.zip │ └── test_pyfile_zip_pkg_from_http.zip │ └── v2 │ ├── make_packages.sh │ ├── test_pyfile_egg_from_hdfs │ ├── do_package.sh │ ├── setup.py │ └── test_pyfile_egg_pkg_from_hdfs │ │ ├── __init__.py │ │ └── test_pyfile_egg.py │ ├── test_pyfile_egg_from_http │ ├── do_package.sh │ ├── setup.py │ └── test_pyfile_egg_pkg_from_http │ │ ├── __init__.py │ │ └── test_pyfile_egg.py │ ├── test_pyfile_egg_pkg_from_hdfs-0.1.0-py3.7.egg │ ├── test_pyfile_egg_pkg_from_http-0.1.0-py3.7.egg │ ├── test_pyfile_from_hdfs.py │ ├── test_pyfile_from_http.py │ ├── test_pyfile_zip_from_hdfs │ ├── do_package.sh │ └── test_pyfile_zip_pkg_from_hdfs │ │ ├── __init__.py │ │ └── test_pyfile_zip.py │ ├── 
test_pyfile_zip_from_http │ ├── do_package.sh │ └── test_pyfile_zip_pkg_from_http │ │ ├── __init__.py │ │ └── test_pyfile_zip.py │ ├── test_pyfile_zip_pkg_from_hdfs.zip │ └── test_pyfile_zip_pkg_from_http.zip ├── rocket-spark-ml-custom-stages ├── rocket-pyspark-ml │ ├── create_pipeline_estimator.py │ ├── create_pipeline_model.py │ ├── dist │ │ └── rocket_pyspark_ml-0.1.0.tar.gz │ ├── readme.md │ ├── rocket_pyspark_ml │ │ ├── __init__.py │ │ ├── simple_custom_estimator.py │ │ ├── simple_custom_transformer.py │ │ └── test.py │ └── setup.py └── rocket-spark-ml │ ├── dist │ └── rocketSparkMl-0.1.0-SNAPSHOT.jar │ ├── pom.xml │ └── src │ ├── main │ └── scala │ │ └── org │ │ └── apache │ │ └── spark │ │ └── ml │ │ └── rocket │ │ └── features │ │ ├── SimpleCustomEstimator.scala │ │ └── SimpleCustomTransformer.scala │ └── test │ └── scala │ └── org │ └── apache │ └── spark │ └── ml │ └── rocket │ ├── RocketSparkMlBeforeAndAfterAll.scala │ ├── RocketSparkMlFunSuite.scala │ └── features │ └── SimpleCustomEstimatorTest.scala ├── scripts ├── README.md ├── input.json ├── schema_convert.py └── sso.py └── sparta-plugins ├── input-lite-xd ├── pom.xml └── src │ └── main │ └── scala │ └── com │ └── stratio │ └── sparta │ ├── GeneratorXDLiteInputStepBatch.scala │ └── GeneratorXDLiteInputStepStreaming.scala ├── output-lite-xd ├── pom.xml └── src │ └── main │ └── scala │ └── com │ └── stratio │ └── sparta │ └── LoggerXDLiteOutputStep.scala ├── transformation-lite-xd ├── pom.xml └── src │ ├── main │ └── scala │ │ └── com │ │ └── stratio │ │ └── sparta │ │ ├── properties │ │ └── ValidatePropertiesMap.scala │ │ └── transformations │ │ ├── repartition │ │ ├── RepartitionXDLiteTransformStepBatch.scala │ │ └── RepartitionXDLiteTransformStepStreaming.scala │ │ └── tokenizer │ │ ├── TokenizerTransformStepBatch.scala │ │ └── TokenizerTransformStepStreaming.scala │ └── test │ └── scala │ └── com │ └── stratio │ └── sparta │ └── transformations │ └── tokenizer │ └── 
TokenizerTransformStepBatchTest.scala └── udf ├── pom.xml └── src └── main └── scala ├── com └── stratio │ └── sparta │ └── SpartaExampleUDFs.scala └── org └── apache └── spark └── sql └── sparta └── udf └── GetDenseVectorUDF.scala /.gitignore: -------------------------------------------------------------------------------- 1 | # use glob syntax. 2 | syntax: glob 3 | *.ser 4 | *.class 5 | *~ 6 | *.bak 7 | #*.off 8 | *.old 9 | 10 | # eclipse conf file 11 | .settings 12 | .classpath 13 | .project 14 | .manager 15 | .scala_dependencies 16 | 17 | # idea 18 | .idea/ 19 | *.iml 20 | 21 | # building 22 | target 23 | build 24 | null 25 | tmp 26 | temp 27 | test-output 28 | build.log 29 | 30 | # other scm 31 | .svn 32 | .CVS 33 | .hg* 34 | 35 | # switch to regexp syntax. 36 | # syntax: regexp 37 | # ^\.pc/ 38 | 39 | #SHITTY output not in target directory 40 | /dependency-reduced-pom.xml 41 | examples/scripts/temp.json 42 | 43 | # Sparkta specifics 44 | logs 45 | /default 46 | checkpoint 47 | node 48 | node_modules 49 | bower_components 50 | 51 | # Documentation autogenerated 52 | javadoc 53 | apidocs 54 | -------------------------------------------------------------------------------- /horovod-images/README.md: -------------------------------------------------------------------------------- 1 | # rocket-distributed-deep-learning 2 | 3 | This directory contains Dockerfiles that generate sample docker images extending Rocket Driver, Rocket Executor and Analytic Intelligence images, by adding an environment with the [Horovod](https://github.com/horovod/horovod/tree/v0.28.1) library compiled with support for Apache Spark, Tensorflow, Keras and PyTorch. 4 | 5 | --- 6 | >**IMPORTANT NOTICE:** 7 | > 8 | >These images **are not intended** to be used in **production environments**. They are intended to be used as a reference for building your own images with the desired libraries and dependencies. 
9 | > 10 | >The provided images have been successfully tested in an environment with nodes providing: 11 | >* Nvidia Tesla T4 GPUs 12 | >* Cuda 12.0 13 | >* cuDNN 8.9.0 14 | >* Nvidia Driver 520.61.05 15 | > 16 | >We do not guarantee that these images will work in environments with a different GPU vendor or different driver versions. 17 | --- 18 | 19 | ## Kyverno policies 20 | In order to deploy Pods with these images, it is necessary to update the Kyverno policies of your namespaces (Rocket and Intelligence) by adding the new image names: 21 | 22 | ### Rocket 23 | _restrict-rocket-images_: 24 | 25 | containers: 26 | - image: >- 27 | */rocket-api:* | */rocket-driver* | */rocket-executor* | 28 | */rocket-ml-prediction-server:* | 29 | */rocket-mleap-microservice:* | 30 | */rocket-mlflow-microservice:* | 31 | */rocket-r-mlflow-microservice:* | */stratio-spark:* 32 | 33 | ### Intelligence 34 | _restrict-intelligence-images_: 35 | 36 | containers: 37 | - image: >- 38 | */intelligence-environment:* | */analytic-environment:* 39 | | */analytic-environment-light:* | 40 | */analytic-environment-horovod-gpu:* | 41 | */intelligence-backup-restore:* | */stratio-spark:* 42 | 43 | Add the pattern matching your image names. 
-------------------------------------------------------------------------------- /horovod-images/mlflow_venv/conda.yaml: -------------------------------------------------------------------------------- 1 | name: rocket-tensorflow-hdfs 2 | 3 | channels: 4 | - conda-forge 5 | - nodefaults 6 | 7 | dependencies: 8 | - python=3.9.7 9 | - pip=21.2.4 10 | - pip: 11 | - mlflow==2.6.0 12 | - tensorflow==2.11.0 13 | - tensorflow-io==0.25.0 -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/mlflow_mlprojects/mlflow-devEnv-archetype/__init__.py -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/clean_venvs_mlflow_runs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | WD="$(dirname "$(readlink -f "$0")")" 4 | VENVS_PATH=$WD/venvs 5 | MLFLOW_RUNS_PATH=$WD/mlflow_runs 6 | 7 | read -p "Executing rm -rf ${MLFLOW_RUNS_PATH}/* - (y/n)?" choice 8 | case "$choice" in 9 | y | Y) 10 | rm -rf "${MLFLOW_RUNS_PATH:?}"/* 11 | rm -rf "${MLFLOW_RUNS_PATH:?}"/.trash 12 | ;; 13 | *) 14 | echo "Skipping" 15 | ;; 16 | esac 17 | 18 | read -p "Executing rm -rf ${VENVS_PATH}/{launcher & mlproject & spark_inference}/* - (y/n)?" 
choice 19 | case "$choice" in 20 | y | Y) 21 | rm -rf "${VENVS_PATH:?}"/launcher/* 22 | rm -rf "${VENVS_PATH:?}"/mlproject/* 23 | rm -rf "${VENVS_PATH:?}"/spark_inference/* 24 | ;; 25 | *) 26 | echo "Skipping" 27 | ;; 28 | esac 29 | 30 | -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/create_vEnv_mlproject_command.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | WD="$(dirname "$(readlink -f "$0")")" 4 | CONDA_YAML_PATH=$WD/mlproject/conda.yaml 5 | VENV_PATH=$WD/venvs/mlproject 6 | 7 | echo "Creating conda environment defined in ${CONDA_YAML_PATH} at ${VENV_PATH}" 8 | 9 | conda env create -f "$CONDA_YAML_PATH" --prefix "$VENV_PATH" 10 | 11 | -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/create_vEnv_spark_inference.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | WD="$(dirname "$(readlink -f "$0")")" 4 | CONDA_YAML_PATH=$WD/spark_inference/conda.yaml 5 | VENV_PATH=$WD/venvs/spark_inference 6 | 7 | echo "Creating conda environment defined in ${CONDA_YAML_PATH} at ${VENV_PATH}" 8 | 9 | conda env create -f "$CONDA_YAML_PATH" --prefix "$VENV_PATH" 10 | 11 | -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/data/train.csv: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/doc/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/mlflow_mlprojects/mlflow-devEnv-archetype/doc/.keep 
-------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/launcher_mlproject_command.py: -------------------------------------------------------------------------------- 1 | import os 2 | import mlflow 3 | from mlflow.tracking import MlflowClient 4 | 5 | # TODO - Importing python script ~ MLproject file: command: "python train.py --train-data..." 6 | from mlproject import train 7 | 8 | # -------------------------------------------------------------------------------- 9 | # Note: This python script must be launched using the virtual environment 10 | # defined in MLproject through conda.yaml file: 11 | # 12 | # Note: This v.env has been pre-created at venvs/mlproject 13 | # -------------------------------------------------------------------------------- 14 | 15 | # Current directory 16 | wd = os.path.abspath(os.path.dirname(__file__)) 17 | 18 | # Reading data ~ Rocket setup 19 | input_csv = os.path.join(wd, 'data', 'train.csv') 20 | 21 | # Creating experiment ~ Mlflow launcher 22 | mlflow.set_tracking_uri("file://{}".format(os.path.join(wd, 'mlflow_runs/'))) 23 | client = MlflowClient( 24 | tracking_uri="file://{}".format(os.path.join(wd, 'mlflow_runs')) 25 | ) 26 | experiment_id = "local" 27 | if not client.get_experiment_by_name(experiment_name): 28 | client.create_experiment(experiment_name) 29 | 30 | # Executing command defined in MLproject file ~ Mlflow launcher 31 | train.main( 32 | [ 33 | "--training-data={}".format(input_csv), 34 | # TODO - all command line arguments that accept python script used in MLproject entrypoint/command 35 | ... 
36 | ] 37 | ) 38 | -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/launcher_spark_inference.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | # => Working directory 5 | wd = os.path.abspath(os.path.dirname(__file__)) 6 | 7 | # => Setting spark environment ~ Rocket integration 8 | if not os.getenv('SPARK_HOME'): 9 | os.environ['SPARK_HOME'] = "XXXXXXXXXXX" 10 | spark_home = os.environ.get('SPARK_HOME', None) 11 | # Add pyspark and py4j to path. 12 | sys.path.insert(0, spark_home + "/python") 13 | sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.9-src.zip')) 14 | 15 | from pyspark.sql import SparkSession 16 | import mlflow 17 | 18 | # => Creating a pyspark session ~ Rocket integration 19 | spark = SparkSession.builder.master("local[*]")\ 20 | .appName("Debugging Spark-Mlflow integration") \ 21 | .config("spark.sql.execution.arrow.pyspark.enabled", "true") \ 22 | .getOrCreate() 23 | 24 | # => Reading data ~ Rocket integration 25 | df = spark.read.csv( 26 | path=os.path.join(wd, 'data', 'train.csv'), 27 | header=True, 28 | inferSchema=True 29 | ) 30 | 31 | # => Mlflow logged model path ~ Rocket integration 32 | modelDirPath = os.path.join(wd, 'spark_inference', 'model') 33 | # · Loading model 34 | loaded_model = mlflow.pyfunc.load_model(modelDirPath) 35 | 36 | 37 | # => Constructing UDF ~ Rocket integration 38 | # · We need input features, output column name and output column type 39 | features = None 40 | output_spark_schema = None 41 | 42 | # Try to use model signature to infer this parameters 43 | if loaded_model.metadata.signature: 44 | # Input features 45 | input_signature = loaded_model.metadata.signature.inputs 46 | features = [s.name for s in input_signature.inputs] 47 | # Output column name & type 48 | output_signature = loaded_model.metadata.signature.outputs 49 | output_spark_schema = 
output_signature.as_spark_schema() 50 | 51 | 52 | # · Input features 53 | if not features: 54 | features = ["XXXXXXXXXX"] # Must be defined manually if your logged model do not incorporate signature 55 | print("Input features for UDF: {}".format(features)) 56 | 57 | # · Output column name & type 58 | if not output_spark_schema: 59 | predictionColumnName = "XXXXXX" # Must be defined manually if your logged model do not incorporate signature 60 | predictionColumnType = "XXXXXX" # Must be defined manually if your logged model do not incorporate signature 61 | else: 62 | print("Spark schema: {}".format(output_spark_schema)) 63 | predictionColumnName = output_spark_schema[0].name 64 | predictionColumnType = output_spark_schema[0].dataType 65 | 66 | print("Prediction column name for UDF: {}".format(predictionColumnName)) 67 | print("Prediction column type for UDF: {}".format(predictionColumnType)) 68 | 69 | prediction_udf = mlflow.pyfunc.spark_udf(spark, modelDirPath, result_type=predictionColumnType) 70 | 71 | # => Making predictions ~ Rocket integration 72 | predictionDf = df.withColumn(predictionColumnName, prediction_udf(*features)) 73 | predictionDf.show() 74 | 75 | -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/mlflow_runs/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/mlflow_mlprojects/mlflow-devEnv-archetype/mlflow_runs/.keep -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/mlproject/MLproject: -------------------------------------------------------------------------------- 1 | name: XXXXXXXX 2 | 3 | conda_env: conda.yaml 4 | 5 | entry_points: 6 | main: 7 | parameters: 8 | training_data: string 9 | 10 | command: python train.py --training-data={training_data} 
-------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/mlproject/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/mlflow_mlprojects/mlflow-devEnv-archetype/mlproject/__init__.py -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/mlproject/conda.yaml: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------- 2 | # Definition file for a python Conda managed virtual environment 3 | # ----------------------------------------------------------------- 4 | 5 | # V.env name 6 | name: XXXXXXXXXX 7 | 8 | # Conda channels (repositories) used to retrieve python packages 9 | channels: 10 | - defaults 11 | 12 | # Dependencies 13 | dependencies: 14 | # From conda repositories (channels) 15 | - python=3.7.6 16 | - pip=20.2.2 # <-- pip: python package manager 17 | - pip: 18 | # Python packages managed with Pip instead of Conda 19 | - mlflow==1.15.0 -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/mlproject/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | import pandas as pd 4 | 5 | 6 | def parse_args(argv): 7 | """ 8 | Parses python command line input arguments (defined in MLproject file at command section) 9 | """ 10 | parser = argparse.ArgumentParser(description='Mlflow example') 11 | parser.add_argument('--training-data', type=str, help='training data set in csv') 12 | return parser.parse_args(argv) 13 | 14 | 15 | def main(argv): 16 | """ Data """ 17 | args = parse_args(argv) # mandatory 18 | pd_data = pd.read_csv(args.training_data) # mandatory 19 | 
20 | """ Tracking """ 21 | with mlflow.start_run() as run: 22 | print('MFlown run {}'.format(run.info)) 23 | 24 | 25 | if __name__ == '__main__': 26 | main(sys.argv[1:]) 27 | -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/spark_inference/conda.yaml: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------------------------------------------- 2 | # Python V. env to be used in pySpark in order to make predictions with a python_flavour MlFlow model 3 | # 4 | # Example: 5 | # name: mlflow-env 6 | # channels: 7 | # - defaults 8 | # - conda-forge 9 | # dependencies: 10 | # - python=3.7.6 11 | # - pip=20.2.2 12 | # - pip: 13 | # - mlflow==1.15.0 14 | # - scikit-learn==0.22.1 15 | # - cloudpickle==2.0.0 16 | # - pyarrow==5.0.0 17 | # 18 | # Note: 19 | # · Pre-requisite: binary distribution of Spark (SPARK_HOME env. 
var pointing to spark directory) 20 | # · pyspark and py4j dependencies are provided at runtime (getting them from Spark binary distribution) 21 | # · pyarrow is necessary in order to use Pandas UDF in Spark --> Mlflow do not include in it MLModel conda.yaml 22 | # ----------------------------------------------------------------------------------------------------------------- 23 | -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/spark_inference/model/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/mlflow_mlprojects/mlflow-devEnv-archetype/spark_inference/model/.keep -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/utils/install_conda.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # · Install conda 4.8.3 4 | CONDA_DIR=/opt/conda 5 | 6 | curl -LO http://repo.continuum.io/miniconda/Miniconda3-py37_4.8.3-Linux-x86_64.sh 7 | mkdir -p $CONDA_DIR 8 | bash Miniconda3-py37_4.8.3-Linux-x86_64.sh -f -b -p $CONDA_DIR 9 | rm Miniconda3-py37_4.8.3-Linux-x86_64.sh 10 | conda install --quiet --yes conda==4.8.3 11 | 12 | # · Configuring conda 13 | conda config --system --set auto_update_conda false 14 | -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/venvs/launcher/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/mlflow_mlprojects/mlflow-devEnv-archetype/venvs/launcher/.keep -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/venvs/mlproject/.keep: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/mlflow_mlprojects/mlflow-devEnv-archetype/venvs/mlproject/.keep -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/venvs/spark_inference/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/mlflow_mlprojects/mlflow-devEnv-archetype/venvs/spark_inference/.keep -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/mlflow_mlprojects/mlflow-devEnv-custom-model/__init__.py -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/clean_venvs_mlflow_runs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | WD="$(dirname "$(readlink -f "$0")")" 4 | VENVS_PATH=$WD/venvs 5 | MLFLOW_RUNS_PATH=$WD/mlflow_runs 6 | 7 | read -p "Executing rm -rf ${MLFLOW_RUNS_PATH}/* - (y/n)?" choice 8 | case "$choice" in 9 | y | Y) 10 | rm -rf "${MLFLOW_RUNS_PATH:?}"/* 11 | rm -rf "${MLFLOW_RUNS_PATH:?}"/.trash 12 | ;; 13 | *) 14 | echo "Skipping" 15 | ;; 16 | esac 17 | 18 | read -p "Executing rm -rf ${VENVS_PATH}/{launcher & mlproject & spark_inference}/* - (y/n)?" 
choice 19 | case "$choice" in 20 | y | Y) 21 | rm -rf "${VENVS_PATH:?}"/launcher/* 22 | rm -rf "${VENVS_PATH:?}"/mlproject/* 23 | rm -rf "${VENVS_PATH:?}"/spark_inference/* 24 | ;; 25 | *) 26 | echo "Skipping" 27 | ;; 28 | esac 29 | 30 | -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/create_vEnv_mlproject_command.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | WD="$(dirname "$(readlink -f "$0")")" 4 | CONDA_YAML_PATH=$WD/mlproject/conda.yaml 5 | VENV_PATH=$WD/venvs/mlproject 6 | 7 | echo "Creating conda environment defined in ${CONDA_YAML_PATH} at ${VENV_PATH}" 8 | 9 | conda env create -f "$CONDA_YAML_PATH" --prefix "$VENV_PATH" 10 | 11 | -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/create_vEnv_spark_inference.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | WD="$(dirname "$(readlink -f "$0")")" 4 | CONDA_YAML_PATH=$WD/spark_inference/conda.yaml 5 | VENV_PATH=$WD/venvs/spark_inference 6 | 7 | echo "Creating conda environment defined in ${CONDA_YAML_PATH} at ${VENV_PATH}" 8 | 9 | conda env create -f "$CONDA_YAML_PATH" --prefix "$VENV_PATH" 10 | 11 | -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/data/train.csv: -------------------------------------------------------------------------------- 1 | class 2 | a 3 | b 4 | d 5 | d -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/doc/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/mlflow_mlprojects/mlflow-devEnv-custom-model/doc/.keep 
-------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/launcher_mlproject_command.py: -------------------------------------------------------------------------------- 1 | import os 2 | import mlflow 3 | from mlflow.tracking import MlflowClient 4 | from mlproject import train 5 | 6 | # -------------------------------------------------------------------------------- 7 | # Note: This python script must be launched using the virtual environment 8 | # defined in MLproject through conda.yaml file: 9 | # 10 | # Note: This v.env has been pre-created at venvs/mlproject 11 | # -------------------------------------------------------------------------------- 12 | 13 | # Current directory 14 | wd = os.path.abspath(os.path.dirname(__file__)) 15 | 16 | # Reading data ~ Rocket setup 17 | input_csv = os.path.join(wd, 'data', 'train.csv') 18 | 19 | # Creating experiment ~ Mlflow launcher 20 | mlflow.set_tracking_uri("file://{}".format(os.path.join(wd, 'mlflow_runs/'))) 21 | client = MlflowClient( 22 | tracking_uri="file://{}".format(os.path.join(wd, 'mlflow_runs')) 23 | ) 24 | experiment_name = "local" 25 | if not client.get_experiment_by_name(experiment_name): 26 | client.create_experiment(experiment_name) 27 | 28 | 29 | # Executing command defined in MLproject file ~ Mlflow launcher 30 | train.main( 31 | [ 32 | "--training_data={}".format(input_csv), 33 | "--feature_column_name={}".format("class"), 34 | "--prediction_column_name={}".format("prediction") 35 | ] 36 | ) 37 | -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/launcher_spark_inference.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | # => Working directory 5 | wd = os.path.abspath(os.path.dirname(__file__)) 6 | 7 | # => Setting spark environment ~ Rocket integration 8 | if not os.getenv('SPARK_HOME'): 9 
| os.environ['SPARK_HOME'] = "/home/asoriano/workspace/software/stratio-spark-distribution-3.1.1-1.2.0-766b881-bin" 10 | spark_home = os.environ.get('SPARK_HOME', None) 11 | # Add pyspark and py4j to path. 12 | sys.path.insert(0, spark_home + "/python") 13 | sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.9-src.zip')) 14 | 15 | from pyspark.sql import SparkSession 16 | import mlflow 17 | 18 | # => Creating a pyspark session ~ Rocket integration 19 | spark = SparkSession.builder.master("local[*]")\ 20 | .appName("Debugging Spark-Mlflow integration") \ 21 | .config("spark.sql.execution.arrow.pyspark.enabled", "true") \ 22 | .getOrCreate() 23 | 24 | # => Reading data ~ Rocket integration 25 | df = spark.read.csv( 26 | path=os.path.join(wd, 'data', 'train.csv'), 27 | header=True, 28 | inferSchema=True 29 | ) 30 | 31 | # => Mlflow logged model path ~ Rocket integration 32 | modelDirPath = os.path.join(wd, 'spark_inference', 'model') 33 | # · Loading model 34 | loaded_model = mlflow.pyfunc.load_model(modelDirPath) 35 | 36 | 37 | # => Constructing UDF ~ Rocket integration 38 | # · We need input features, output column name and output column type 39 | features = None 40 | output_spark_schema = None 41 | 42 | # Try to use model signature to infer this parameters 43 | if loaded_model.metadata.signature: 44 | # Input features 45 | input_signature = loaded_model.metadata.signature.inputs 46 | features = [s.name for s in input_signature.inputs] 47 | # Output column name & type 48 | output_signature = loaded_model.metadata.signature.outputs 49 | output_spark_schema = output_signature.as_spark_schema() 50 | 51 | 52 | # · Input features 53 | if not features: 54 | features = [] 55 | print("Input features for UDF: {}".format(features)) 56 | 57 | # · Output column name & type 58 | if not output_spark_schema: 59 | predictionColumnName = "prediction" 60 | predictionColumnType = "string" 61 | else: 62 | print("Spark schema: {}".format(output_spark_schema)) 63 | 
predictionColumnName = output_spark_schema[0].name 64 | predictionColumnType = output_spark_schema[0].dataType 65 | 66 | print("Prediction column name for UDF: {}".format(predictionColumnName)) 67 | print("Prediction column type for UDF: {}".format(predictionColumnType)) 68 | 69 | prediction_udf = mlflow.pyfunc.spark_udf(spark, modelDirPath, result_type=predictionColumnType) 70 | 71 | # => Making predictions ~ Rocket integration 72 | predictionDf = df.withColumn(predictionColumnName, prediction_udf(*features)) 73 | predictionDf.show() 74 | 75 | -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/mlflow_runs/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/mlflow_mlprojects/mlflow-devEnv-custom-model/mlflow_runs/.keep -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/mlproject/MLproject: -------------------------------------------------------------------------------- 1 | name: test 2 | 3 | conda_env: conda.yaml 4 | 5 | entry_points: 6 | main: 7 | parameters: 8 | training_data: string 9 | feature_column_name: {type: string, default: class} 10 | prediction_column_name: {type: string, default: prediction} 11 | 12 | output_column_name: {type: string, default: prediction} 13 | output_column_type: {type: string, default: string } 14 | 15 | command: python train.py --training_data={training_data} --feature_column_name={feature_column_name} --prediction_column_name={prediction_column_name} -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/mlproject/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/mlflow_mlprojects/mlflow-devEnv-custom-model/mlproject/__init__.py -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/mlproject/conda.yaml: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------- 2 | # Definition file for a python Conda managed virtual environment 3 | # ----------------------------------------------------------------- 4 | 5 | # V.env name 6 | name: test 7 | 8 | # Conda channels (repositories) used to retrieve python packages 9 | channels: 10 | - defaults 11 | 12 | # Dependencies 13 | dependencies: 14 | # From conda repositories (channels) 15 | - python=3.7.6 16 | - pip=20.2.2 # <-- pip: python package manager 17 | - pip: 18 | # Python packages managed with Pip instead of Conda 19 | - mlflow==1.15.0 -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/mlproject/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | import pandas as pd 4 | import numpy as np 5 | import mlflow.pyfunc 6 | from mlflow.models.signature import infer_signature 7 | from mlflow.pyfunc import PythonModel 8 | 9 | 10 | def parse_args(argv): 11 | """ 12 | Parses python command line input arguments (defined in MLproject file at command section) 13 | """ 14 | parser = argparse.ArgumentParser(description='Mlflow example') 15 | parser.add_argument('--training_data', type=str, help='training data set in csv') 16 | parser.add_argument('--feature_column_name', type=str, help='') 17 | parser.add_argument('--prediction_column_name', type=str, help='') 18 | return parser.parse_args(argv) 19 | 20 | 21 | class CustomModel(PythonModel): 22 | 23 | def __init__(self, 
feature_col_name, prediction_col_name): 24 | self.feature_col_name = feature_col_name 25 | self.prediction_col_name = prediction_col_name 26 | 27 | def dummy_func(self, x): 28 | return "Dummy code - {}".format(str(x)) 29 | 30 | def predict(self, context, model_input): 31 | if isinstance(model_input, pd.DataFrame): 32 | return pd.DataFrame( 33 | np.vectorize(self.dummy_func)(model_input[self.feature_col_name]), columns=[self.prediction_col_name] 34 | ) 35 | else: 36 | raise TypeError("Only DataFrame input types are supported") 37 | 38 | 39 | def main(argv): 40 | """ Data """ 41 | args = parse_args(argv) 42 | pd_data = pd.read_csv(args.training_data) 43 | 44 | # Features 45 | X_train = pd_data[[args.feature_column_name]] 46 | 47 | """ Model """ 48 | model = CustomModel(args.feature_column_name, args.prediction_column_name) 49 | 50 | # Predictions 51 | y_pred = model.predict({}, X_train) 52 | 53 | signature = infer_signature(X_train, y_pred) 54 | print("Signature: {}".format(signature)) 55 | # print("Input as spark schema: {}".format(signature.inputs.as_spark_schema())) 56 | # print("Output as spark schema: {}".format(signature.outputs.as_spark_schema())) 57 | 58 | """ Tracking """ 59 | with mlflow.start_run() as run: 60 | print('MFlown run {}'.format(run.info)) 61 | mlflow.pyfunc.log_model("model", python_model=model, signature=signature) 62 | 63 | 64 | if __name__ == '__main__': 65 | main(sys.argv[1:]) 66 | -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/spark_inference/conda.yaml: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------------------------------------------- 2 | # Python V. 
env to be used in pySpark in order to make predictions with a python_flavour MlFlow model 3 | # 4 | # Example: 5 | # name: mlflow-env 6 | # channels: 7 | # - defaults 8 | # - conda-forge 9 | # dependencies: 10 | # - python=3.7.6 11 | # - pip=20.2.2 12 | # - pip: 13 | # - mlflow==1.15.0 14 | # - scikit-learn==0.22.1 15 | # - cloudpickle==2.0.0 16 | # - pyarrow==5.0.0 17 | # 18 | # Note: 19 | # · Pre-requisite: binary distribution of Spark (SPARK_HOME env. var pointing to spark directory) 20 | # · pyspark and py4j dependencies are provided at runtime (getting them from Spark binary distribution) 21 | # · pyarrow is necessary in order to use Pandas UDF in Spark --> Mlflow do not include in it MLModel conda.yaml 22 | # ----------------------------------------------------------------------------------------------------------------- 23 | 24 | name: mlflow-env 25 | 26 | channels: 27 | - defaults 28 | - conda-forge 29 | 30 | dependencies: 31 | - python=3.7.6 32 | - pip=20.2.2 33 | - pip: 34 | - mlflow==1.15.0 35 | - cloudpickle==2.0.0 36 | - pyarrow==5.0.0 -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/spark_inference/model/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/mlflow_mlprojects/mlflow-devEnv-custom-model/spark_inference/model/.keep -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/spark_inference/model/MLmodel: -------------------------------------------------------------------------------- 1 | artifact_path: model 2 | flavors: 3 | python_function: 4 | cloudpickle_version: 2.0.0 5 | env: conda.yaml 6 | loader_module: mlflow.pyfunc.model 7 | python_model: python_model.pkl 8 | python_version: 3.7.6 9 | run_id: c874e2ead8864acaa377d6252e03277b 10 | signature: 11 | inputs: 
'[{"name": "class", "type": "string"}]' 12 | outputs: '[{"name": "prediction", "type": "string"}]' 13 | utc_time_created: '2021-10-27 08:13:47.929231' 14 | -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/spark_inference/model/conda.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - defaults 3 | - conda-forge 4 | dependencies: 5 | - python=3.7.6 6 | - pip 7 | - pip: 8 | - mlflow 9 | - cloudpickle==2.0.0 10 | name: mlflow-env 11 | -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/spark_inference/model/python_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/mlflow_mlprojects/mlflow-devEnv-custom-model/spark_inference/model/python_model.pkl -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/utils/install_conda.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # · Install conda 4.8.3 4 | CONDA_DIR=/opt/conda 5 | 6 | curl -LO http://repo.continuum.io/miniconda/Miniconda3-py37_4.8.3-Linux-x86_64.sh 7 | mkdir -p $CONDA_DIR 8 | bash Miniconda3-py37_4.8.3-Linux-x86_64.sh -f -b -p $CONDA_DIR 9 | rm Miniconda3-py37_4.8.3-Linux-x86_64.sh 10 | conda install --quiet --yes conda==4.8.3 11 | 12 | # · Configuring conda 13 | conda config --system --set auto_update_conda false 14 | -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/venvs/launcher/.keep: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/mlflow_mlprojects/mlflow-devEnv-custom-model/venvs/launcher/.keep -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/venvs/mlproject/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/mlflow_mlprojects/mlflow-devEnv-custom-model/venvs/mlproject/.keep -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/venvs/spark_inference/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/mlflow_mlprojects/mlflow-devEnv-custom-model/venvs/spark_inference/.keep -------------------------------------------------------------------------------- /rocket-extensions/Readme.md: -------------------------------------------------------------------------------- 1 | # Rocket extensions 2 | 3 | ## UDF 4 | 5 | // TODO 6 | 7 | ## Plugins 8 | 9 | - Legacy versions (Documentation in branches older than 9.5 links it) 10 | - /old-extensions/input-lite-xd 11 | - /old-extensions/output-lite-xd 12 | - /old-extensions/transformation-lite-xd 13 | 14 | **Note: plugins generated with new SDK versions must be included in new folders:** 15 | 16 | - rocket-1.0.0-SDK 17 | - Custom Input: GeneratorXDLiteInputStepBatch & GeneratorXDLiteInputStepStreaming 18 | - Custom Transform: RepartitionXDLiteTransformStepBatch & RepartitionXDLiteTransformStepStreaming 19 | - Custom Transform: TokenizerTransformStepBatch & TokenizerTransformStepStreaming 20 | - Custom Output: LoggerXDLiteOutputStep 21 | 22 | - rocket-1.1.0-SDK 23 | - New functionality: metadata management 24 | - New step: 
MetadataTestXDLiteInputStepBatch 25 | 26 | - rocket-2.2.0-SDK 27 | - New functionality: Execution report logs in custom steps 28 | - New steps: ReportLogTestXDLiteInputStepBatch & ReportLogTestXDLiteInputStepStreaming 29 | - Added reporting in: 30 | - GeneratorXDLiteInputStepBatch & GeneratorXDLiteInputStepStreaming 31 | - TokenizerTransformStepBatch & TokenizerTransformStepStreaming 32 | - LoggerXDLiteOutputStep 33 | 34 | - rocket-2.3.0-SDK 35 | - New functionality: Lineage and QRs definition in custom steps 36 | - New steps: JdbcWithLineageXDLiteOutputStep 37 | 38 | - rocket-3.0.0-SDK 39 | - New functionality: Hybrid custom steps 40 | -------------------------------------------------------------------------------- /rocket-extensions/old-extensions/input-lite-xd/src/main/scala/com/stratio/sparta/GeneratorXDLiteInputStepBatch.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | package com.stratio.sparta 7 | 8 | import com.stratio.sparta.sdk.lite.batch.models._ 9 | import com.stratio.sparta.sdk.lite.validation.ValidationResult 10 | import com.stratio.sparta.sdk.lite.xd.batch._ 11 | import org.apache.spark.sql._ 12 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 13 | import org.apache.spark.sql.crossdata.XDSession 14 | import org.apache.spark.sql.types._ 15 | 16 | class GeneratorXDLiteInputStepBatch( 17 | xdSession: XDSession, 18 | properties: Map[String, String] 19 | ) 20 | extends LiteCustomXDBatchInput(xdSession, properties) { 21 | 22 | lazy val stringSchema = StructType(Seq(StructField("raw", StringType))) 23 | lazy val rawData: Option[String] = properties.get("raw").map(_.toString) 24 | 25 | 26 | override def validate(): ValidationResult = { 27 | var validation = ValidationResult(valid = true, messages = Seq.empty) 28 | 29 | if (rawData.isEmpty) { 30 | validation = ValidationResult( 31 | valid = false, 32 | messages = validation.messages :+ "Test data must be set inside the Option properties with an option key named 'raw'") 33 | } 34 | 35 | if (rawData.map(_.trim).forall(_.isEmpty)) { 36 | validation = ValidationResult( 37 | valid = false, 38 | messages = validation.messages :+ "Generated data cannot be an empty string") 39 | } 40 | validation 41 | } 42 | 43 | override def init(): ResultBatchData = { 44 | val register = Seq(new GenericRowWithSchema(Array("test-data"), stringSchema).asInstanceOf[Row]) 45 | val defaultRDD = xdSession.sparkContext.parallelize(register) 46 | 47 | ResultBatchData(defaultRDD, Option(stringSchema)) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /rocket-extensions/old-extensions/input-lite-xd/src/main/scala/com/stratio/sparta/GeneratorXDLiteInputStepStreaming.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. 
All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta 7 | 8 | import org.apache.spark.sql._ 9 | import org.apache.spark.streaming.dstream.DStream 10 | import org.apache.spark.rdd.RDD 11 | import org.apache.spark.sql.functions._ 12 | import com.stratio.sparta.sdk.lite.xd.streaming._ 13 | import com.stratio.sparta.sdk.lite.streaming.models._ 14 | import org.apache.spark.sql.crossdata.XDSession 15 | 16 | import scala.util.{Failure, Success, Try} 17 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 18 | import org.apache.spark.sql.types._ 19 | import org.apache.spark.streaming.StreamingContext 20 | 21 | import scala.collection.mutable 22 | 23 | class GeneratorXDLiteInputStepStreaming( 24 | xdSession: XDSession, 25 | streamingContext: StreamingContext, 26 | properties: Map[String, String] 27 | ) 28 | extends LiteCustomXDStreamingInput(xdSession, streamingContext, properties) { 29 | 30 | lazy val stringSchema = StructType(Seq(StructField("raw", StringType))) 31 | 32 | override def init(): ResultStreamingData = { 33 | val dataQueue = new mutable.Queue[RDD[Row]]() 34 | val register = Seq(new GenericRowWithSchema(Array("test-data"), stringSchema).asInstanceOf[Row]) 35 | dataQueue += xdSession.sparkContext.parallelize(register) 36 | val stream = streamingContext.queueStream(dataQueue) 37 | 38 | ResultStreamingData(stream, Option(stringSchema)) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- 
/rocket-extensions/old-extensions/output-lite-xd/src/main/scala/com/stratio/sparta/LoggerXDLiteOutputStep.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta 7 | 8 | import com.stratio.sparta.sdk.lite.common.models.OutputOptions 9 | import com.stratio.sparta.sdk.lite.xd.common._ 10 | import org.apache.spark.sql._ 11 | import org.apache.spark.sql.crossdata.XDSession 12 | 13 | import scala.util.{Failure, Success, Try} 14 | 15 | class LoggerXDLiteOutputStep( 16 | xdSession: XDSession, 17 | properties: Map[String, String] 18 | ) 19 | extends LiteCustomXDOutput(xdSession, properties) { 20 | 21 | lazy val metadataEnabled = properties.get("metadataEnabled") match { 22 | case Some(value: String) => Try(value.toBoolean) match { 23 | case Success(v) => v 24 | case Failure(ex) => throw new IllegalStateException(s"$value not parseable to boolean", ex) 25 | } 26 | case None => throw new IllegalStateException("'metadataEnabled' key must be defined in the Option properties") 27 | } 28 | 29 | override def save(data: DataFrame, outputOptions: OutputOptions): Unit = { 30 | val tableName = outputOptions.tableName.getOrElse{ 31 | logger.error("Table name not defined") 32 | throw new NoSuchElementException("tableName not found in options")} 33 | 34 | if (metadataEnabled){ 35 | logger.info(s"Table name: $tableName") 36 | logger.info(s"Save mode is set to ${outputOptions.saveMode}") 37 | } 38 | data.foreach{ row => 39 | 
println(row.mkString(",")) 40 | } 41 | } 42 | 43 | override def save(data: DataFrame, saveMode: String, saveOptions: Map[String, String]): Unit = () 44 | } 45 | -------------------------------------------------------------------------------- /rocket-extensions/old-extensions/transformation-lite-xd/src/main/scala/com/stratio/sparta/properties/ValidatePropertiesMap.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta.properties 7 | 8 | class ValidatePropertiesMap [K, V](val m: Map[K, V]){ 9 | def getString(key: K): String = 10 | m.get(key) match { 11 | case Some(value: String) => value 12 | case Some(value) => value.toString 13 | case None => 14 | throw new IllegalStateException(s"$key is mandatory") 15 | } 16 | 17 | def notBlank(option: Option[String]): Boolean = 18 | option.map(_.trim).forall(_.isEmpty) 19 | } 20 | 21 | class NotBlankOption(s: Option[String]) { 22 | def notBlank: Option[String] = s.map(_.trim).filterNot(_.isEmpty) 23 | } 24 | 25 | object ValidatingPropertyMap{ 26 | implicit def map2ValidatingPropertyMap[K, V](m: Map[K, V]): ValidatePropertiesMap[K, V] = 27 | new ValidatePropertiesMap[K, V](m) 28 | 29 | implicit def option2NotBlankOption(s: Option[String]): NotBlankOption = new NotBlankOption(s) 30 | } -------------------------------------------------------------------------------- 
/rocket-extensions/old-extensions/transformation-lite-xd/src/main/scala/com/stratio/sparta/transformations/repartition/RepartitionXDLiteTransformStepBatch.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta.transformations.repartition 7 | 8 | import java.io.{Serializable => JSerializable} 9 | 10 | import com.stratio.sparta.sdk.lite.batch.models._ 11 | import com.stratio.sparta.sdk.lite.xd.batch._ 12 | import org.apache.spark.sql.crossdata.XDSession 13 | 14 | class RepartitionXDLiteTransformStepBatch( 15 | xdSession: XDSession, 16 | properties: Map[String, String] 17 | ) extends LiteCustomXDBatchTransform(xdSession, properties) { 18 | 19 | override def transform(inputData: Map[String, ResultBatchData]): OutputBatchTransformData = { 20 | val inputStream = inputData.head._2.data 21 | 22 | OutputBatchTransformData(inputStream.repartition(5)) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /rocket-extensions/old-extensions/transformation-lite-xd/src/main/scala/com/stratio/sparta/transformations/repartition/RepartitionXDLiteTransformStepStreaming.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 
3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta.transformations.repartition 7 | 8 | import com.stratio.sparta.sdk.lite.streaming.models._ 9 | import com.stratio.sparta.sdk.lite.xd.streaming._ 10 | import org.apache.spark.sql.crossdata.XDSession 11 | import org.apache.spark.streaming.StreamingContext 12 | 13 | class RepartitionXDLiteTransformStepStreaming( 14 | xdSession: XDSession, 15 | streamingContext: StreamingContext, 16 | properties: Map[String, String] 17 | ) extends LiteCustomXDStreamingTransform(xdSession, streamingContext, properties) { 18 | 19 | override def transform(inputData: Map[String, ResultStreamingData]): OutputStreamingTransformData = { 20 | val newStream = inputData.head._2.data.transform { rdd => 21 | rdd.repartition(5) 22 | } 23 | 24 | OutputStreamingTransformData(newStream) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /rocket-extensions/old-extensions/transformation-lite-xd/src/test/scala/com/stratio/sparta/transformations/tokenizer/TokenizerTransformStepBatchTest.scala: -------------------------------------------------------------------------------- 1 | package com.stratio.sparta.transformations.tokenizer 2 | 3 | import com.stratio.sparta.sdk.lite.batch.models.ResultBatchData 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.sql.Row 6 | import org.apache.spark.sql.crossdata.XDSession 7 | import org.apache.spark.sql.types.{StringType, StructField, StructType} 8 | import org.apache.spark.{SparkConf, SparkContext} 9 | import org.junit.runner.RunWith 10 | import 
org.scalatest.junit.JUnitRunner 11 | import org.scalatest.{BeforeAndAfterAll, FlatSpec, Matchers} 12 | 13 | @RunWith(classOf[JUnitRunner]) 14 | class TokenizerTransformStepBatchTest extends FlatSpec with Matchers with BeforeAndAfterAll { 15 | val sc = SparkContext.getOrCreate(new SparkConf().setAppName("test").setMaster("local[1]")) 16 | val xdSession: XDSession = XDSession.builder().config(sc.getConf).create("dummyUser") 17 | 18 | val names = "jose,perez" 19 | val inputField = "raw" 20 | val inputSchema = StructType(Seq(StructField(inputField, StringType))) 21 | val dataIn: RDD[Row] = sc.parallelize(Seq(Row(names))) 22 | 23 | val properties = Map( 24 | "charPattern" -> ",", 25 | "inputField" -> "raw", 26 | "outputField1" -> "firstName", 27 | "outputField2" -> "lastName" 28 | ) 29 | 30 | val inBatch = ResultBatchData(dataIn, Option(inputSchema)) 31 | val tokenizer = new TokenizerTransformStepBatch(xdSession, properties) 32 | 33 | "TokenizerTransformStepBatch" should "tokenize the data in and return two values" in { 34 | val result = tokenizer.transform(Map("custom-transform" -> inBatch)).data.first().toSeq 35 | 36 | result.size shouldBe 2 37 | } 38 | } -------------------------------------------------------------------------------- /rocket-extensions/old-extensions/udf/src/main/scala/com/stratio/sparta/SpartaExampleUDFs.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | 7 | package com.stratio.sparta 8 | 9 | import com.stratio.sparta.sdk.lite.common.SpartaUDF 10 | import org.apache.spark.sql.catalyst.ScalaReflection 11 | import org.apache.spark.sql.expressions.UserDefinedFunction 12 | import org.apache.spark.sql.types.StringType 13 | 14 | import scala.util.Try 15 | 16 | case class ToUpperCaseUDF() extends SpartaUDF { 17 | 18 | val name = "uppercaseSparta" 19 | 20 | val upper: String => String = _.toUpperCase 21 | 22 | val userDefinedFunction: UserDefinedFunction = 23 | UserDefinedFunction(upper , StringType, Option(Seq(StringType))) 24 | } 25 | 26 | case class ConcatUDF() extends SpartaUDF { 27 | 28 | val name = "concatSparta" 29 | 30 | val upper: (String, String) => String = { case (str1, str2) => 31 | s"$str1/$str2" 32 | } 33 | 34 | val userDefinedFunction: UserDefinedFunction = 35 | UserDefinedFunction(upper , StringType, Option(Seq(StringType, StringType))) 36 | } 37 | 38 | case class ToUpperCaseWithReflectionUDF() extends SpartaUDF { 39 | 40 | val name = "upperCaseReflect" 41 | 42 | val upper: String => String = _.toUpperCase 43 | 44 | val userDefinedFunction: UserDefinedFunction = { 45 | val inputTypes = Try(ScalaReflection.schemaFor(ScalaReflection.localTypeOf[String]).dataType :: Nil).toOption 46 | UserDefinedFunction(upper , ScalaReflection.schemaFor(ScalaReflection.localTypeOf[String]).dataType, inputTypes) 47 | } 48 | } -------------------------------------------------------------------------------- /rocket-extensions/old-extensions/udf/src/main/scala/org/apache/spark/sql/sparta/udf/GetDenseVectorUDF.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 
3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | 7 | package org.apache.spark.sql.sparta.udf 8 | 9 | import com.stratio.sparta.sdk.lite.common.SpartaUDF 10 | import org.apache.spark.sql.expressions.UserDefinedFunction 11 | import org.apache.spark.sql.types.{DoubleType, IntegerType} 12 | import org.apache.spark.ml.linalg.VectorUDT 13 | 14 | case object VectorUDT extends VectorUDT 15 | 16 | case class GetDenseVectorUDF() extends SpartaUDF { 17 | 18 | val name = "get_vector_ith_element" 19 | 20 | val getVectorElement = (vector: org.apache.spark.ml.linalg.Vector, num: Int) => vector(num) 21 | 22 | val userDefinedFunction: UserDefinedFunction = 23 | UserDefinedFunction(getVectorElement , DoubleType, Option(Seq(VectorUDT, IntegerType))) 24 | } 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-1.0.0-SDK/input-lite-xd/src/main/scala/com/stratio/sparta/GeneratorXDLiteInputStepBatch.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | package com.stratio.sparta 7 | 8 | import com.stratio.sparta.sdk.lite.batch.models._ 9 | import com.stratio.sparta.sdk.lite.validation.ValidationResult 10 | import com.stratio.sparta.sdk.lite.xd.batch._ 11 | import org.apache.spark.sql._ 12 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 13 | import org.apache.spark.sql.crossdata.XDSession 14 | import org.apache.spark.sql.types._ 15 | 16 | class GeneratorXDLiteInputStepBatch( 17 | xdSession: XDSession, 18 | properties: Map[String, String] 19 | ) 20 | extends LiteCustomXDBatchInput(xdSession, properties) { 21 | 22 | lazy val stringSchema = StructType(Seq(StructField("raw", StringType))) 23 | lazy val rawData: Option[String] = properties.get("raw").map(_.toString) 24 | 25 | 26 | override def validate(): ValidationResult = { 27 | var validation = ValidationResult(valid = true, messages = Seq.empty) 28 | 29 | if (rawData.isEmpty) { 30 | validation = ValidationResult( 31 | valid = false, 32 | messages = validation.messages :+ "Test data must be set inside the Option properties with an option key named 'raw'") 33 | } 34 | 35 | if (rawData.map(_.trim).forall(_.isEmpty)) { 36 | validation = ValidationResult( 37 | valid = false, 38 | messages = validation.messages :+ "Generated data cannot be an empty string") 39 | } 40 | validation 41 | } 42 | 43 | override def init(): ResultBatchData = { 44 | val register = Seq(new GenericRowWithSchema(Array(rawData.getOrElse("test-data")), stringSchema).asInstanceOf[Row]) 45 | val defaultRDD = xdSession.sparkContext.parallelize(register) 46 | 47 | ResultBatchData(defaultRDD, Option(stringSchema)) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-1.0.0-SDK/input-lite-xd/src/main/scala/com/stratio/sparta/GeneratorXDLiteInputStepStreaming.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en 
España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta 7 | 8 | import org.apache.spark.sql._ 9 | import org.apache.spark.streaming.dstream.DStream 10 | import org.apache.spark.rdd.RDD 11 | import org.apache.spark.sql.functions._ 12 | import com.stratio.sparta.sdk.lite.xd.streaming._ 13 | import com.stratio.sparta.sdk.lite.streaming.models._ 14 | import com.stratio.sparta.sdk.lite.validation.ValidationResult 15 | import org.apache.spark.sql.crossdata.XDSession 16 | 17 | import scala.util.{Failure, Success, Try} 18 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 19 | import org.apache.spark.sql.types._ 20 | import org.apache.spark.streaming.StreamingContext 21 | 22 | import scala.collection.mutable 23 | 24 | class GeneratorXDLiteInputStepStreaming( 25 | xdSession: XDSession, 26 | streamingContext: StreamingContext, 27 | properties: Map[String, String] 28 | ) 29 | extends LiteCustomXDStreamingInput(xdSession, streamingContext, properties) { 30 | 31 | lazy val stringSchema = StructType(Seq(StructField("raw", StringType))) 32 | lazy val rawData: Option[String] = properties.get("raw").map(_.toString) 33 | 34 | 35 | override def validate(): ValidationResult = { 36 | var validation = ValidationResult(valid = true, messages = Seq.empty) 37 | 38 | if (rawData.isEmpty) { 39 | validation = ValidationResult( 40 | valid = false, 41 | messages = validation.messages :+ "Test data must be set inside the Option properties with an option key named 'raw'") 42 | } 43 | 44 | if (rawData.map(_.trim).forall(_.isEmpty)) { 45 | validation = 
ValidationResult( 46 | valid = false, 47 | messages = validation.messages :+ "Generated data cannot be an empty string") 48 | } 49 | validation 50 | } 51 | 52 | override def init(): ResultStreamingData = { 53 | val dataQueue = new mutable.Queue[RDD[Row]]() 54 | val register = Seq(new GenericRowWithSchema(Array(rawData.getOrElse("test-data")), stringSchema).asInstanceOf[Row]) 55 | dataQueue += xdSession.sparkContext.parallelize(register) 56 | val stream = streamingContext.queueStream(dataQueue) 57 | 58 | ResultStreamingData(stream, Option(stringSchema)) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-1.0.0-SDK/output-lite-xd/src/main/scala/com/stratio/sparta/LoggerXDLiteOutputStep.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | package com.stratio.sparta 7 | 8 | import com.stratio.sparta.sdk.lite.common.models.OutputOptions 9 | import com.stratio.sparta.sdk.lite.xd.common._ 10 | import org.apache.spark.sql._ 11 | import org.apache.spark.sql.crossdata.XDSession 12 | 13 | import scala.util.{Failure, Success, Try} 14 | 15 | class LoggerXDLiteOutputStep( 16 | xdSession: XDSession, 17 | properties: Map[String, String] 18 | ) 19 | extends LiteCustomXDOutput(xdSession, properties) { 20 | 21 | lazy val metadataEnabled = properties.get("metadataEnabled") match { 22 | case Some(value: String) => Try(value.toBoolean) match { 23 | case Success(v) => v 24 | case Failure(ex) => throw new IllegalStateException(s"$value not parseable to boolean", ex) 25 | } 26 | case None => throw new IllegalStateException("'metadataEnabled' key must be defined in the Option properties") 27 | } 28 | 29 | override def save(data: DataFrame, outputOptions: OutputOptions): Unit = { 30 | val tableName = outputOptions.tableName.getOrElse{ 31 | logger.error("Table name not defined") 32 | throw new NoSuchElementException("tableName not found in options")} 33 | 34 | if (metadataEnabled){ 35 | logger.info(s"Table name: $tableName") 36 | logger.info(s"Save mode is set to ${outputOptions.saveMode}") 37 | } 38 | data.foreach{ row => 39 | println(row.mkString(",")) 40 | } 41 | } 42 | 43 | override def save(data: DataFrame, saveMode: String, saveOptions: Map[String, String]): Unit = () 44 | } 45 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-1.0.0-SDK/packaged-jars/custom-lite-input-xd-1.0.0-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-extensions/rocket-1.0.0-SDK/packaged-jars/custom-lite-input-xd-1.0.0-SNAPSHOT.jar -------------------------------------------------------------------------------- 
/rocket-extensions/rocket-1.0.0-SDK/packaged-jars/custom-lite-output-xd-1.0.0-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-extensions/rocket-1.0.0-SDK/packaged-jars/custom-lite-output-xd-1.0.0-SNAPSHOT.jar -------------------------------------------------------------------------------- /rocket-extensions/rocket-1.0.0-SDK/packaged-jars/custom-lite-transformation-xd-1.0.0-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-extensions/rocket-1.0.0-SDK/packaged-jars/custom-lite-transformation-xd-1.0.0-SNAPSHOT.jar -------------------------------------------------------------------------------- /rocket-extensions/rocket-1.0.0-SDK/transformation-lite-xd/src/main/scala/com/stratio/sparta/properties/ValidatePropertiesMap.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | package com.stratio.sparta.properties 7 | 8 | class ValidatePropertiesMap [K, V](val m: Map[K, V]){ 9 | def getString(key: K): String = 10 | m.get(key) match { 11 | case Some(value: String) => value 12 | case Some(value) => value.toString 13 | case None => 14 | throw new IllegalStateException(s"$key is mandatory") 15 | } 16 | 17 | def notBlank(option: Option[String]): Boolean = // true iff defined and non-blank after trimming (was inverted: forall(_.isEmpty)) 18 | option.map(_.trim).exists(_.nonEmpty) 19 | } 20 | 21 | class NotBlankOption(s: Option[String]) { 22 | def notBlank: Option[String] = s.map(_.trim).filterNot(_.isEmpty) 23 | } 24 | 25 | object ValidatingPropertyMap{ 26 | implicit def map2ValidatingPropertyMap[K, V](m: Map[K, V]): ValidatePropertiesMap[K, V] = 27 | new ValidatePropertiesMap[K, V](m) 28 | 29 | implicit def option2NotBlankOption(s: Option[String]): NotBlankOption = new NotBlankOption(s) 30 | } -------------------------------------------------------------------------------- /rocket-extensions/rocket-1.0.0-SDK/transformation-lite-xd/src/main/scala/com/stratio/sparta/transformations/repartition/RepartitionXDLiteTransformStepBatch.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | package com.stratio.sparta.transformations.repartition 7 | 8 | import java.io.{Serializable => JSerializable} 9 | 10 | import com.stratio.sparta.sdk.lite.batch.models._ 11 | import com.stratio.sparta.sdk.lite.xd.batch._ 12 | import org.apache.spark.sql.crossdata.XDSession 13 | 14 | class RepartitionXDLiteTransformStepBatch( 15 | xdSession: XDSession, 16 | properties: Map[String, String] 17 | ) extends LiteCustomXDBatchTransform(xdSession, properties) { 18 | 19 | override def transform(inputData: Map[String, ResultBatchData]): OutputBatchTransformData = { 20 | val inputStream = inputData.head._2.data 21 | 22 | OutputBatchTransformData(inputStream.repartition(5)) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-1.0.0-SDK/transformation-lite-xd/src/main/scala/com/stratio/sparta/transformations/repartition/RepartitionXDLiteTransformStepStreaming.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | package com.stratio.sparta.transformations.repartition 7 | 8 | import com.stratio.sparta.sdk.lite.streaming.models._ 9 | import com.stratio.sparta.sdk.lite.xd.streaming._ 10 | import org.apache.spark.sql.crossdata.XDSession 11 | import org.apache.spark.streaming.StreamingContext 12 | 13 | class RepartitionXDLiteTransformStepStreaming( 14 | xdSession: XDSession, 15 | streamingContext: StreamingContext, 16 | properties: Map[String, String] 17 | ) extends LiteCustomXDStreamingTransform(xdSession, streamingContext, properties) { 18 | 19 | override def transform(inputData: Map[String, ResultStreamingData]): OutputStreamingTransformData = { 20 | val newStream = inputData.head._2.data.transform { rdd => 21 | rdd.repartition(5) 22 | } 23 | 24 | OutputStreamingTransformData(newStream) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-1.0.0-SDK/transformation-lite-xd/src/test/scala/com/stratio/sparta/transformations/tokenizer/TokenizerTransformStepBatchTest.scala: -------------------------------------------------------------------------------- 1 | package com.stratio.sparta.transformations.tokenizer 2 | 3 | import com.stratio.sparta.sdk.lite.batch.models.ResultBatchData 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.sql.Row 6 | import org.apache.spark.sql.crossdata.XDSession 7 | import org.apache.spark.sql.types.{StringType, StructField, StructType} 8 | import org.apache.spark.{SparkConf, SparkContext} 9 | import org.junit.runner.RunWith 10 | import org.scalatest.junit.JUnitRunner 11 | import org.scalatest.{BeforeAndAfterAll, FlatSpec, Matchers} 12 | 13 | @RunWith(classOf[JUnitRunner]) 14 | class TokenizerTransformStepBatchTest extends FlatSpec with Matchers with BeforeAndAfterAll { 15 | val sc = SparkContext.getOrCreate(new SparkConf().setAppName("test").setMaster("local[1]")) 16 | val xdSession: XDSession = XDSession.builder().config(sc.getConf).create("dummyUser") 17 | 
18 | val names = "jose,perez" 19 | val inputField = "raw" 20 | val inputSchema = StructType(Seq(StructField(inputField, StringType))) 21 | val dataIn: RDD[Row] = sc.parallelize(Seq(Row(names))) 22 | 23 | val properties = Map( 24 | "charPattern" -> ",", 25 | "inputField" -> "raw", 26 | "outputField1" -> "firstName", 27 | "outputField2" -> "lastName" 28 | ) 29 | 30 | val inBatch = ResultBatchData(dataIn, Option(inputSchema)) 31 | val tokenizer = new TokenizerTransformStepBatch(xdSession, properties) 32 | 33 | "TokenizerTransformStepBatch" should "tokenize the data in and return two values" in { 34 | val result = tokenizer.transform(Map("custom-transform" -> inBatch)).data.first().toSeq 35 | 36 | result.size shouldBe 2 37 | } 38 | } -------------------------------------------------------------------------------- /rocket-extensions/rocket-1.1.0-SDK/input-lite-xd/src/main/scala/com/stratio/sparta/GeneratorXDLiteInputStepBatch.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | package com.stratio.sparta 7 | 8 | import com.stratio.sparta.sdk.lite.batch.models._ 9 | import com.stratio.sparta.sdk.lite.validation.ValidationResult 10 | import com.stratio.sparta.sdk.lite.xd.batch._ 11 | import org.apache.spark.sql._ 12 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 13 | import org.apache.spark.sql.crossdata.XDSession 14 | import org.apache.spark.sql.types._ 15 | 16 | class GeneratorXDLiteInputStepBatch( 17 | xdSession: XDSession, 18 | properties: Map[String, String] 19 | ) 20 | extends LiteCustomXDBatchInput(xdSession, properties) { 21 | 22 | lazy val stringSchema = StructType(Seq(StructField("raw", StringType))) 23 | lazy val rawData: Option[String] = properties.get("raw").map(_.toString) 24 | 25 | 26 | override def validate(): ValidationResult = { 27 | var validation = ValidationResult(valid = true, messages = Seq.empty) 28 | 29 | if (rawData.isEmpty) { 30 | validation = ValidationResult( 31 | valid = false, 32 | messages = validation.messages :+ "Test data must be set inside the Option properties with an option key named 'raw'") 33 | } 34 | 35 | if (rawData.map(_.trim).forall(_.isEmpty)) { 36 | validation = ValidationResult( 37 | valid = false, 38 | messages = validation.messages :+ "Generated data cannot be an empty string") 39 | } 40 | validation 41 | } 42 | 43 | override def init(): ResultBatchData = { 44 | val register = Seq(new GenericRowWithSchema(Array(rawData.getOrElse("test-data")), stringSchema).asInstanceOf[Row]) 45 | val defaultRDD = xdSession.sparkContext.parallelize(register) 46 | 47 | ResultBatchData(defaultRDD, Option(stringSchema)) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-1.1.0-SDK/input-lite-xd/src/main/scala/com/stratio/sparta/GeneratorXDLiteInputStepStreaming.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en 
España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta 7 | 8 | import org.apache.spark.sql._ 9 | import org.apache.spark.streaming.dstream.DStream 10 | import org.apache.spark.rdd.RDD 11 | import org.apache.spark.sql.functions._ 12 | import com.stratio.sparta.sdk.lite.xd.streaming._ 13 | import com.stratio.sparta.sdk.lite.streaming.models._ 14 | import com.stratio.sparta.sdk.lite.validation.ValidationResult 15 | import org.apache.spark.sql.crossdata.XDSession 16 | 17 | import scala.util.{Failure, Success, Try} 18 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 19 | import org.apache.spark.sql.types._ 20 | import org.apache.spark.streaming.StreamingContext 21 | 22 | import scala.collection.mutable 23 | 24 | class GeneratorXDLiteInputStepStreaming( 25 | xdSession: XDSession, 26 | streamingContext: StreamingContext, 27 | properties: Map[String, String] 28 | ) 29 | extends LiteCustomXDStreamingInput(xdSession, streamingContext, properties) { 30 | 31 | lazy val stringSchema = StructType(Seq(StructField("raw", StringType))) 32 | lazy val rawData: Option[String] = properties.get("raw").map(_.toString) 33 | 34 | 35 | override def validate(): ValidationResult = { 36 | var validation = ValidationResult(valid = true, messages = Seq.empty) 37 | 38 | if (rawData.isEmpty) { 39 | validation = ValidationResult( 40 | valid = false, 41 | messages = validation.messages :+ "Test data must be set inside the Option properties with an option key named 'raw'") 42 | } 43 | 44 | if (rawData.map(_.trim).forall(_.isEmpty)) { 45 | validation = 
ValidationResult( 46 | valid = false, 47 | messages = validation.messages :+ "Generated data cannot be an empty string") 48 | } 49 | validation 50 | } 51 | 52 | override def init(): ResultStreamingData = { 53 | val dataQueue = new mutable.Queue[RDD[Row]]() 54 | val register = Seq(new GenericRowWithSchema(Array(rawData.getOrElse("test-data")), stringSchema).asInstanceOf[Row]) 55 | dataQueue += xdSession.sparkContext.parallelize(register) 56 | val stream = streamingContext.queueStream(dataQueue) 57 | 58 | ResultStreamingData(stream, Option(stringSchema)) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-2.2.0-SDK/input-lite-xd/src/main/scala/com/stratio/sparta/GeneratorXDLiteInputStepBatch.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | package com.stratio.sparta 7 | 8 | import com.stratio.sparta.sdk.lite.batch.models._ 9 | import com.stratio.sparta.sdk.lite.validation.ValidationResult 10 | import com.stratio.sparta.sdk.lite.xd.batch._ 11 | import org.apache.spark.sql._ 12 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 13 | import org.apache.spark.sql.crossdata.XDSession 14 | import org.apache.spark.sql.types._ 15 | 16 | class GeneratorXDLiteInputStepBatch( 17 | xdSession: XDSession, 18 | properties: Map[String, String] 19 | ) 20 | extends LiteCustomXDBatchInput(xdSession, properties) { 21 | 22 | lazy val stringSchema = StructType(Seq(StructField("raw", StringType))) 23 | lazy val rawData: Option[String] = properties.get("raw").map(_.toString) 24 | 25 | 26 | override def validate(): ValidationResult = { 27 | var validation = ValidationResult(valid = true, messages = Seq.empty, warnings = Seq.empty) 28 | 29 | if (rawData.isEmpty) { 30 | validation = ValidationResult( 31 | valid = false, 32 | messages = validation.messages :+ "Test data must be set inside the Option properties with an option key named 'raw'") 33 | } 34 | 35 | if (rawData.map(_.trim).forall(_.isEmpty)) { 36 | validation = ValidationResult( 37 | valid = false, 38 | messages = validation.messages :+ "Generated data cannot be an empty string") 39 | } 40 | validation 41 | } 42 | 43 | override def init(): ResultBatchData = { 44 | val register = Seq(new GenericRowWithSchema(Array("test-data"), stringSchema).asInstanceOf[Row]) 45 | val defaultRDD = xdSession.sparkContext.parallelize(register) 46 | 47 | // · Reporting messages 48 | reportInfoLog(phase="init", s"Generated data: $register") 49 | 50 | ResultBatchData(defaultRDD, Option(stringSchema)) 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-2.2.0-SDK/input-lite-xd/src/main/scala/com/stratio/sparta/GeneratorXDLiteInputStepStreaming.scala: 
/**
 * Streaming input step that generates a one-row queue stream from the 'raw'
 * option property.
 *
 * @param xdSession        Crossdata session provided by the runtime
 * @param streamingContext Spark streaming context provided by the runtime
 * @param properties       step options; requires a non-blank value under the key "raw"
 */
class GeneratorXDLiteInputStepStreaming(
                                         xdSession: XDSession,
                                         streamingContext: StreamingContext,
                                         properties: Map[String, String]
                                       )
  extends LiteCustomXDStreamingInput(xdSession, streamingContext, properties) {

  // Single-column schema of the generated records.
  lazy val stringSchema = StructType(Seq(StructField("raw", StringType)))
  // Payload configured through the step options.
  lazy val rawData: Option[String] = properties.get("raw").map(_.toString)

  /** Checks that 'raw' is present and non-blank; one message per failed check. */
  override def validate(): ValidationResult = {
    val failures = Seq(
      rawData.isEmpty ->
        "Test data must be set inside the Option properties with an option key named 'raw'",
      rawData.map(_.trim).forall(_.isEmpty) ->
        "Generated data cannot be an empty string"
    ).collect { case (failed, message) if failed => message }

    ValidationResult(valid = failures.isEmpty, messages = failures, warnings = Seq.empty)
  }

  /** Builds a queue stream holding a single one-row RDD with the configured payload. */
  override def init(): ResultStreamingData = {
    val record: Row = new GenericRowWithSchema(Array(rawData.getOrElse("test-data")), stringSchema)
    val register = Seq(record)
    val dataQueue = mutable.Queue(xdSession.sparkContext.parallelize(register))
    val stream = streamingContext.queueStream(dataQueue)

    // · Reporting messages
    reportInfoLog(phase = "init", s"Generated data: $register")

    ResultStreamingData(stream, Option(stringSchema))
  }
}
/** Batch input step used to exercise the info/warn/error report-log SDK hooks. */
class ReportLogTestXDLiteInputStepBatch(
                                         xdSession: XDSession,
                                         properties: Map[String, String]
                                       )
  extends LiteCustomXDBatchInput(xdSession, properties) {

  // Single-column schema of the generated record.
  lazy val stringSchema = StructType(Seq(StructField("raw", StringType)))
  // Payload configured through the step options.
  lazy val rawData: Option[String] = properties.get("raw").map(_.toString)

  /** Checks that 'raw' is present and non-blank; one message per failed check. */
  override def validate(): ValidationResult = {
    val failures = Seq(
      rawData.isEmpty ->
        "Test data must be set inside the Option properties with an option key named 'raw'",
      rawData.map(_.trim).forall(_.isEmpty) ->
        "Generated data cannot be an empty string"
    ).collect { case (failed, message) if failed => message }

    ValidationResult(valid = failures.isEmpty, messages = failures, warnings = Seq.empty)
  }

  /** Emits one row and reports the same message at all three supported log levels. */
  override def init(): ResultBatchData = {
    val record: Row = new GenericRowWithSchema(Array(rawData.getOrElse("test-data")), stringSchema)
    val register = Seq(record)

    // · Reporting messages at every level — the purpose of this test step
    reportInfoLog(phase = "init", s"Generated data: $register")
    reportWarnLog(phase = "init", s"Generated data: $register")
    reportErrorLog(phase = "init", s"Generated data: $register")

    ResultBatchData(xdSession.sparkContext.parallelize(register), Option(stringSchema))
  }
}
/** Streaming input step used to exercise the info/warn/error report-log SDK hooks. */
class ReportLogTestXDLiteInputStepStreaming(
                                             xdSession: XDSession,
                                             streamingContext: StreamingContext,
                                             properties: Map[String, String]
                                           )
  extends LiteCustomXDStreamingInput(xdSession, streamingContext, properties) {

  // Single-column schema of the generated records.
  lazy val stringSchema = StructType(Seq(StructField("raw", StringType)))
  // Payload configured through the step options.
  lazy val rawData: Option[String] = properties.get("raw").map(_.toString)

  /** Checks that 'raw' is present and non-blank; one message per failed check. */
  override def validate(): ValidationResult = {
    val failures = Seq(
      rawData.isEmpty ->
        "Test data must be set inside the Option properties with an option key named 'raw'",
      rawData.map(_.trim).forall(_.isEmpty) ->
        "Generated data cannot be an empty string"
    ).collect { case (failed, message) if failed => message }

    ValidationResult(valid = failures.isEmpty, messages = failures, warnings = Seq.empty)
  }

  /** Emits one queued RDD and reports the same message at all three log levels. */
  override def init(): ResultStreamingData = {
    val record: Row = new GenericRowWithSchema(Array(rawData.getOrElse("test-data")), stringSchema)
    val register = Seq(record)
    val dataQueue = mutable.Queue(xdSession.sparkContext.parallelize(register))
    val stream = streamingContext.queueStream(dataQueue)

    // · Reporting messages at every level — the purpose of this test step
    reportInfoLog(phase = "init", s"Generated data: $register")
    reportWarnLog(phase = "init", s"Generated data: $register")
    reportErrorLog(phase = "init", s"Generated data: $register")

    ResultStreamingData(stream, Option(stringSchema))
  }
}
/**
 * Output step that prints every row to stdout and, optionally, reports table
 * metadata through the SDK report-log hooks.
 *
 * @param xdSession  Crossdata session provided by the runtime
 * @param properties step options; requires the boolean key "metadataEnabled"
 */
class LoggerXDLiteOutputStep(
                              xdSession: XDSession,
                              properties: Map[String, String]
                            )
  extends LiteCustomXDOutput(xdSession, properties) {

  // Fails fast with a clear message when the mandatory flag is missing or unparseable.
  lazy val metadataEnabled = properties.get("metadataEnabled") match {
    case Some(value: String) => Try(value.toBoolean) match {
      case Success(v) => v
      case Failure(ex) => throw new IllegalStateException(s"$value not parseable to boolean", ex)
    }
    case None => throw new IllegalStateException("'metadataEnabled' key must be defined in the Option properties")
  }

  override def save(data: DataFrame, outputOptions: OutputOptions): Unit = {
    val tableName = outputOptions.tableName.getOrElse {
      logger.error("Table name not defined")
      throw new NoSuchElementException("tableName not found in options")
    }

    if (metadataEnabled) {
      val info1 = s"Table name: $tableName"
      logger.info(info1)
      // BUGFIX: these reports are emitted during save, not init — the phase
      // label previously said "init" (copy-paste from the input steps).
      reportInfoLog(phase = "save", msg = info1)

      val info2 = s"Save mode is set to ${outputOptions.saveMode}"
      logger.info(info2)
      reportInfoLog(phase = "save", msg = info2)
    }
    // NOTE(review): foreach runs on the executors, so these println calls land
    // in executor stdout rather than the driver log — confirm that is intended.
    data.foreach { row =>
      println(row.mkString(","))
    }
  }

  override def save(data: DataFrame, saveMode: String, saveOptions: Map[String, String]): Unit = ()
}
https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-extensions/rocket-2.2.0-SDK/packaged-jars/custom-lite-input-xd-2.2.0-SNAPSHOT.jar -------------------------------------------------------------------------------- /rocket-extensions/rocket-2.2.0-SDK/packaged-jars/custom-lite-output-xd-2.2.0-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-extensions/rocket-2.2.0-SDK/packaged-jars/custom-lite-output-xd-2.2.0-SNAPSHOT.jar -------------------------------------------------------------------------------- /rocket-extensions/rocket-2.2.0-SDK/packaged-jars/custom-lite-transformation-xd-2.2.0-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-extensions/rocket-2.2.0-SDK/packaged-jars/custom-lite-transformation-xd-2.2.0-SNAPSHOT.jar -------------------------------------------------------------------------------- /rocket-extensions/rocket-2.2.0-SDK/packaged-jars/custom-lite-udf-2.2.0-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-extensions/rocket-2.2.0-SDK/packaged-jars/custom-lite-udf-2.2.0-SNAPSHOT.jar -------------------------------------------------------------------------------- /rocket-extensions/rocket-2.2.0-SDK/transformation-lite-xd/src/main/scala/com/stratio/sparta/properties/ValidatePropertiesMap.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 
/**
 * Map wrapper adding validated, string-typed access to step property maps.
 *
 * @param m the wrapped property map
 */
class ValidatePropertiesMap[K, V](val m: Map[K, V]) {

  /**
   * Returns the value under `key` rendered as a String.
   *
   * @throws IllegalStateException when the key is absent
   */
  def getString(key: K): String =
    m.get(key) match {
      case Some(value: String) => value
      case Some(value) => value.toString
      case None =>
        throw new IllegalStateException(s"$key is mandatory")
    }

  /**
   * True when the option holds a non-blank string.
   *
   * BUGFIX: the previous implementation, `option.map(_.trim).forall(_.isEmpty)`,
   * returned `true` exactly when the option was undefined or blank — the inverse
   * of what the name promises (and of NotBlankOption.notBlank below).
   */
  def notBlank(option: Option[String]): Boolean =
    option.exists(_.trim.nonEmpty)
}

/** Enrichment turning a blank-or-empty Option[String] into None. */
class NotBlankOption(s: Option[String]) {
  /** Trims the content and drops it entirely when nothing remains. */
  def notBlank: Option[String] = s.map(_.trim).filterNot(_.isEmpty)
}

/** Implicit conversions exposing the two enrichments above. */
object ValidatingPropertyMap {
  implicit def map2ValidatingPropertyMap[K, V](m: Map[K, V]): ValidatePropertiesMap[K, V] =
    new ValidatePropertiesMap[K, V](m)

  implicit def option2NotBlankOption(s: Option[String]): NotBlankOption = new NotBlankOption(s)
}
/** Batch transform that appends a constant integer column named "newCol". */
class AddColumnXDLiteTransformStepBatch(
                                         xdSession: XDSession,
                                         properties: Map[String, String]
                                       ) extends LiteCustomXDBatchTransform(xdSession, properties) {

  override def transform(inputData: Map[String, ResultBatchData]): OutputBatchTransformData = {
    // Only the first input step is considered; fall back to an empty schema.
    val firstInput = inputData.head._2
    val inputSchema = firstInput.schema.getOrElse(new StructType())

    // Lift to a DataFrame, append the literal column, and hand back rdd + schema.
    val enriched = xdSession
      .createDataFrame(firstInput.data, inputSchema)
      .withColumn("newCol", lit(2))

    OutputBatchTransformData(enriched.rdd, Option(enriched.schema))
  }
}
/** Batch transform that forces the input RDD into exactly 5 partitions. */
class RepartitionXDLiteTransformStepBatch(
                                           xdSession: XDSession,
                                           properties: Map[String, String]
                                         ) extends LiteCustomXDBatchTransform(xdSession, properties) {

  override def transform(inputData: Map[String, ResultBatchData]): OutputBatchTransformData =
    // Only the first input step is considered; no schema is forwarded.
    OutputBatchTransformData(inputData.head._2.data.repartition(5))
}
/** Streaming transform that forces every micro-batch RDD into exactly 5 partitions. */
class RepartitionXDLiteTransformStepStreaming(
                                               xdSession: XDSession,
                                               streamingContext: StreamingContext,
                                               properties: Map[String, String]
                                             ) extends LiteCustomXDStreamingTransform(xdSession, streamingContext, properties) {

  override def transform(inputData: Map[String, ResultStreamingData]): OutputStreamingTransformData =
    // Only the first input step is considered; repartition each batch as it arrives.
    OutputStreamingTransformData(inputData.head._2.data.transform(_.repartition(5)))
}
18 | val names = "jose,perez" 19 | val inputField = "raw" 20 | val inputSchema = StructType(Seq(StructField(inputField, StringType))) 21 | val dataIn: RDD[Row] = sc.parallelize(Seq(Row(names))) 22 | 23 | val properties = Map( 24 | "charPattern" -> ",", 25 | "inputField" -> "raw", 26 | "outputField1" -> "firstName", 27 | "outputField2" -> "lastName" 28 | ) 29 | 30 | val inBatch = ResultBatchData(dataIn, Option(inputSchema)) 31 | val tokenizer = new TokenizerTransformStepBatch(xdSession, properties) 32 | 33 | "TokenizerTransformStepBatch" should "tokenize the data in and return two values" in { 34 | val result = tokenizer.transform(Map("custom-transform" -> inBatch)).data.first().toSeq 35 | 36 | result.size shouldBe 2 37 | } 38 | } -------------------------------------------------------------------------------- /rocket-extensions/rocket-2.2.0-SDK/udf/src/main/scala/com/stratio/sparta/SpartaExampleUDFs.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
/** UDF exposing String.toUpperCase under the SQL name "uppercaseSparta". */
case class ToUpperCaseUDF() extends SpartaUDF {

  val name = "uppercaseSparta"

  // Kept as a named function value so the logic is testable apart from Spark.
  val upper: String => String = value => value.toUpperCase

  val userDefinedFunction: UserDefinedFunction = udf(upper)
}

/** UDF joining two strings with a '/' separator, exposed as "concatSparta". */
case class ConcatUDF() extends SpartaUDF {

  val name = "concatSparta"

  val concat: (String, String) => String = (left, right) => s"$left/$right"

  val userDefinedFunction: UserDefinedFunction = udf(concat)
}
// Concrete, importable instance of Spark ML's VectorUDT (the imported class is its supertype).
case object VectorUDT extends VectorUDT

/** UDF returning the i-th element of an ML vector, exposed as "get_vector_ith_element". */
case class GetDenseVectorUDF() extends SpartaUDF {

  val name = "get_vector_ith_element"

  // Vector.apply handles both dense and sparse representations.
  val getVectorElement = (vector: org.apache.spark.ml.linalg.Vector, index: Int) => vector(index)

  val userDefinedFunction: UserDefinedFunction = udf(getVectorElement)
}
/**
 * Batch input step that generates a single-row RDD from the 'raw' option property.
 *
 * @param xdSession  Crossdata session provided by the runtime
 * @param properties step options; requires a non-blank value under the key "raw"
 */
class GeneratorXDLiteInputStepBatch(
                                     xdSession: XDSession,
                                     properties: Map[String, String]
                                   )
  extends LiteCustomXDBatchInput(xdSession, properties) {

  // Single-column schema of the generated record.
  lazy val stringSchema = StructType(Seq(StructField("raw", StringType)))
  // Payload configured through the step options.
  lazy val rawData: Option[String] = properties.get("raw").map(_.toString)

  /**
   * Validates that the 'raw' property is present and not blank.
   * Each failed check appends its own message.
   */
  override def validate(): ValidationResult = {
    var validation = ValidationResult(valid = true, messages = Seq.empty, warnings = Seq.empty)

    if (rawData.isEmpty) {
      validation = ValidationResult(
        valid = false,
        messages = validation.messages :+ "Test data must be set inside the Option properties with an option key named 'raw'")
    }

    // BUGFIX: use exists instead of forall so a *missing* 'raw' no longer also
    // reports the misleading "empty string" message on top of the missing-key one.
    if (rawData.exists(_.trim.isEmpty)) {
      validation = ValidationResult(
        valid = false,
        messages = validation.messages :+ "Generated data cannot be an empty string")
    }
    validation
  }

  /**
   * Builds a one-row RDD carrying the configured raw value.
   * BUGFIX: previously the hard-coded literal "test-data" was always emitted and
   * the validated 'raw' property was ignored, unlike the streaming counterpart
   * and the ReportLog steps which use rawData.getOrElse("test-data").
   */
  override def init(): ResultBatchData = {
    val register = Seq(new GenericRowWithSchema(Array(rawData.getOrElse("test-data")), stringSchema).asInstanceOf[Row])
    val defaultRDD = xdSession.sparkContext.parallelize(register)

    // · Reporting messages
    reportInfoLog(phase = "init", s"Generated data: $register")

    ResultBatchData(defaultRDD, Option(stringSchema))
  }
}
/** Batch input step used to exercise the info/warn/error report-log SDK hooks. */
class ReportLogTestXDLiteInputStepBatch(
                                         xdSession: XDSession,
                                         properties: Map[String, String]
                                       )
  extends LiteCustomXDBatchInput(xdSession, properties) {

  // Single-column schema of the generated record.
  lazy val stringSchema = StructType(Seq(StructField("raw", StringType)))
  // Payload configured through the step options.
  lazy val rawData: Option[String] = properties.get("raw").map(_.toString)

  /** Checks that 'raw' is present and non-blank; one message per failed check. */
  override def validate(): ValidationResult = {
    val failures = Seq(
      rawData.isEmpty ->
        "Test data must be set inside the Option properties with an option key named 'raw'",
      rawData.map(_.trim).forall(_.isEmpty) ->
        "Generated data cannot be an empty string"
    ).collect { case (failed, message) if failed => message }

    ValidationResult(valid = failures.isEmpty, messages = failures, warnings = Seq.empty)
  }

  /** Emits one row and reports the same message at all three supported log levels. */
  override def init(): ResultBatchData = {
    val record: Row = new GenericRowWithSchema(Array(rawData.getOrElse("test-data")), stringSchema)
    val register = Seq(record)

    // · Reporting messages at every level — the purpose of this test step
    reportInfoLog(phase = "init", s"Generated data: $register")
    reportWarnLog(phase = "init", s"Generated data: $register")
    reportErrorLog(phase = "init", s"Generated data: $register")

    ResultBatchData(xdSession.sparkContext.parallelize(register), Option(stringSchema))
  }
}
/** Streaming input step used to exercise the info/warn/error report-log SDK hooks. */
class ReportLogTestXDLiteInputStepStreaming(
                                             xdSession: XDSession,
                                             streamingContext: StreamingContext,
                                             properties: Map[String, String]
                                           )
  extends LiteCustomXDStreamingInput(xdSession, streamingContext, properties) {

  // Single-column schema of the generated records.
  lazy val stringSchema = StructType(Seq(StructField("raw", StringType)))
  // Payload configured through the step options.
  lazy val rawData: Option[String] = properties.get("raw").map(_.toString)

  /** Checks that 'raw' is present and non-blank; one message per failed check. */
  override def validate(): ValidationResult = {
    val failures = Seq(
      rawData.isEmpty ->
        "Test data must be set inside the Option properties with an option key named 'raw'",
      rawData.map(_.trim).forall(_.isEmpty) ->
        "Generated data cannot be an empty string"
    ).collect { case (failed, message) if failed => message }

    ValidationResult(valid = failures.isEmpty, messages = failures, warnings = Seq.empty)
  }

  /** Emits one queued RDD and reports the same message at all three log levels. */
  override def init(): ResultStreamingData = {
    val record: Row = new GenericRowWithSchema(Array(rawData.getOrElse("test-data")), stringSchema)
    val register = Seq(record)
    val dataQueue = mutable.Queue(xdSession.sparkContext.parallelize(register))
    val stream = streamingContext.queueStream(dataQueue)

    // · Reporting messages at every level — the purpose of this test step
    reportInfoLog(phase = "init", s"Generated data: $register")
    reportWarnLog(phase = "init", s"Generated data: $register")
    reportErrorLog(phase = "init", s"Generated data: $register")

    ResultStreamingData(stream, Option(stringSchema))
  }
}
/**
 * Example JDBC output step that also publishes data-lineage metadata extracted
 * from the step options.
 *
 * @param xdSession  Crossdata session provided by the runtime
 * @param properties step options; requires "url", optionally "service", "path",
 *                   "resource", "datastoreType", "driver" and "jdbc_*" keys
 */
class JdbcWithLineageXDLiteOutputStep(
                                       xdSession: XDSession,
                                       properties: Map[String, String]
                                     )
  extends LiteCustomXDOutput(xdSession, properties) {

  // Mandatory JDBC connection url.
  lazy val url = properties.getOrElse("url", throw new NoSuchElementException("The url property is mandatory"))

  // Lineage options, usually extracted from 'url' or other properties such as 'dbtable'
  override def lineageService(): Option[String] = properties.get("service")
  override def lineagePath(): Option[String] = properties.get("path")
  // If empty it will be populated by the system with the writer tableName
  override def lineageResource(): Option[String] = properties.get("resource")
  override def lineageDatastoreType(): Option[String] = properties.get("datastoreType")

  override def save(data: DataFrame, outputOptions: OutputOptions): Unit = {
    val tableName = outputOptions.tableName.getOrElse {
      logger.error("Table name not defined")
      throw new NoSuchElementException("tableName not found in options")
    }

    // Collect the JDBC driver options: "jdbc_"-prefixed keys (prefix stripped) plus "driver".
    val jdbcProperties = new Properties()
    properties
      .filterKeys(key => key.startsWith("jdbc_") || key.equals("driver"))
      // BUGFIX: stripPrefix removes only the leading "jdbc_"; replaceAll removed
      // the token anywhere inside the key.
      .foreach { case (key, value) => jdbcProperties.put(key.stripPrefix("jdbc_"), value) }

    // BUGFIX: these are informational traces, not failures — log at info level.
    logger.info(s"Connecting with table $tableName")
    logger.info(s"Connecting with properties $jdbcProperties")

    // The real write is intentionally commented out in this example step;
    // count() just forces evaluation of the incoming DataFrame.
    //data.write
    //  .mode(SaveMode.Append)
    //  .jdbc(url = url, table = tableName, connectionProperties = jdbcProperties)
    data.count()
  }

  override def save(data: DataFrame, saveMode: String, saveOptions: Map[String, String]): Unit = ()
}
logger.info(info1) 37 | // · Reporting messages 38 | reportInfoLog(phase="init", msg = info1) 39 | 40 | val info2 = s"Save mode is set to ${outputOptions.saveMode}" 41 | logger.info(info2) 42 | // · Reporting messages 43 | reportInfoLog(phase="init", msg = info2) 44 | } 45 | data.foreach{ row => 46 | println(row.mkString(",")) 47 | } 48 | } 49 | 50 | override def save(data: DataFrame, saveMode: String, saveOptions: Map[String, String]): Unit = () 51 | } 52 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-2.3.0-SDK/transformation-lite-xd/src/main/scala/com/stratio/sparta/properties/ValidatePropertiesMap.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | package com.stratio.sparta.properties 7 | 8 | class ValidatePropertiesMap [K, V](val m: Map[K, V]){ 9 | def getString(key: K): String = 10 | m.get(key) match { 11 | case Some(value: String) => value 12 | case Some(value) => value.toString 13 | case None => 14 | throw new IllegalStateException(s"$key is mandatory") 15 | } 16 | 17 | def notBlank(option: Option[String]): Boolean = 18 | option.map(_.trim).exists(_.nonEmpty) /* FIX: was 'forall(_.isEmpty)', which is vacuously true for None and true for blank strings - the exact inverse of what the name 'notBlank' promises (and of NotBlankOption.notBlank below) */ 19 | } 20 | 21 | class NotBlankOption(s: Option[String]) { 22 | def notBlank: Option[String] = s.map(_.trim).filterNot(_.isEmpty) 23 | } 24 | 25 | object ValidatingPropertyMap{ 26 | implicit def map2ValidatingPropertyMap[K, V](m: Map[K, V]): ValidatePropertiesMap[K, V] = 27 | new ValidatePropertiesMap[K, V](m) 28 | 29 | implicit def option2NotBlankOption(s: Option[String]): NotBlankOption = new NotBlankOption(s) 30 | } -------------------------------------------------------------------------------- /rocket-extensions/rocket-2.3.0-SDK/transformation-lite-xd/src/main/scala/com/stratio/sparta/transformations/column/AddColumnXDLiteTransformStepBatch.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España.
5 | */ 6 | package com.stratio.sparta.transformations.column 7 | 8 | import com.stratio.sparta.sdk.lite.batch.models._ 9 | import com.stratio.sparta.sdk.lite.xd.batch._ 10 | import org.apache.spark.sql.crossdata.XDSession 11 | import org.apache.spark.sql.functions.lit 12 | import org.apache.spark.sql.types.StructType 13 | 14 | class AddColumnXDLiteTransformStepBatch( 15 | xdSession: XDSession, 16 | properties: Map[String, String] 17 | ) extends LiteCustomXDBatchTransform(xdSession, properties) { 18 | 19 | override def transform(inputData: Map[String, ResultBatchData]): OutputBatchTransformData = { 20 | // Get input data and schema 21 | val inputStream = inputData.head._2.data 22 | val inputSchema = inputData.head._2.schema.getOrElse(new StructType()) 23 | 24 | // Convert to DataFrame and make modifications 25 | val df = xdSession.createDataFrame(inputStream, inputSchema) 26 | val dfWithColumn = df.withColumn("newCol", lit(2)) 27 | 28 | // Return the transformed data 29 | OutputBatchTransformData(dfWithColumn.rdd, Option(dfWithColumn.schema)) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-2.3.0-SDK/transformation-lite-xd/src/main/scala/com/stratio/sparta/transformations/repartition/RepartitionXDLiteTransformStepBatch.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | package com.stratio.sparta.transformations.repartition 7 | 8 | import java.io.{Serializable => JSerializable} 9 | 10 | import com.stratio.sparta.sdk.lite.batch.models._ 11 | import com.stratio.sparta.sdk.lite.xd.batch._ 12 | import org.apache.spark.sql.crossdata.XDSession 13 | 14 | class RepartitionXDLiteTransformStepBatch( 15 | xdSession: XDSession, 16 | properties: Map[String, String] 17 | ) extends LiteCustomXDBatchTransform(xdSession, properties) { 18 | 19 | override def transform(inputData: Map[String, ResultBatchData]): OutputBatchTransformData = { 20 | val inputStream = inputData.head._2.data 21 | 22 | OutputBatchTransformData(inputStream.repartition(5)) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-2.3.0-SDK/transformation-lite-xd/src/main/scala/com/stratio/sparta/transformations/repartition/RepartitionXDLiteTransformStepStreaming.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | package com.stratio.sparta.transformations.repartition 7 | 8 | import com.stratio.sparta.sdk.lite.streaming.models._ 9 | import com.stratio.sparta.sdk.lite.xd.streaming._ 10 | import org.apache.spark.sql.crossdata.XDSession 11 | import org.apache.spark.streaming.StreamingContext 12 | 13 | class RepartitionXDLiteTransformStepStreaming( 14 | xdSession: XDSession, 15 | streamingContext: StreamingContext, 16 | properties: Map[String, String] 17 | ) extends LiteCustomXDStreamingTransform(xdSession, streamingContext, properties) { 18 | 19 | override def transform(inputData: Map[String, ResultStreamingData]): OutputStreamingTransformData = { 20 | val newStream = inputData.head._2.data.transform { rdd => 21 | rdd.repartition(5) 22 | } 23 | 24 | OutputStreamingTransformData(newStream) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-2.3.0-SDK/transformation-lite-xd/src/test/scala/com/stratio/sparta/transformations/tokenizer/TokenizerTransformStepBatchTest.scala: -------------------------------------------------------------------------------- 1 | package com.stratio.sparta.transformations.tokenizer 2 | 3 | import com.stratio.sparta.sdk.lite.batch.models.ResultBatchData 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.sql.Row 6 | import org.apache.spark.sql.crossdata.XDSession 7 | import org.apache.spark.sql.types.{StringType, StructField, StructType} 8 | import org.apache.spark.{SparkConf, SparkContext} 9 | import org.junit.runner.RunWith 10 | import org.scalatest.junit.JUnitRunner 11 | import org.scalatest.{BeforeAndAfterAll, FlatSpec, Matchers} 12 | 13 | @RunWith(classOf[JUnitRunner]) 14 | class TokenizerTransformStepBatchTest extends FlatSpec with Matchers with BeforeAndAfterAll { 15 | val sc = SparkContext.getOrCreate(new SparkConf().setAppName("test").setMaster("local[1]")) 16 | val xdSession: XDSession = XDSession.builder().config(sc.getConf).create("dummyUser") 17 | 
18 | val names = "jose,perez" 19 | val inputField = "raw" 20 | val inputSchema = StructType(Seq(StructField(inputField, StringType))) 21 | val dataIn: RDD[Row] = sc.parallelize(Seq(Row(names))) 22 | 23 | val properties = Map( 24 | "charPattern" -> ",", 25 | "inputField" -> "raw", 26 | "outputField1" -> "firstName", 27 | "outputField2" -> "lastName" 28 | ) 29 | 30 | val inBatch = ResultBatchData(dataIn, Option(inputSchema)) 31 | val tokenizer = new TokenizerTransformStepBatch(xdSession, properties) 32 | 33 | "TokenizerTransformStepBatch" should "tokenize the data in and return two values" in { 34 | val result = tokenizer.transform(Map("custom-transform" -> inBatch)).data.first().toSeq 35 | 36 | result.size shouldBe 2 37 | } 38 | } -------------------------------------------------------------------------------- /rocket-extensions/rocket-2.3.0-SDK/udf/src/main/scala/com/stratio/sparta/SpartaExampleUDFs.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | 7 | package com.stratio.sparta 8 | 9 | import com.stratio.sparta.sdk.lite.common.SpartaUDF 10 | import org.apache.spark.sql.expressions.UserDefinedFunction 11 | import org.apache.spark.sql.functions.udf 12 | 13 | case class ToUpperCaseUDF() extends SpartaUDF { 14 | 15 | val name = "uppercaseSparta" 16 | 17 | val upper: String => String = _.toUpperCase 18 | 19 | val userDefinedFunction: UserDefinedFunction = udf(upper) 20 | 21 | } 22 | 23 | case class ConcatUDF() extends SpartaUDF { 24 | 25 | val name = "concatSparta" 26 | 27 | val concat: (String, String) => String = { case (str1, str2) => 28 | s"$str1/$str2" 29 | } 30 | 31 | val userDefinedFunction: UserDefinedFunction = udf(concat) 32 | } -------------------------------------------------------------------------------- /rocket-extensions/rocket-2.3.0-SDK/udf/src/main/scala/org/apache/spark/sql/sparta/udf/GetDenseVectorUDF.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | 7 | package org.apache.spark.sql.sparta.udf 8 | 9 | import com.stratio.sparta.sdk.lite.common.SpartaUDF 10 | import org.apache.spark.sql.expressions.UserDefinedFunction 11 | import org.apache.spark.ml.linalg.VectorUDT 12 | import org.apache.spark.sql.functions.udf 13 | 14 | case object VectorUDT extends VectorUDT 15 | 16 | case class GetDenseVectorUDF() extends SpartaUDF { 17 | 18 | val name = "get_vector_ith_element" 19 | 20 | val getVectorElement = (vector: org.apache.spark.ml.linalg.Vector, num: Int) => vector(num) 21 | 22 | val userDefinedFunction: UserDefinedFunction = udf(getVectorElement) 23 | } 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/Readme.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Inputs classes 4 | 5 | ### Batch 6 | 7 | com.stratio.sparta.GeneratorXDLiteInputStepBatch 8 | 9 | com.stratio.sparta.MetadataTestXDLiteInputStepBatch 10 | 11 | com.stratio.sparta.ReportLogTestXDLiteInputStepBatch 12 | 13 | ### Streaming 14 | 15 | com.stratio.sparta.GeneratorXDLiteInputStepStreaming 16 | 17 | com.stratio.sparta.ReportLogTestXDLiteInputStepStreaming 18 | 19 | ### Hybrid 20 | 21 | com.stratio.sparta.GeneratorXDLiteInputStepHybrid 22 | 23 | com.stratio.sparta.StreamGeneratorXDLiteInputStepHybrid 24 | 25 | 26 | ## Transformer classes 27 | 28 | ### Batch 29 | 30 | com.stratio.sparta.transformations.column.AddColumnXDLiteTransformStepBatch 31 | 32 | com.stratio.sparta.transformations.repartition.RepartitionXDLiteTransformStepBatch 33 | 34 | com.stratio.sparta.transformations.tokenizer.TokenizerTransformStepBatch 35 | 36 | ### Streaming 37 | 38 | com.stratio.sparta.transformations.repartition.RepartitionXDLiteTransformStepStreaming 39 | 40 | com.stratio.sparta.transformations.tokenizer.TokenizerTransformStepStreaming 41 | 42 | ### Hybrid 43 | 44 | 
com.stratio.sparta.transformations.column.AddColumnXDLiteTransformStepHybrid 45 | 46 | com.stratio.sparta.transformations.repartition.RepartitionXDLiteTransformStepHybrid 47 | 48 | 49 | ## Output classes 50 | 51 | com.stratio.sparta.LoggerXDLiteOutputStep 52 | 53 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/input-lite-xd/src/main/scala/com/stratio/sparta/GeneratorXDLiteInputStepBatch.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | package com.stratio.sparta 7 | 8 | import com.stratio.sparta.sdk.lite.batch.models._ 9 | import com.stratio.sparta.sdk.lite.validation.ValidationResult 10 | import com.stratio.sparta.sdk.lite.xd.batch._ 11 | import org.apache.spark.sql._ 12 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 13 | import org.apache.spark.sql.crossdata.XDSession 14 | import org.apache.spark.sql.types._ 15 | 16 | class GeneratorXDLiteInputStepBatch( 17 | xdSession: XDSession, 18 | properties: Map[String, String] 19 | ) 20 | extends LiteCustomXDBatchInput(xdSession, properties) { 21 | 22 | lazy val stringSchema = StructType(Seq(StructField("raw", StringType))) 23 | lazy val rawData: Option[String] = properties.get("raw").map(_.toString) 24 | 25 | 26 | override def validate(): ValidationResult = { 27 | var validation = ValidationResult(valid = true, messages = Seq.empty, warnings = Seq.empty) 28 | 29 | if (rawData.isEmpty) { 30 | validation = ValidationResult( 31 | valid = false, 32 | warnings = validation.warnings :+ "Test data must be set inside the Option properties with an option key named 'raw'") /* FIX: accumulate into 'warnings' (was 'validation.messages', which is always empty here and dropped previously gathered warnings) */ 33 | } 34 | 35 | if (rawData.exists(_.trim.isEmpty)) { /* FIX: was 'rawData.map(_.trim).forall(_.isEmpty)' - forall is vacuously true on None, so a missing 'raw' key also emitted this misleading blank-string warning and overwrote the previous one */ 36 | validation = ValidationResult( 37 | valid = false, 38 | warnings = validation.warnings :+ "Generated data cannot be an empty string") 39 | } 40 | validation 41 | } 42 | 43 | override def init(): ResultBatchData = { 44 | val register = Seq(new GenericRowWithSchema(Array(rawData.getOrElse("test-data")), stringSchema).asInstanceOf[Row]) 45 | val defaultRDD = xdSession.sparkContext.parallelize(register) 46 | 47 | // · Reporting messages 48 | reportInfoLog(phase="init", s"Generated data: $register") 49 | 50 | ResultBatchData(defaultRDD, Option(stringSchema)) 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/input-lite-xd/src/main/scala/com/stratio/sparta/GeneratorXDLiteInputStepHybrid.scala:
-------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta 7 | 8 | import com.stratio.sparta.sdk.lite.batch.models._ 9 | import com.stratio.sparta.sdk.lite.hybrid.models.ResultHybridData 10 | import com.stratio.sparta.sdk.lite.validation.ValidationResult 11 | import com.stratio.sparta.sdk.lite.xd.hybrid._ 12 | import org.apache.spark.sql._ 13 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 14 | import org.apache.spark.sql.crossdata.XDSession 15 | import org.apache.spark.sql.types._ 16 | 17 | class GeneratorXDLiteInputStepHybrid( 18 | xdSession: XDSession, 19 | properties: Map[String, String] 20 | ) 21 | extends LiteCustomXDHybridInput(xdSession, properties) { 22 | 23 | lazy val stringSchema = StructType(Seq(StructField("raw", StringType))) 24 | lazy val rawData: Option[String] = properties.get("raw").map(_.toString) 25 | 26 | 27 | override def validate(): ValidationResult = { 28 | var validation = ValidationResult(valid = true, messages = Seq.empty, warnings = Seq.empty) 29 | 30 | if (rawData.isEmpty) { 31 | validation = ValidationResult( 32 | valid = false, 33 | warnings = validation.messages :+ "Test data must be set inside the Option properties with an option key named 'raw'") 34 | } 35 | 36 | if (rawData.map(_.trim).forall(_.isEmpty)) { 37 | validation = ValidationResult( 38 | valid = false, 39 | warnings = validation.messages :+ "Generated data cannot be an empty string") 40 | } 41 | 
validation 42 | } 43 | 44 | override def init(): ResultHybridData = { 45 | val register = Seq(new GenericRowWithSchema(Array(rawData.getOrElse("test-data")), stringSchema).asInstanceOf[Row]) 46 | val defaultRDD = xdSession.sparkContext.parallelize(register) 47 | val dataFrame = xdSession.createDataFrame(defaultRDD, stringSchema) 48 | 49 | // · Reporting messages 50 | reportInfoLog(phase="init", s"Generated data: $register") 51 | 52 | ResultHybridData(dataFrame) 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/input-lite-xd/src/main/scala/com/stratio/sparta/ReportLogTestXDLiteInputStepBatch.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | package com.stratio.sparta 7 | 8 | import com.stratio.sparta.sdk.lite.batch.models._ 9 | import com.stratio.sparta.sdk.lite.validation.ValidationResult 10 | import com.stratio.sparta.sdk.lite.xd.batch._ 11 | import org.apache.spark.sql._ 12 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 13 | import org.apache.spark.sql.crossdata.XDSession 14 | import org.apache.spark.sql.types._ 15 | 16 | class ReportLogTestXDLiteInputStepBatch( 17 | xdSession: XDSession, 18 | properties: Map[String, String] 19 | ) 20 | extends LiteCustomXDBatchInput(xdSession, properties) { 21 | 22 | lazy val stringSchema = StructType(Seq(StructField("raw", StringType))) 23 | lazy val rawData: Option[String] = properties.get("raw").map(_.toString) 24 | 25 | 26 | override def validate(): ValidationResult = { 27 | var validation = ValidationResult(valid = true, messages = Seq.empty, warnings = Seq.empty) 28 | 29 | if (rawData.isEmpty) { 30 | validation = ValidationResult( 31 | valid = false, 32 | warnings = validation.warnings :+ "Test data must be set inside the Option properties with an option key named 'raw'") /* FIX: accumulate into 'warnings' (was 'validation.messages', which is always empty here and dropped previously gathered warnings) */ 33 | } 34 | 35 | if (rawData.exists(_.trim.isEmpty)) { /* FIX: was 'rawData.map(_.trim).forall(_.isEmpty)' - forall is vacuously true on None, so a missing 'raw' key also emitted this misleading blank-string warning and overwrote the previous one */ 36 | validation = ValidationResult( 37 | valid = false, 38 | warnings = validation.warnings :+ "Generated data cannot be an empty string") 39 | } 40 | validation 41 | } 42 | 43 | override def init(): ResultBatchData = { 44 | val register = Seq(new GenericRowWithSchema(Array(rawData.getOrElse("test-data")), stringSchema).asInstanceOf[Row]) 45 | // · Reporting messages 46 | reportInfoLog(phase="init", s"Generated data: $register") 47 | reportWarnLog(phase="init", s"Generated data: $register") 48 | reportErrorLog(phase="init", s"Generated data: $register") 49 | 50 | val defaultRDD = xdSession.sparkContext.parallelize(register) 51 | 52 | ResultBatchData(defaultRDD, Option(stringSchema)) 53 | } 54 | } 55 | --------------------------------------------------------------------------------
/rocket-extensions/rocket-3.0.0-SDK/input-lite-xd/src/main/scala/com/stratio/sparta/ReportLogTestXDLiteInputStepStreaming.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta 7 | 8 | import com.stratio.sparta.sdk.lite.streaming.models._ 9 | import com.stratio.sparta.sdk.lite.validation.ValidationResult 10 | import com.stratio.sparta.sdk.lite.xd.streaming._ 11 | import org.apache.spark.rdd.RDD 12 | import org.apache.spark.sql._ 13 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 14 | import org.apache.spark.sql.crossdata.XDSession 15 | import org.apache.spark.sql.types._ 16 | import org.apache.spark.streaming.StreamingContext 17 | 18 | import scala.collection.mutable 19 | 20 | class ReportLogTestXDLiteInputStepStreaming( 21 | xdSession: XDSession, 22 | streamingContext: StreamingContext, 23 | properties: Map[String, String] 24 | ) 25 | extends LiteCustomXDStreamingInput(xdSession, streamingContext, properties) { 26 | 27 | lazy val stringSchema = StructType(Seq(StructField("raw", StringType))) 28 | lazy val rawData: Option[String] = properties.get("raw").map(_.toString) 29 | 30 | override def validate(): ValidationResult = { 31 | var validation = ValidationResult(valid = true, messages = Seq.empty, warnings = Seq.empty) 32 | 33 | if (rawData.isEmpty) { 34 | validation = ValidationResult( 35 | valid = false, 36 | warnings = validation.messages :+ "Test data must be set inside the Option 
properties with an option key named 'raw'") 37 | } 38 | 39 | if (rawData.map(_.trim).forall(_.isEmpty)) { 40 | validation = ValidationResult( 41 | valid = false, 42 | warnings = validation.messages :+ "Generated data cannot be an empty string") 43 | } 44 | validation 45 | } 46 | 47 | override def init(): ResultStreamingData = { 48 | val dataQueue = new mutable.Queue[RDD[Row]]() 49 | val register = Seq(new GenericRowWithSchema(Array(rawData.getOrElse("test-data")), stringSchema).asInstanceOf[Row]) 50 | dataQueue += xdSession.sparkContext.parallelize(register) 51 | val stream = streamingContext.queueStream(dataQueue) 52 | 53 | // · Reporting messages 54 | reportInfoLog(phase="init", s"Generated data: $register") 55 | reportWarnLog(phase="init", s"Generated data: $register") 56 | reportErrorLog(phase="init", s"Generated data: $register") 57 | 58 | ResultStreamingData(stream, Option(stringSchema)) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/input-lite-xd/src/main/scala/com/stratio/sparta/StreamGeneratorXDLiteInputStepHybrid.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | package com.stratio.sparta 7 | 8 | import com.stratio.sparta.sdk.lite.hybrid.models.ResultHybridData 9 | import com.stratio.sparta.sdk.lite.validation.ValidationResult 10 | import com.stratio.sparta.sdk.lite.xd.hybrid._ 11 | import org.apache.spark.sql._ 12 | import org.apache.spark.sql.catalyst.encoders.RowEncoder 13 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 14 | import org.apache.spark.sql.crossdata.XDSession 15 | import org.apache.spark.sql.types._ 16 | 17 | class StreamGeneratorXDLiteInputStepHybrid( 18 | xdSession: XDSession, 19 | properties: Map[String, String] 20 | ) 21 | extends LiteCustomXDHybridInput(xdSession, properties) { 22 | 23 | lazy val rowsPerSecond: Option[String] = properties.get("rowsPerSecond").map(_.toString) 24 | 25 | override def init(): ResultHybridData = { 26 | val dataFrame: Dataset[Row] = xdSession.readStream /* FIX: was 'xDSession', an undefined identifier (the constructor parameter is 'xdSession', used everywhere else in this class) - the original does not compile */ 27 | .format("rate") 28 | .option("rowsPerSecond", rowsPerSecond.getOrElse("1")) 29 | .load() 30 | 31 | ResultHybridData(dataFrame) 32 | } 33 | 34 | // This method is used in order to provide an equivalent Batch Dataframe for debugging purposes 35 | def debugInit(): Option[DataFrame] = { 36 | import xdSession.implicits._ 37 | 38 | Option(Seq( 39 | (8, "Lazarillo de Tormes"), 40 | (64, "Codex Seraphinianus"), 41 | (27, "Divina Commedia") 42 | ).toDF("price", "book")) 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/output-lite-xd/src/main/scala/com/stratio/sparta/JdbcWithLineageXDLiteOutputStep.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved.
3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta 7 | 8 | import com.stratio.sparta.sdk.lite.common.models.OutputOptions 9 | import com.stratio.sparta.sdk.lite.xd.common._ 10 | import org.apache.spark.sql._ 11 | import org.apache.spark.sql.crossdata.XDSession 12 | 13 | import java.util.Properties 14 | import scala.util.{Failure, Success, Try} 15 | 16 | class JdbcWithLineageXDLiteOutputStep( 17 | xdSession: XDSession, 18 | properties: Map[String, String] 19 | ) 20 | extends LiteCustomXDOutput(xdSession, properties) { 21 | 22 | lazy val url = properties.getOrElse("url", throw new NoSuchElementException("The url property is mandatory")) 23 | 24 | 25 | override def save(data: DataFrame, outputOptions: OutputOptions): Unit = { 26 | val tableName = outputOptions.tableName.getOrElse{ 27 | logger.error("Table name not defined") 28 | throw new NoSuchElementException("tableName not found in options") 29 | } 30 | 31 | val jdbcProperties = new Properties() 32 | 33 | properties 34 | .filterKeys(key => key.startsWith("jdbc_") || key.equals("driver")) 35 | .foreach{ case (key, value) => jdbcProperties.put(key.replaceAll("jdbc_", ""), value) } 36 | 37 | logger.error(s"Connecting with table $tableName") 38 | logger.error(s"Connecting with properties $jdbcProperties") 39 | 40 | data.write 41 | .mode(SaveMode.Append) 42 | .jdbc(url = url, table = tableName, connectionProperties = jdbcProperties) 43 | } 44 | 45 | override def save(data: DataFrame, saveMode: String, saveOptions: Map[String, String]): Unit = () 46 | } 47 | 
-------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/output-lite-xd/src/main/scala/com/stratio/sparta/LoggerXDLiteOutputStep.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta 7 | 8 | import com.stratio.sparta.sdk.lite.common.models.OutputOptions 9 | import com.stratio.sparta.sdk.lite.xd.common._ 10 | import org.apache.spark.sql._ 11 | import org.apache.spark.sql.crossdata.XDSession 12 | 13 | import scala.util.{Failure, Success, Try} 14 | 15 | class LoggerXDLiteOutputStep( 16 | xdSession: XDSession, 17 | properties: Map[String, String] 18 | ) 19 | extends LiteCustomXDOutput(xdSession, properties) { 20 | 21 | lazy val metadataEnabled = properties.get("metadataEnabled") match { 22 | case Some(value: String) => Try(value.toBoolean) match { 23 | case Success(v) => v 24 | case Failure(ex) => throw new IllegalStateException(s"$value not parseable to boolean", ex) 25 | } 26 | case None => throw new IllegalStateException("'metadataEnabled' key must be defined in the Option properties") 27 | } 28 | 29 | override def save(data: DataFrame, outputOptions: OutputOptions): Unit = { 30 | val tableName = outputOptions.tableName.getOrElse{ 31 | logger.error("Table name not defined") 32 | throw new NoSuchElementException("tableName not found in options")} 33 | 34 | if (metadataEnabled){ 35 | val info1 = s"Table name: $tableName" 36 | logger.info(info1) 37 | 
// · Reporting messages 38 | reportInfoLog(phase="init", msg = info1) 39 | 40 | val info2 = s"Save mode is set to ${outputOptions.saveMode}" 41 | logger.info(info2) 42 | // · Reporting messages 43 | reportInfoLog(phase="init", msg = info2) 44 | } 45 | data.foreach{ row => 46 | println(row.mkString(",")) 47 | } 48 | } 49 | 50 | override def save(data: DataFrame, saveMode: String, saveOptions: Map[String, String]): Unit = () 51 | } 52 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/packaged-jars/custom-lite-input-xd-3.0.0-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-extensions/rocket-3.0.0-SDK/packaged-jars/custom-lite-input-xd-3.0.0-SNAPSHOT.jar -------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/packaged-jars/custom-lite-output-xd-3.0.0-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-extensions/rocket-3.0.0-SDK/packaged-jars/custom-lite-output-xd-3.0.0-SNAPSHOT.jar -------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/packaged-jars/custom-lite-transformation-xd-3.0.0-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-extensions/rocket-3.0.0-SDK/packaged-jars/custom-lite-transformation-xd-3.0.0-SNAPSHOT.jar -------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/transformation-lite-xd/src/main/scala/com/stratio/sparta/properties/ValidatePropertiesMap.scala: 
-------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta.properties 7 | 8 | class ValidatePropertiesMap [K, V](val m: Map[K, V]){ 9 | def getString(key: K): String = 10 | m.get(key) match { 11 | case Some(value: String) => value 12 | case Some(value) => value.toString 13 | case None => 14 | throw new IllegalStateException(s"$key is mandatory") 15 | } 16 | 17 | def notBlank(option: Option[String]): Boolean = // true iff a value is present and non-blank after trimming 18 | option.exists(_.trim.nonEmpty) // fix: forall(_.isEmpty) returned true for None/blank values — the inverse of "notBlank"; now consistent with NotBlankOption.notBlank below 19 | } 20 | 21 | class NotBlankOption(s: Option[String]) { 22 | def notBlank: Option[String] = s.map(_.trim).filterNot(_.isEmpty) 23 | } 24 | 25 | object ValidatingPropertyMap{ 26 | implicit def map2ValidatingPropertyMap[K, V](m: Map[K, V]): ValidatePropertiesMap[K, V] = 27 | new ValidatePropertiesMap[K, V](m) 28 | 29 | implicit def option2NotBlankOption(s: Option[String]): NotBlankOption = new NotBlankOption(s) 30 | } -------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/transformation-lite-xd/src/main/scala/com/stratio/sparta/transformations/column/AddColumnXDLiteTransformStepBatch.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 
3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta.transformations.column 7 | 8 | import com.stratio.sparta.sdk.lite.batch.models._ 9 | import com.stratio.sparta.sdk.lite.xd.batch._ 10 | import org.apache.spark.sql.crossdata.XDSession 11 | import org.apache.spark.sql.functions.lit 12 | import org.apache.spark.sql.types.StructType 13 | 14 | class AddColumnXDLiteTransformStepBatch( 15 | xdSession: XDSession, 16 | properties: Map[String, String] 17 | ) extends LiteCustomXDBatchTransform(xdSession, properties) { 18 | 19 | override def transform(inputData: Map[String, ResultBatchData]): OutputBatchTransformData = { 20 | // Get input data and schema 21 | val inputStream = inputData.head._2.data 22 | val inputSchema = inputData.head._2.schema.getOrElse(new StructType()) 23 | 24 | // Convert to DataFrame and make modifications 25 | val df = xdSession.createDataFrame(inputStream, inputSchema) 26 | val dfWithColumn = df.withColumn("newCol", lit(2)) 27 | 28 | // Return the transformed data 29 | OutputBatchTransformData(dfWithColumn.rdd, Option(dfWithColumn.schema)) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/transformation-lite-xd/src/main/scala/com/stratio/sparta/transformations/column/AddColumnXDLiteTransformStepHybrid.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 
3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta.transformations.column 7 | 8 | import com.stratio.sparta.sdk.lite.hybrid.models.{OutputHybridTransformData, ResultHybridData} 9 | import com.stratio.sparta.sdk.lite.xd.hybrid.LiteCustomXDHybridTransform 10 | import org.apache.spark.sql.crossdata.XDSession 11 | import org.apache.spark.sql.functions.lit 12 | 13 | class AddColumnXDLiteTransformStepHybrid( 14 | xdSession: XDSession, 15 | properties: Map[String, String] 16 | ) extends LiteCustomXDHybridTransform(xdSession, properties) { 17 | 18 | override def transform(inputData: Map[String, ResultHybridData]): OutputHybridTransformData = { 19 | // Get input data and schema 20 | val inputStream = inputData.head._2.data 21 | 22 | val dfWithColumn = inputStream.withColumn("newCol", lit(2)) 23 | 24 | // Return the transformed data 25 | OutputHybridTransformData(dfWithColumn) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/transformation-lite-xd/src/main/scala/com/stratio/sparta/transformations/repartition/RepartitionXDLiteTransformStepBatch.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 
3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta.transformations.repartition 7 | 8 | import java.io.{Serializable => JSerializable} 9 | 10 | import com.stratio.sparta.sdk.lite.batch.models._ 11 | import com.stratio.sparta.sdk.lite.xd.batch._ 12 | import org.apache.spark.sql.crossdata.XDSession 13 | 14 | class RepartitionXDLiteTransformStepBatch( 15 | xdSession: XDSession, 16 | properties: Map[String, String] 17 | ) extends LiteCustomXDBatchTransform(xdSession, properties) { 18 | 19 | override def transform(inputData: Map[String, ResultBatchData]): OutputBatchTransformData = { 20 | val inputStream = inputData.head._2.data 21 | 22 | OutputBatchTransformData(inputStream.repartition(properties.get("partitions").map(_.trim.toInt).getOrElse(5))) // generalized: partition count is now configurable via the optional "partitions" property, defaulting to the former hard-coded 5 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/transformation-lite-xd/src/main/scala/com/stratio/sparta/transformations/repartition/RepartitionXDLiteTransformStepHybrid.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | package com.stratio.sparta.transformations.repartition 7 | 8 | import com.stratio.sparta.sdk.lite.hybrid.models.{OutputHybridTransformData, ResultHybridData} 9 | import com.stratio.sparta.sdk.lite.xd.hybrid.LiteCustomXDHybridTransform 10 | import org.apache.spark.sql.crossdata.XDSession 11 | 12 | class RepartitionXDLiteTransformStepHybrid( 13 | xdSession: XDSession, 14 | properties: Map[String, String] 15 | ) extends LiteCustomXDHybridTransform(xdSession, properties) { 16 | 17 | override def transform(inputData: Map[String, ResultHybridData]): OutputHybridTransformData = { 18 | val inputStream = inputData.head._2.data 19 | 20 | OutputHybridTransformData(inputStream.repartition(5)) 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/transformation-lite-xd/src/main/scala/com/stratio/sparta/transformations/repartition/RepartitionXDLiteTransformStepStreaming.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | package com.stratio.sparta.transformations.repartition 7 | 8 | import com.stratio.sparta.sdk.lite.streaming.models._ 9 | import com.stratio.sparta.sdk.lite.xd.streaming._ 10 | import org.apache.spark.sql.crossdata.XDSession 11 | import org.apache.spark.streaming.StreamingContext 12 | 13 | class RepartitionXDLiteTransformStepStreaming( 14 | xdSession: XDSession, 15 | streamingContext: StreamingContext, 16 | properties: Map[String, String] 17 | ) extends LiteCustomXDStreamingTransform(xdSession, streamingContext, properties) { 18 | 19 | override def transform(inputData: Map[String, ResultStreamingData]): OutputStreamingTransformData = { 20 | val newStream = inputData.head._2.data.transform { rdd => 21 | rdd.repartition(5) 22 | } 23 | 24 | OutputStreamingTransformData(newStream) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/transformation-lite-xd/src/test/scala/com/stratio/sparta/transformations/tokenizer/TokenizerTransformStepBatchTest.scala: -------------------------------------------------------------------------------- 1 | package com.stratio.sparta.transformations.tokenizer 2 | 3 | import com.stratio.sparta.sdk.lite.batch.models.ResultBatchData 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.sql.Row 6 | import org.apache.spark.sql.crossdata.XDSession 7 | import org.apache.spark.sql.types.{StringType, StructField, StructType} 8 | import org.apache.spark.{SparkConf, SparkContext} 9 | import org.junit.runner.RunWith 10 | import org.scalatest.junit.JUnitRunner 11 | import org.scalatest.{BeforeAndAfterAll, FlatSpec, Matchers} 12 | 13 | @RunWith(classOf[JUnitRunner]) 14 | class TokenizerTransformStepBatchTest extends FlatSpec with Matchers with BeforeAndAfterAll { 15 | val sc = SparkContext.getOrCreate(new SparkConf().setAppName("test").setMaster("local[1]")) 16 | val xdSession: XDSession = XDSession.builder().config(sc.getConf).create("dummyUser") 17 | 
18 | val names = "jose,perez" 19 | val inputField = "raw" 20 | val inputSchema = StructType(Seq(StructField(inputField, StringType))) 21 | val dataIn: RDD[Row] = sc.parallelize(Seq(Row(names))) 22 | 23 | val properties = Map( 24 | "charPattern" -> ",", 25 | "inputField" -> "raw", 26 | "outputField1" -> "firstName", 27 | "outputField2" -> "lastName" 28 | ) 29 | 30 | val inBatch = ResultBatchData(dataIn, Option(inputSchema)) 31 | val tokenizer = new TokenizerTransformStepBatch(xdSession, properties) 32 | 33 | "TokenizerTransformStepBatch" should "tokenize the data in and return two values" in { 34 | val result = tokenizer.transform(Map("custom-transform" -> inBatch)).data.first().toSeq 35 | 36 | result.size shouldBe 2 37 | } 38 | } -------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/udf/src/main/scala/com/stratio/sparta/SpartaExampleUDFs.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | 7 | package com.stratio.sparta 8 | 9 | import com.stratio.sparta.sdk.lite.common.SpartaUDF 10 | import org.apache.spark.sql.expressions.UserDefinedFunction 11 | import org.apache.spark.sql.functions.udf 12 | 13 | case class ToUpperCaseUDF() extends SpartaUDF { 14 | 15 | val name = "uppercaseSparta" 16 | 17 | val upper: String => String = _.toUpperCase 18 | 19 | val userDefinedFunction: UserDefinedFunction = udf(upper) 20 | 21 | } 22 | 23 | case class ConcatUDF() extends SpartaUDF { 24 | 25 | val name = "concatSparta" 26 | 27 | val concat: (String, String) => String = { case (str1, str2) => 28 | s"$str1/$str2" 29 | } 30 | 31 | val userDefinedFunction: UserDefinedFunction = udf(concat) 32 | } -------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/udf/src/main/scala/org/apache/spark/sql/sparta/udf/GetDenseVectorUDF.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | 7 | package org.apache.spark.sql.sparta.udf 8 | 9 | import com.stratio.sparta.sdk.lite.common.SpartaUDF 10 | import org.apache.spark.sql.expressions.UserDefinedFunction 11 | import org.apache.spark.ml.linalg.VectorUDT 12 | import org.apache.spark.sql.functions.udf 13 | 14 | case object VectorUDT extends VectorUDT 15 | 16 | case class GetDenseVectorUDF() extends SpartaUDF { 17 | 18 | val name = "get_vector_ith_element" 19 | 20 | val getVectorElement = (vector: org.apache.spark.ml.linalg.Vector, num: Int) => vector(num) 21 | 22 | val userDefinedFunction: UserDefinedFunction = udf(getVectorElement) 23 | } 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /rocket-python-extensions/conda-pack-extensions/conda.yaml: -------------------------------------------------------------------------------- 1 | name: conda-pack-env 2 | 3 | channels: 4 | - conda-forge 5 | - nodefaults 6 | 7 | dependencies: 8 | - python=3.9.7 -------------------------------------------------------------------------------- /rocket-python-extensions/conda-pack-extensions/do_conda_pack.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Current script directory 4 | DIR="$(dirname "$(readlink -f "$0")")" 5 | cd $DIR 6 | 7 | # Parsing input parameters 8 | while [[ $# -gt 0 ]]; do 9 | case "$1" in 10 | -n) 11 | env_name="$2" 12 | ;; 13 | *) 14 | printf "***************************\n" 15 | printf "* Error: Invalid argument.*\n" 16 | printf "***************************\n" 17 | exit 1 18 | esac 19 | shift 20 | shift 21 | done 22 | 23 | env_name=${env_name:-"my_env"} 24 | echo "Conda environment name: $env_name" 25 | 26 | # Conda yaml path 27 | conda_yaml_path="$DIR/conda.yaml" 28 | 29 | # Creating target directories 30 | target_dir="$DIR/target" 31 | target_conda_dir="$target_dir/$env_name" 32 | mkdir $target_dir 33 | 34 | # Creating conda env 35 | conda env create -f 
$conda_yaml_path -p $target_conda_dir 36 | 37 | # Packaging conda env 38 | packaged_conda_env="$target_dir/$env_name.tar.gz" 39 | 40 | conda pack -p $target_conda_dir -o $packaged_conda_env 41 | 42 | # Removing conda env 43 | conda env remove -p $target_conda_dir -------------------------------------------------------------------------------- /rocket-python-extensions/private-pypi-repository/Readme.md: -------------------------------------------------------------------------------- 1 | 2 | # Empaquetarlo en tar.gz 3 | 4 | python3 setup.py sdist 5 | 6 | # Usarlo en un workflow 7 | 8 | Conda.yaml 9 | 10 | ``` 11 | name: rocket-custom 12 | 13 | channels: 14 | - defaults 15 | 16 | dependencies: 17 | - python=3.7.6 18 | - pip=20.2.2 19 | - pip: 20 | - mlflow==1.15.0 21 | - pyspark==3.1.1 22 | - pyarrow==5.0.0 23 | - scikit-learn==0.22.1 24 | - rocket-python-examples==0.1.0 25 | ``` 26 | 27 | En un step de PySpark: 28 | 29 | ``` 30 | from pyspark.sql import * 31 | from pyspark.sql.functions import * 32 | from pyspark.sql.types import * 33 | 34 | def pyspark_transform(spark, df, param_dict): 35 | 36 | from rocket_python_examples.test import dummy_func 37 | 38 | convertUDF = udf(lambda z: dummy_func(z)) 39 | 40 | return df.withColumn("driverPython", lit(dummy_func("python"))).withColumn("executorPython", convertUDF(df["class"])) 41 | ``` 42 | 43 | -------------------------------------------------------------------------------- /rocket-python-extensions/private-pypi-repository/dist/rocket_python_examples-0.1.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/private-pypi-repository/dist/rocket_python_examples-0.1.0.tar.gz -------------------------------------------------------------------------------- /rocket-python-extensions/private-pypi-repository/rocket_python_examples/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/private-pypi-repository/rocket_python_examples/__init__.py -------------------------------------------------------------------------------- /rocket-python-extensions/private-pypi-repository/rocket_python_examples/test.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | def dummy_func(x): 4 | return "Dummy code - {}".format(str(x)) 5 | -------------------------------------------------------------------------------- /rocket-python-extensions/private-pypi-repository/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.6 2 | # 3 | # © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 4 | # 5 | # This software – including all its source code – contains proprietary 6 | # information of Stratio Big Data Inc., Sucursal en España and 7 | # may not be revealed, sold, transferred, modified, distributed or 8 | # otherwise made available, licensed or sublicensed to third parties; 9 | # nor reverse engineered, disassembled or decompiled, without express 10 | # written authorization from Stratio Big Data Inc., Sucursal en España. 
11 | # 12 | 13 | import os 14 | 15 | from setuptools import setup 16 | 17 | pjoin = os.path.join 18 | 19 | here = os.path.abspath(os.path.dirname(__file__)) 20 | 21 | packages = [] 22 | for d, _, _ in os.walk('rocket_python_examples'): 23 | if os.path.exists(pjoin(d, '__init__.py')): 24 | packages.append(d.replace(os.path.sep, '.')) 25 | 26 | 27 | def setup_package(): 28 | metadata = dict( 29 | name='rocket_python_examples', 30 | packages=packages, 31 | description="""Rocket python examples""", 32 | long_description="Stratio Rocket - python examples", 33 | author="Stratio Rocket", 34 | platforms="Linux", 35 | install_requires=[], 36 | version="0.1.0", 37 | keywords=['Rocket', 'Python'], 38 | classifiers=['Programming Language :: Python :: 3.7'], 39 | ) 40 | 41 | setup(**metadata) 42 | 43 | 44 | if __name__ == '__main__': 45 | setup_package() 46 | -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/same_module_test/make_packages.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Current script directory 4 | DIR="$(dirname "$(readlink -f "$0")")" 5 | cd $DIR 6 | 7 | ./module_build_1/do_package.sh 8 | ./module_build_2/do_package.sh -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/same_module_test/module_build_1/do_package.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Current script directory 4 | DIR="$(dirname "$(readlink -f "$0")")" 5 | cd $DIR 6 | 7 | # Packaging - Zipping module 8 | zip -r user1_module.zip my_module 9 | 10 | # Move packaged artifact to parent folder 11 | mv $DIR/user1_module.zip $DIR/../. 
-------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/same_module_test/module_build_1/my_module/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/same_module_test/module_build_1/my_module/__init__.py -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/same_module_test/module_build_1/my_module/user.py: -------------------------------------------------------------------------------- 1 | 2 | def user_func(x): 3 | return f"Hello '{x}' I'm user1" 4 | -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/same_module_test/module_build_2/do_package.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Current script directory 4 | DIR="$(dirname "$(readlink -f "$0")")" 5 | cd $DIR 6 | 7 | # Packaging - Zipping module 8 | zip -r user2_module.zip my_module 9 | 10 | # Move packaged artifact to parent folder 11 | mv $DIR/user2_module.zip $DIR/../. 
-------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/same_module_test/module_build_2/my_module/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/same_module_test/module_build_2/my_module/__init__.py -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/same_module_test/module_build_2/my_module/user.py: -------------------------------------------------------------------------------- 1 | 2 | def user_func(x): 3 | return f"Hello '{x}' I'm user2" 4 | -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/same_module_test/user1_module.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/same_module_test/user1_module.zip -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/same_module_test/user2_module.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/same_module_test/user2_module.zip -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/make_packages.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Current script directory 4 | DIR="$(dirname "$(readlink -f "$0")")" 5 | cd $DIR 6 | 7 | ./test_pyfile_egg_from_hdfs/do_package.sh 8 | ./test_pyfile_egg_from_http/do_package.sh 9 | ./test_pyfile_zip_from_hdfs/do_package.sh 10 | ./test_pyfile_zip_from_http/do_package.sh -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_egg_from_hdfs/do_package.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Current script directory 4 | DIR="$(dirname "$(readlink -f "$0")")" 5 | cd $DIR 6 | 7 | # Packaging 8 | python3 setup.py sdist bdist_egg 9 | 10 | # Move packaged artifact to parent folder 11 | mv $DIR/dist/test_pyfile_egg_pkg_from_hdfs-0.1.0-py*.egg $DIR/../. 12 | 13 | # Remove folders created during packaging 14 | rm -rf build 15 | rm -rf dist 16 | rm -rf test_pyfile_egg_pkg_from_hdfs.egg-info -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_egg_from_hdfs/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='test_pyfile_egg_pkg_from_hdfs', 5 | version='0.1.0', 6 | description='A short description', 7 | long_description='A long description', 8 | author='Rocket', 9 | author_email='rocket@stratio.com', 10 | packages=find_packages(exclude=['*tests*']), 11 | ) 12 | -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_egg_from_hdfs/test_pyfile_egg_pkg_from_hdfs/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_egg_from_hdfs/test_pyfile_egg_pkg_from_hdfs/__init__.py -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_egg_from_hdfs/test_pyfile_egg_pkg_from_hdfs/test_pyfile_egg.py: -------------------------------------------------------------------------------- 1 | 2 | def func_test_pyfile_egg(x): 3 | return f"Hello '{x}' from egg pkg placed in Hdfs - V1" 4 | -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_egg_from_http/do_package.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Current script directory 4 | DIR="$(dirname "$(readlink -f "$0")")" 5 | cd $DIR 6 | 7 | # Packaging 8 | python3 setup.py sdist bdist_egg 9 | 10 | # Move packaged artifact to parent folder 11 | mv $DIR/dist/test_pyfile_egg_pkg_from_http-0.1.0-py*.egg $DIR/../. 
12 | 13 | # Remove folders created during packaging 14 | rm -rf build 15 | rm -rf dist 16 | rm -rf test_pyfile_egg_pkg_from_http.egg-info -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_egg_from_http/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='test_pyfile_egg_pkg_from_http', 5 | version='0.1.0', 6 | description='A short description', 7 | long_description='A long description', 8 | author='Rocket', 9 | author_email='rocket@stratio.com', 10 | packages=find_packages(exclude=['*tests*']), 11 | ) 12 | -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_egg_from_http/test_pyfile_egg_pkg_from_http/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_egg_from_http/test_pyfile_egg_pkg_from_http/__init__.py -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_egg_from_http/test_pyfile_egg_pkg_from_http/test_pyfile_egg.py: -------------------------------------------------------------------------------- 1 | 2 | def func_test_pyfile_egg(x): 3 | return f"Hello '{x}' from egg pkg placed in a Http server - V1" 4 | -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_egg_pkg_from_hdfs-0.1.0-py3.7.egg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_egg_pkg_from_hdfs-0.1.0-py3.7.egg -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_egg_pkg_from_http-0.1.0-py3.7.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_egg_pkg_from_http-0.1.0-py3.7.egg -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_from_hdfs.py: -------------------------------------------------------------------------------- 1 | 2 | def func_test_pyfile_script(x): 3 | return f"Hello '{x}' from python script placed in Hdfs - V1" 4 | -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_from_http.py: -------------------------------------------------------------------------------- 1 | 2 | def func_test_pyfile_script(x): 3 | return f"Hello '{x}' from python script placed in a Http server - V1" 4 | -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_zip_from_hdfs/do_package.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Current script directory 4 | DIR="$(dirname "$(readlink -f "$0")")" 5 | cd $DIR 6 | 7 | # Packaging - 
Zipping module 8 | zip -r test_pyfile_zip_pkg_from_hdfs.zip test_pyfile_zip_pkg_from_hdfs 9 | 10 | # Move packaged artifact to parent folder 11 | mv $DIR/test_pyfile_zip_pkg_from_hdfs.zip $DIR/../. -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_zip_from_hdfs/test_pyfile_zip_pkg_from_hdfs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_zip_from_hdfs/test_pyfile_zip_pkg_from_hdfs/__init__.py -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_zip_from_hdfs/test_pyfile_zip_pkg_from_hdfs/test_pyfile_zip.py: -------------------------------------------------------------------------------- 1 | 2 | def func_test_pyfile_zip(x): 3 | return f"Hello '{x}' from python zip pkg placed in Hdfs - V1" 4 | -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_zip_from_http/do_package.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Current script directory 4 | DIR="$(dirname "$(readlink -f "$0")")" 5 | cd $DIR 6 | 7 | # Packaging - Zipping module 8 | zip -r test_pyfile_zip_pkg_from_http.zip test_pyfile_zip_pkg_from_http 9 | 10 | # Move packaged artifact to parent folder 11 | mv $DIR/test_pyfile_zip_pkg_from_http.zip $DIR/../. 
-------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_zip_from_http/test_pyfile_zip_pkg_from_http/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_zip_from_http/test_pyfile_zip_pkg_from_http/__init__.py -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_zip_from_http/test_pyfile_zip_pkg_from_http/test_pyfile_zip.py: -------------------------------------------------------------------------------- 1 | 2 | def func_test_pyfile_zip(x): 3 | return f"Hello '{x}' from python zip pkg placed in a Http server - V1" 4 | -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_zip_pkg_from_hdfs.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_zip_pkg_from_hdfs.zip -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_zip_pkg_from_http.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_zip_pkg_from_http.zip 
-------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/make_packages.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Current script directory 4 | DIR="$(dirname "$(readlink -f "$0")")" 5 | cd $DIR 6 | 7 | ./test_pyfile_egg_from_hdfs/do_package.sh 8 | ./test_pyfile_egg_from_http/do_package.sh 9 | ./test_pyfile_zip_from_hdfs/do_package.sh 10 | ./test_pyfile_zip_from_http/do_package.sh -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_egg_from_hdfs/do_package.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Current script directory 4 | DIR="$(dirname "$(readlink -f "$0")")" 5 | cd $DIR 6 | 7 | # Packaging 8 | python3 setup.py sdist bdist_egg 9 | 10 | # Move packaged artifact to parent folder 11 | mv $DIR/dist/test_pyfile_egg_pkg_from_hdfs-0.1.0-py*.egg $DIR/../. 
12 | 13 | # Remove folders created during packaging 14 | rm -rf build 15 | rm -rf dist 16 | rm -rf test_pyfile_egg_pkg_from_hdfs.egg-info -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_egg_from_hdfs/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='test_pyfile_egg_pkg_from_hdfs', 5 | version='0.1.0', 6 | description='A short description', 7 | long_description='A long description', 8 | author='Rocket', 9 | author_email='rocket@stratio.com', 10 | packages=find_packages(exclude=['*tests*']), 11 | ) 12 | -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_egg_from_hdfs/test_pyfile_egg_pkg_from_hdfs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_egg_from_hdfs/test_pyfile_egg_pkg_from_hdfs/__init__.py -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_egg_from_hdfs/test_pyfile_egg_pkg_from_hdfs/test_pyfile_egg.py: -------------------------------------------------------------------------------- 1 | 2 | def func_test_pyfile_egg(x): 3 | return f"Hello '{x}' from egg pkg placed in Hdfs - V2" 4 | -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_egg_from_http/do_package.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Current script directory 4 | DIR="$(dirname "$(readlink -f "$0")")" 5 | cd $DIR 6 | 7 | # Packaging 8 | python3 setup.py sdist bdist_egg 9 | 10 | # Move packaged artifact to parent folder 11 | mv $DIR/dist/test_pyfile_egg_pkg_from_http-0.1.0-py*.egg $DIR/../. 12 | 13 | # Remove folders created during packaging 14 | rm -rf build 15 | rm -rf dist 16 | rm -rf test_pyfile_egg_pkg_from_http.egg-info -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_egg_from_http/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='test_pyfile_egg_pkg_from_http', 5 | version='0.1.0', 6 | description='A short description', 7 | long_description='A long description', 8 | author='Rocket', 9 | author_email='rocket@stratio.com', 10 | packages=find_packages(exclude=['*tests*']), 11 | ) 12 | -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_egg_from_http/test_pyfile_egg_pkg_from_http/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_egg_from_http/test_pyfile_egg_pkg_from_http/__init__.py -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_egg_from_http/test_pyfile_egg_pkg_from_http/test_pyfile_egg.py: -------------------------------------------------------------------------------- 1 | 
2 | def func_test_pyfile_egg(x): 3 | return f"Hello '{x}' from egg pkg placed in a Http server - V2" 4 | -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_egg_pkg_from_hdfs-0.1.0-py3.7.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_egg_pkg_from_hdfs-0.1.0-py3.7.egg -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_egg_pkg_from_http-0.1.0-py3.7.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_egg_pkg_from_http-0.1.0-py3.7.egg -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_from_hdfs.py: -------------------------------------------------------------------------------- 1 | 2 | def func_test_pyfile_script(x): 3 | return f"Hello '{x}' from python script placed in Hdfs - V2" 4 | -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_from_http.py: -------------------------------------------------------------------------------- 1 | 2 | def func_test_pyfile_script(x): 3 | return f"Hello '{x}' from python script placed in a Http server - V2" 4 | -------------------------------------------------------------------------------- 
/rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_zip_from_hdfs/do_package.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Current script directory 4 | DIR="$(dirname "$(readlink -f "$0")")" 5 | cd $DIR 6 | 7 | # Packaging - Zipping module 8 | zip -r test_pyfile_zip_pkg_from_hdfs.zip test_pyfile_zip_pkg_from_hdfs 9 | 10 | # Move packaged artifact to parent folder 11 | mv $DIR/test_pyfile_zip_pkg_from_hdfs.zip $DIR/../. -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_zip_from_hdfs/test_pyfile_zip_pkg_from_hdfs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_zip_from_hdfs/test_pyfile_zip_pkg_from_hdfs/__init__.py -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_zip_from_hdfs/test_pyfile_zip_pkg_from_hdfs/test_pyfile_zip.py: -------------------------------------------------------------------------------- 1 | 2 | def func_test_pyfile_zip(x): 3 | return f"Hello '{x}' from python zip pkg placed in Hdfs - V2" 4 | -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_zip_from_http/do_package.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Current script directory 4 | DIR="$(dirname "$(readlink -f "$0")")" 5 | cd $DIR 6 | 7 | # Packaging - Zipping module 8 | zip -r test_pyfile_zip_pkg_from_http.zip 
test_pyfile_zip_pkg_from_http 9 | 10 | # Move packaged artifact to parent folder 11 | mv $DIR/test_pyfile_zip_pkg_from_http.zip $DIR/../. -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_zip_from_http/test_pyfile_zip_pkg_from_http/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_zip_from_http/test_pyfile_zip_pkg_from_http/__init__.py -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_zip_from_http/test_pyfile_zip_pkg_from_http/test_pyfile_zip.py: -------------------------------------------------------------------------------- 1 | 2 | def func_test_pyfile_zip(x): 3 | return f"Hello '{x}' from python zip pkg placed in a Http server - V2" 4 | -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_zip_pkg_from_hdfs.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_zip_pkg_from_hdfs.zip -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_zip_pkg_from_http.zip: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_zip_pkg_from_http.zip -------------------------------------------------------------------------------- /rocket-spark-ml-custom-stages/rocket-pyspark-ml/create_pipeline_estimator.py: -------------------------------------------------------------------------------- 1 | from pyspark.ml import Pipeline 2 | from pyspark.sql import SparkSession 3 | 4 | from rocket_pyspark_ml.simple_custom_estimator import NormalDeviation 5 | 6 | spark = SparkSession.builder \ 7 | .master("local") \ 8 | .appName("test") \ 9 | .getOrCreate() 10 | 11 | df = spark.sparkContext.parallelize([(1, 2.0), (2, 3.0), (3, 0.0), (4, 99.0)]).toDF(["id", "x"]) 12 | 13 | normal_deviation = NormalDeviation().setInputCol("x").setCenteredThreshold(1.0) 14 | 15 | pipeline = Pipeline(stages=[normal_deviation]) 16 | 17 | model = pipeline.fit(df) 18 | 19 | out_df = model.transform(df) 20 | out_df.show() 21 | 22 | pipeline.write().overwrite().save("/tmp/my_custom_pipeline") 23 | model.write().overwrite().save("/tmp/my_custom_model_from_custom_pipeline") 24 | -------------------------------------------------------------------------------- /rocket-spark-ml-custom-stages/rocket-pyspark-ml/create_pipeline_model.py: -------------------------------------------------------------------------------- 1 | from pyspark.ml import Pipeline, PipelineModel 2 | from pyspark.ml.classification import LogisticRegression 3 | from pyspark.ml.feature import HashingTF, Tokenizer 4 | from pyspark.sql import SparkSession 5 | 6 | from rocket_pyspark_ml.simple_custom_transformer import LiteralColumnAdder 7 | 8 | spark = SparkSession.builder \ 9 | .master("local") \ 10 | .appName("test") \ 11 | .getOrCreate() 12 | 13 | # Prepare training documents from a list of (id, text, label) tuples. 
14 | df = spark.createDataFrame([ 15 | (0, "a b c d e spark", 1.0), 16 | (1, "b d", 0.0), 17 | (2, "spark f g h", 1.0), 18 | (3, "hadoop mapreduce", 3.0) 19 | ], ["id", "text", "label"]) 20 | 21 | # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. 22 | tokenizer = Tokenizer(inputCol="text", outputCol="words") 23 | hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000) 24 | lr = LogisticRegression(maxIter=10, regParam=0.001) 25 | 26 | # Custom transformer 27 | custom = LiteralColumnAdder() 28 | 29 | 30 | sub_pipeline = Pipeline(stages=[custom, tokenizer, hashingTF, lr]) 31 | model = sub_pipeline.fit(df) 32 | 33 | model.write().overwrite().save("/tmp/my_custom_model") 34 | 35 | loaded_model = PipelineModel.load("/tmp/my_custom_model") 36 | 37 | loaded_model.transform(df).show() -------------------------------------------------------------------------------- /rocket-spark-ml-custom-stages/rocket-pyspark-ml/dist/rocket_pyspark_ml-0.1.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-spark-ml-custom-stages/rocket-pyspark-ml/dist/rocket_pyspark_ml-0.1.0.tar.gz -------------------------------------------------------------------------------- /rocket-spark-ml-custom-stages/rocket-pyspark-ml/readme.md: -------------------------------------------------------------------------------- 1 | 2 | ### Python package with custom PySpark estimator and transformer 3 | 4 | #### How to upload to Nexus 5 | 6 | Upload → utilidad twine 7 | 8 | pip instal twine 9 | 10 | Fichero de configuración para apuntar a repositorio externo: 11 | 12 | gedit ~/.pypirc 13 | 14 | [distutils] 15 | index-servers = pypi 16 | [pypi] 17 | repository: https://nexus.s000001.xray.labs.stratio.com/repository/rocket-pip-internal/ 18 | username: admin 19 | password: 1234 20 | 21 | Por linea de 
comandos: 22 | 23 | twine upload XXX.tar.gz --cert ~/workspace/entornos/xray/ca.crt --verbose 24 | 25 | -------------------------------------------------------------------------------- /rocket-spark-ml-custom-stages/rocket-pyspark-ml/rocket_pyspark_ml/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-spark-ml-custom-stages/rocket-pyspark-ml/rocket_pyspark_ml/__init__.py -------------------------------------------------------------------------------- /rocket-spark-ml-custom-stages/rocket-pyspark-ml/rocket_pyspark_ml/simple_custom_transformer.py: -------------------------------------------------------------------------------- 1 | from pyspark import keyword_only 2 | from pyspark.ml import Transformer 3 | from pyspark.ml.param.shared import * 4 | from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable 5 | from pyspark.sql import DataFrame 6 | from pyspark.sql.functions import lit 7 | 8 | 9 | class HasLiteralValue(Params): 10 | literalValue = Param( 11 | Params._dummy(), "literalValue", "literalValue", typeConverter=TypeConverters.toFloat 12 | ) 13 | 14 | def __init__(self): 15 | super(HasLiteralValue, self).__init__() 16 | self._setDefault(literalValue=1.0) 17 | 18 | def setLiteralValue(self, value): 19 | return self._set(literalValue=value) 20 | 21 | def getLiteralValue(self): 22 | return self.getOrDefault(self.literalValue) 23 | 24 | 25 | class LiteralColumnAdder(Transformer, HasLiteralValue, HasOutputCol, DefaultParamsReadable, DefaultParamsWritable): 26 | """ 27 | A custom Transformer which drops all columns that have at least one of the 28 | words from the banned_list in the name. 
29 | """ 30 | 31 | @keyword_only 32 | def __init__(self): 33 | super(LiteralColumnAdder, self).__init__() 34 | 35 | def _transform(self, df: DataFrame) -> DataFrame: 36 | return df.withColumn(self.getOutputCol(), lit(self.getLiteralValue())) 37 | -------------------------------------------------------------------------------- /rocket-spark-ml-custom-stages/rocket-pyspark-ml/rocket_pyspark_ml/test.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | def dummy_func(x): 4 | return "Dummy code - {}".format(str(x)) 5 | -------------------------------------------------------------------------------- /rocket-spark-ml-custom-stages/rocket-pyspark-ml/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.6 2 | # 3 | # © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 4 | # 5 | # This software – including all its source code – contains proprietary 6 | # information of Stratio Big Data Inc., Sucursal en España and 7 | # may not be revealed, sold, transferred, modified, distributed or 8 | # otherwise made available, licensed or sublicensed to third parties; 9 | # nor reverse engineered, disassembled or decompiled, without express 10 | # written authorization from Stratio Big Data Inc., Sucursal en España. 
11 | # 12 | 13 | import os 14 | 15 | from setuptools import setup 16 | 17 | pjoin = os.path.join 18 | 19 | here = os.path.abspath(os.path.dirname(__file__)) 20 | 21 | packages = [] 22 | for d, _, _ in os.walk('rocket_pyspark_ml'): 23 | if os.path.exists(pjoin(d, '__init__.py')): 24 | packages.append(d.replace(os.path.sep, '.')) 25 | 26 | 27 | def setup_package(): 28 | metadata = dict( 29 | name='rocket_pyspark_ml', 30 | packages=packages, 31 | description="""Rocket Pyspark ml""", 32 | long_description="Stratio Rocket - PySpark ml custom stages", 33 | author="Stratio Rocket", 34 | platforms="Linux", 35 | install_requires=[], 36 | version="0.1.0", 37 | keywords=['Rocket', 'PySpark', "ML"], 38 | classifiers=['Programming Language :: Python :: 3.7'], 39 | ) 40 | 41 | setup(**metadata) 42 | 43 | 44 | if __name__ == '__main__': 45 | setup_package() 46 | -------------------------------------------------------------------------------- /rocket-spark-ml-custom-stages/rocket-spark-ml/dist/rocketSparkMl-0.1.0-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-spark-ml-custom-stages/rocket-spark-ml/dist/rocketSparkMl-0.1.0-SNAPSHOT.jar -------------------------------------------------------------------------------- /rocket-spark-ml-custom-stages/rocket-spark-ml/src/main/scala/org/apache/spark/ml/rocket/features/SimpleCustomTransformer.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.rocket.features 2 | 3 | import org.apache.spark.ml.Transformer 4 | import org.apache.spark.ml.param.{Param, ParamMap} 5 | import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.types._ 8 | import org.apache.spark.sql.{DataFrame, Dataset} 9 | 10 | /* 11 | * © 2017 Stratio 
Big Data Inc., Sucursal en España. All rights reserved. 12 | * 13 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 14 | */ 15 | 16 | 17 | 18 | class ConfigurableWordCount( 19 | override val uid: String 20 | ) extends Transformer with DefaultParamsWritable { 21 | 22 | def this() = this(Identifiable.randomUID("configurablewordcount")) 23 | 24 | // ----------------- 25 | // Parameters 26 | // ----------------- 27 | 28 | final val inputCol = new Param[String](this, "inputCol", "The input column") 29 | final val outputCol = new Param[String](this, "outputCol", "The output column") 30 | 31 | def setInputCol(value: String): this.type = set(inputCol, value) 32 | 33 | def setOutputCol(value: String): this.type = set(outputCol, value) 34 | 35 | override def transformSchema(schema: StructType): StructType = { 36 | // Check that the input type is a string 37 | val idx = schema.fieldIndex($(inputCol)) 38 | val field = schema.fields(idx) 39 | if (field.dataType != StringType) { 40 | throw new Exception(s"Input type ${field.dataType} did not match input type StringType") 41 | } 42 | // Add the return field 43 | schema.add(StructField($(outputCol), IntegerType, false)) 44 | } 45 | 46 | def transform(df: Dataset[_]): DataFrame = { 47 | val wordcount = udf { in: String => in.split(" ").length } 48 | df.select(col("*"), wordcount(df.col($(inputCol))).as($(outputCol))) 49 | } 50 | 51 | def copy(extra: ParamMap): ConfigurableWordCount = { 52 | defaultCopy(extra) 53 | } 54 | } 55 | 56 | object ConfigurableWordCount extends DefaultParamsReadable[ConfigurableWordCount]{ 57 | 58 | override def load(path: String): ConfigurableWordCount = 
super.load(path) 59 | } 60 | -------------------------------------------------------------------------------- /rocket-spark-ml-custom-stages/rocket-spark-ml/src/test/scala/org/apache/spark/ml/rocket/RocketSparkMlBeforeAndAfterAll.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | 7 | package org.apache.spark.ml.rocket 8 | 9 | import org.apache.spark.SparkContext 10 | import org.apache.spark.sql.SparkSession 11 | import org.scalatest.{BeforeAndAfterAll, Suite} 12 | 13 | 14 | trait RocketSparkMlBeforeAndAfterAll extends BeforeAndAfterAll { self: Suite => 15 | 16 | @transient var spark: SparkSession = _ 17 | @transient var sc: SparkContext = _ 18 | //@transient var sqlContext: SQLContext = _ 19 | 20 | override def beforeAll() { 21 | super.beforeAll() 22 | val sparkMasterIp = System.getProperty("spark.master", "local[2]") 23 | spark = SparkSession 24 | .builder().master(sparkMasterIp) 25 | .appName("RocketSparkMlUnitTest") 26 | .getOrCreate() 27 | sc = spark.sparkContext 28 | } 29 | 30 | override def afterAll() { 31 | if(spark != null) 32 | spark.stop() 33 | super.afterAll() 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /rocket-spark-ml-custom-stages/rocket-spark-ml/src/test/scala/org/apache/spark/ml/rocket/RocketSparkMlFunSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. 
All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | 7 | package org.apache.spark.ml.rocket 8 | 9 | import org.scalatest.{FunSuite, Outcome} 10 | 11 | 12 | abstract class RocketSparkMlFunSuite extends FunSuite with RocketSparkMlBeforeAndAfterAll { 13 | 14 | /** 15 | * Log the suite name and the test name before and after each test. 16 | * 17 | * Subclasses should never override this method. If they wish to run 18 | * custom code before and after each test, they should mix in the 19 | * {{org.scalatest.BeforeAndAfter}} trait instead. 20 | */ 21 | 22 | final protected override def withFixture(test: NoArgTest): Outcome = { 23 | val testName = test.text 24 | val suiteName = this.getClass.getName 25 | val shortSuiteName = suiteName.replaceAll("com.stratio.intelligence", "c.s.i") 26 | try { 27 | print(s"\n\n===== TEST OUTPUT FOR $shortSuiteName: '$testName' =====\n") 28 | test() 29 | 30 | } finally { 31 | print(s"\n\n===== FINISHED $shortSuiteName: '$testName' =====\n") 32 | } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /rocket-spark-ml-custom-stages/rocket-spark-ml/src/test/scala/org/apache/spark/ml/rocket/features/SimpleCustomEstimatorTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 
3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | 7 | package org.apache.spark.ml.rocket.features 8 | 9 | import org.apache.spark.ml.{Pipeline, PipelineModel} 10 | import org.apache.spark.ml.classification.LogisticRegression 11 | import org.apache.spark.ml.feature.{HashingTF, Tokenizer} 12 | import org.apache.spark.ml.rocket.RocketSparkMlFunSuite 13 | import org.junit.runner.RunWith 14 | import org.scalatestplus.junit.JUnitRunner 15 | 16 | @RunWith(classOf[JUnitRunner]) 17 | class SimpleCustomEstimatorTest extends RocketSparkMlFunSuite{ 18 | 19 | test("Example"){ 20 | 21 | // => Prepare training documents from a list of (id, text, label) tuples. 
#!/usr/bin/env python3
"""Convert a Rocket debug (Spark JSON) schema into a Scala or PySpark schema literal."""

import argparse
import json


def true_str(language: str) -> str:
    # Nullable flag literal: Scala 'true' vs Python 'True'.
    return 'true' if language == 'scala' else 'True'


def other_type(type_str: str, language: str) -> str:
    # Primitive type, e.g. 'string' -> 'StringType' (Scala) or 'StringType()' (Python).
    return f'{type_str.capitalize()}Type{"()" if language == "python" else ""}'


def to_array(fields: list, language: str) -> str:
    # Field collection literal: Seq(...) for Scala, [...] for Python.
    return f'Seq({",".join(fields)})' if language == 'scala' else f'[{",".join(fields)}]'


def struct_type(fields: list, language: str, name: str = None) -> str:
    """Render a struct: a named StructField wrapping a StructType when `name`
    is given, a bare StructType otherwise (root / array-element structs)."""
    array_str = to_array(fields, language)
    if name is not None:
        return f'StructField("{name}", StructType({array_str}), {true_str(language)})'
    return f'StructType({array_str})'


def obtain_schema(node: dict, language: str) -> str:
    """Recursively render one schema node.

    `node` is either the root struct ({"type": "struct", "fields": [...]}) or a
    field entry ({"name": ..., "type": <str or dict>, ...}).

    FIX: replaced `type(x) == dict` with `isinstance`, and made an unsupported
    nested complex type raise ValueError instead of silently returning None
    (which previously leaked the string "None" into the generated schema).
    """
    internal_node = node.get('type')
    if internal_node == 'struct':
        # Root (or array-element) struct: render its fields without a wrapping name.
        fields = [obtain_schema(elem, language)
                  for elem in node.get('fields', [])]
        return struct_type(fields, language)
    if isinstance(internal_node, dict):
        # Named field whose type is complex: a nested struct or an array.
        if internal_node.get('type') == 'struct':
            fields = [obtain_schema(elem, language)
                      for elem in internal_node.get('fields', [])]
            return struct_type(fields, language, node.get("name"))
        if internal_node.get('type') == 'array':
            element_type = internal_node.get('elementType')
            if isinstance(element_type, dict):
                # Complex element type (e.g. array of structs): recurse.
                type_str = obtain_schema(element_type, language)
            else:
                type_str = other_type(element_type, language)
            return f'StructField("{node.get("name")}", ArrayType({type_str}), {true_str(language)})'
        raise ValueError(f'Unsupported complex type: {internal_node.get("type")!r}')
    # Named field with a primitive type.
    return f'StructField("{node.get("name")}", {other_type(internal_node, language)}, {true_str(language)})'


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Convert Rocket debug schema to Scala/Python schema')
    parser.add_argument(
        "--input",
        type=str,
        help="Input file containing debug schema",
        required=True
    )
    parser.add_argument(
        "--language",
        type=str,
        default='scala',
        help="Language of the schema, must be 'scala' or 'python'",
        choices=['python', 'scala']
    )
    args = parser.parse_args()
    with open(args.input) as file:
        print(obtain_schema(json.load(file), args.language))
#!/usr/bin/env python3
"""Obtain the SSO cookies needed to talk to Marathon by scripting the GoSec SSO login flow."""

import argparse
import requests
from bs4 import BeautifulSoup
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Disable InsecureRequestWarning from the logs: requests are deliberately made
# with verify=False (internal endpoints with self-signed certificates).
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# Only these cookies from the login response are needed for Marathon requests.
needed_cookies = ["user", "mesosphere_server_id"]


def login_sso(url, username, password):
    """
    Function that simulates the login in to sparta endpoint flow with SSO to obtain a valid
    cookie that will be used to make requests to Marathon
    """
    # First request to mesos master to be redirected to gosec sso login
    # page and be given a session cookie
    r = requests.Session()
    first_response = r.get(url, verify=False)
    callback_url = first_response.url

    # Parse response body for hidden tags needed in the data of our login post request.
    body = first_response.text
    all_tags = BeautifulSoup(body, "lxml").find_all("input", type="hidden")
    tags_to_find = ['lt', 'execution']
    # FIX: use tag.get('name') so hidden inputs without a 'name' attribute are
    # skipped instead of raising KeyError.
    hidden_tags = [tag.attrs for tag in all_tags if tag.get('name') in tags_to_find]
    data = {tag['name']: tag['value'] for tag in hidden_tags}

    # Add the rest of needed fields and login credentials in the data of
    # our login post request and send it
    data.update({
        '_eventId': 'submit',
        'submit': 'LOGIN',
        'username': username,
        'password': password,
        'tenant': 'NONE'
    })
    login_response = r.post(callback_url, data=data, verify=False)
    return login_response


def get_cookies(url, usr, passwd):
    """Yield 'name=value' strings for the SSO cookies listed in needed_cookies."""
    for (k, v) in login_sso(url, usr, passwd).request._cookies.items():
        if k in needed_cookies:
            yield "{}={}".format(k, v)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Obtain SSO cookies.')
    parser.add_argument(
        "--user",
        dest="user",
        type=str,
        help="User to login in SSO",
        nargs="?"
    )
    parser.add_argument(
        "--password",
        dest="password",
        type=str,
        help="Password to login in SSO",
        nargs="?"
    )
    parser.add_argument(
        "--url",
        dest="url",
        type=str,
        # FIX: help text was a copy-paste of the --password help.
        help="URL of the SSO-protected endpoint",
        nargs="?"
    )
    args = parser.parse_args()
    cookies = "; ".join(get_cookies(args.url, args.user, args.password))
    print("Cookie: {}".format(cookies))
5 | */ 6 | package com.stratio.sparta 7 | 8 | import com.stratio.sparta.sdk.lite.batch.models._ 9 | import com.stratio.sparta.sdk.lite.validation.ValidationResult 10 | import com.stratio.sparta.sdk.lite.xd.batch._ 11 | import org.apache.spark.sql._ 12 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 13 | import org.apache.spark.sql.crossdata.XDSession 14 | import org.apache.spark.sql.types._ 15 | 16 | class GeneratorXDLiteInputStepBatch( 17 | xdSession: XDSession, 18 | properties: Map[String, String] 19 | ) 20 | extends LiteCustomXDBatchInput(xdSession, properties) { 21 | 22 | lazy val stringSchema = StructType(Seq(StructField("raw", StringType))) 23 | lazy val rawData: Option[String] = properties.get("raw").map(_.toString) 24 | 25 | 26 | override def validate(): ValidationResult = { 27 | var validation = ValidationResult(valid = true, messages = Seq.empty) 28 | 29 | if (rawData.isEmpty) { 30 | validation = ValidationResult( 31 | valid = false, 32 | messages = validation.messages :+ "Test data must be set inside the Option properties with an option key named 'raw'") 33 | } 34 | 35 | if (rawData.map(_.trim).forall(_.isEmpty)) { 36 | validation = ValidationResult( 37 | valid = false, 38 | messages = validation.messages :+ "Generated data cannot be an empty string") 39 | } 40 | validation 41 | } 42 | 43 | override def init(): ResultBatchData = { 44 | val register = Seq(new GenericRowWithSchema(Array("test-data"), stringSchema).asInstanceOf[Row]) 45 | val defaultRDD = xdSession.sparkContext.parallelize(register) 46 | 47 | ResultBatchData(defaultRDD, Option(stringSchema)) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /sparta-plugins/input-lite-xd/src/main/scala/com/stratio/sparta/GeneratorXDLiteInputStepStreaming.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 
3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta 7 | 8 | import org.apache.spark.sql._ 9 | import org.apache.spark.streaming.dstream.DStream 10 | import org.apache.spark.rdd.RDD 11 | import org.apache.spark.sql.functions._ 12 | import com.stratio.sparta.sdk.lite.xd.streaming._ 13 | import com.stratio.sparta.sdk.lite.streaming.models._ 14 | import org.apache.spark.sql.crossdata.XDSession 15 | 16 | import scala.util.{Failure, Success, Try} 17 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 18 | import org.apache.spark.sql.types._ 19 | import org.apache.spark.streaming.StreamingContext 20 | 21 | import scala.collection.mutable 22 | 23 | class GeneratorXDLiteInputStepStreaming( 24 | xdSession: XDSession, 25 | streamingContext: StreamingContext, 26 | properties: Map[String, String] 27 | ) 28 | extends LiteCustomXDStreamingInput(xdSession, streamingContext, properties) { 29 | 30 | lazy val stringSchema = StructType(Seq(StructField("raw", StringType))) 31 | 32 | override def init(): ResultStreamingData = { 33 | val dataQueue = new mutable.Queue[RDD[Row]]() 34 | val register = Seq(new GenericRowWithSchema(Array("test-data"), stringSchema).asInstanceOf[Row]) 35 | dataQueue += xdSession.sparkContext.parallelize(register) 36 | val stream = streamingContext.queueStream(dataQueue) 37 | 38 | ResultStreamingData(stream, Option(stringSchema)) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /sparta-plugins/output-lite-xd/src/main/scala/com/stratio/sparta/LoggerXDLiteOutputStep.scala: 
-------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta 7 | 8 | import com.stratio.sparta.sdk.lite.common.models.OutputOptions 9 | import com.stratio.sparta.sdk.lite.xd.common._ 10 | import org.apache.spark.sql._ 11 | import org.apache.spark.sql.crossdata.XDSession 12 | 13 | import scala.util.{Failure, Success, Try} 14 | 15 | class LoggerXDLiteOutputStep( 16 | xdSession: XDSession, 17 | properties: Map[String, String] 18 | ) 19 | extends LiteCustomXDOutput(xdSession, properties) { 20 | 21 | lazy val metadataEnabled = properties.get("metadataEnabled") match { 22 | case Some(value: String) => Try(value.toBoolean) match { 23 | case Success(v) => v 24 | case Failure(ex) => throw new IllegalStateException(s"$value not parseable to boolean", ex) 25 | } 26 | case None => throw new IllegalStateException("'metadataEnabled' key must be defined in the Option properties") 27 | } 28 | 29 | override def save(data: DataFrame, outputOptions: OutputOptions): Unit = { 30 | val tableName = outputOptions.tableName.getOrElse{ 31 | logger.error("Table name not defined") 32 | throw new NoSuchElementException("tableName not found in options")} 33 | 34 | if (metadataEnabled){ 35 | logger.info(s"Table name: $tableName") 36 | logger.info(s"Save mode is set to ${outputOptions.saveMode}") 37 | } 38 | data.foreach{ row => 39 | println(row.mkString(",")) 40 | } 41 | } 42 | 43 | override def save(data: DataFrame, saveMode: String, saveOptions: 
Map[String, String]): Unit = () 44 | } 45 | -------------------------------------------------------------------------------- /sparta-plugins/transformation-lite-xd/src/main/scala/com/stratio/sparta/properties/ValidatePropertiesMap.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta.properties 7 | 8 | class ValidatePropertiesMap [K, V](val m: Map[K, V]){ 9 | def getString(key: K): String = 10 | m.get(key) match { 11 | case Some(value: String) => value 12 | case Some(value) => value.toString 13 | case None => 14 | throw new IllegalStateException(s"$key is mandatory") 15 | } 16 | 17 | def notBlank(option: Option[String]): Boolean = 18 | option.map(_.trim).forall(_.isEmpty) 19 | } 20 | 21 | class NotBlankOption(s: Option[String]) { 22 | def notBlank: Option[String] = s.map(_.trim).filterNot(_.isEmpty) 23 | } 24 | 25 | object ValidatingPropertyMap{ 26 | implicit def map2ValidatingPropertyMap[K, V](m: Map[K, V]): ValidatePropertiesMap[K, V] = 27 | new ValidatePropertiesMap[K, V](m) 28 | 29 | implicit def option2NotBlankOption(s: Option[String]): NotBlankOption = new NotBlankOption(s) 30 | } -------------------------------------------------------------------------------- /sparta-plugins/transformation-lite-xd/src/main/scala/com/stratio/sparta/transformations/repartition/RepartitionXDLiteTransformStepBatch.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 
Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta.transformations.repartition 7 | 8 | import java.io.{Serializable => JSerializable} 9 | 10 | import com.stratio.sparta.sdk.lite.batch.models._ 11 | import com.stratio.sparta.sdk.lite.xd.batch._ 12 | import org.apache.spark.sql.crossdata.XDSession 13 | 14 | class RepartitionXDLiteTransformStepBatch( 15 | xdSession: XDSession, 16 | properties: Map[String, String] 17 | ) extends LiteCustomXDBatchTransform(xdSession, properties) { 18 | 19 | override def transform(inputData: Map[String, ResultBatchData]): OutputBatchTransformData = { 20 | val inputStream = inputData.head._2.data 21 | 22 | OutputBatchTransformData(inputStream.repartition(5)) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /sparta-plugins/transformation-lite-xd/src/main/scala/com/stratio/sparta/transformations/repartition/RepartitionXDLiteTransformStepStreaming.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
package com.stratio.sparta.transformations.repartition

import com.stratio.sparta.sdk.lite.streaming.models._
import com.stratio.sparta.sdk.lite.xd.streaming._
import org.apache.spark.sql.crossdata.XDSession
import org.apache.spark.streaming.StreamingContext

/**
 * Streaming transformation that repartitions every micro-batch of its (single) input.
 *
 * IMPROVEMENT: the partition count was hard-coded to 5; it is now read from the
 * "partitions" option, defaulting to 5 so existing configurations are unchanged
 * (and matching the batch variant of this step).
 */
class RepartitionXDLiteTransformStepStreaming(
                                               xdSession: XDSession,
                                               streamingContext: StreamingContext,
                                               properties: Map[String, String]
                                             ) extends LiteCustomXDStreamingTransform(xdSession, streamingContext, properties) {

  // Target partition count; a non-integer value fails fast with NumberFormatException.
  lazy val partitions: Int = properties.get("partitions").map(_.trim.toInt).getOrElse(5)

  override def transform(inputData: Map[String, ResultStreamingData]): OutputStreamingTransformData = {
    // A single upstream input is expected; repartition each RDD of its stream.
    val newStream = inputData.head._2.data.transform { rdd =>
      rdd.repartition(partitions)
    }

    OutputStreamingTransformData(newStream)
  }
}
5 | */ 6 | 7 | package com.stratio.sparta 8 | 9 | import com.stratio.sparta.sdk.lite.common.SpartaUDF 10 | import org.apache.spark.sql.catalyst.ScalaReflection 11 | import org.apache.spark.sql.expressions.UserDefinedFunction 12 | import org.apache.spark.sql.types.StringType 13 | 14 | import scala.util.Try 15 | 16 | case class ToUpperCaseUDF() extends SpartaUDF { 17 | 18 | val name = "uppercaseSparta" 19 | 20 | val upper: String => String = _.toUpperCase 21 | 22 | val userDefinedFunction: UserDefinedFunction = 23 | UserDefinedFunction(upper , StringType, Option(Seq(StringType))) 24 | } 25 | 26 | case class ConcatUDF() extends SpartaUDF { 27 | 28 | val name = "concatSparta" 29 | 30 | val upper: (String, String) => String = { case (str1, str2) => 31 | s"$str1/$str2" 32 | } 33 | 34 | val userDefinedFunction: UserDefinedFunction = 35 | UserDefinedFunction(upper , StringType, Option(Seq(StringType, StringType))) 36 | } 37 | 38 | case class ToUpperCaseWithReflectionUDF() extends SpartaUDF { 39 | 40 | val name = "upperCaseReflect" 41 | 42 | val upper: String => String = _.toUpperCase 43 | 44 | val userDefinedFunction: UserDefinedFunction = { 45 | val inputTypes = Try(ScalaReflection.schemaFor(ScalaReflection.localTypeOf[String]).dataType :: Nil).toOption 46 | UserDefinedFunction(upper , ScalaReflection.schemaFor(ScalaReflection.localTypeOf[String]).dataType, inputTypes) 47 | } 48 | } -------------------------------------------------------------------------------- /sparta-plugins/udf/src/main/scala/org/apache/spark/sql/sparta/udf/GetDenseVectorUDF.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 
3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | 7 | package org.apache.spark.sql.sparta.udf 8 | 9 | import com.stratio.sparta.sdk.lite.common.SpartaUDF 10 | import org.apache.spark.sql.expressions.UserDefinedFunction 11 | import org.apache.spark.sql.types.{DoubleType, IntegerType} 12 | import org.apache.spark.ml.linalg.VectorUDT 13 | 14 | case object VectorUDT extends VectorUDT 15 | 16 | case class GetDenseVectorUDF() extends SpartaUDF { 17 | 18 | val name = "get_vector_ith_element" 19 | 20 | val getVectorElement = (vector: org.apache.spark.ml.linalg.Vector, num: Int) => vector(num) 21 | 22 | val userDefinedFunction: UserDefinedFunction = 23 | UserDefinedFunction(getVectorElement , DoubleType, Option(Seq(VectorUDT, IntegerType))) 24 | } 25 | 26 | 27 | 28 | --------------------------------------------------------------------------------