├── .gitignore ├── README.md ├── horovod-images ├── DockerfileAnalyticHorovodGpu ├── DockerfileDriverHorovodGpu ├── DockerfileExecutorHorovodGpu ├── README.md └── mlflow_venv │ └── conda.yaml ├── mlflow_mlprojects ├── mlflow-devEnv-archetype │ ├── Readme.md │ ├── __init__.py │ ├── clean_venvs_mlflow_runs.sh │ ├── create_vEnv_mlproject_command.sh │ ├── create_vEnv_spark_inference.sh │ ├── data │ │ └── train.csv │ ├── doc │ │ └── .keep │ ├── launcher_mlproject_command.py │ ├── launcher_spark_inference.py │ ├── mlflow_runs │ │ └── .keep │ ├── mlproject │ │ ├── MLproject │ │ ├── __init__.py │ │ ├── conda.yaml │ │ └── train.py │ ├── spark_inference │ │ ├── conda.yaml │ │ └── model │ │ │ └── .keep │ ├── utils │ │ └── install_conda.sh │ └── venvs │ │ ├── launcher │ │ └── .keep │ │ ├── mlproject │ │ └── .keep │ │ └── spark_inference │ │ └── .keep └── mlflow-devEnv-custom-model │ ├── __init__.py │ ├── clean_venvs_mlflow_runs.sh │ ├── create_vEnv_mlproject_command.sh │ ├── create_vEnv_spark_inference.sh │ ├── data │ └── train.csv │ ├── doc │ └── .keep │ ├── launcher_mlproject_command.py │ ├── launcher_spark_inference.py │ ├── mlflow_runs │ └── .keep │ ├── mlproject │ ├── MLproject │ ├── __init__.py │ ├── conda.yaml │ └── train.py │ ├── spark_inference │ ├── conda.yaml │ └── model │ │ ├── .keep │ │ ├── MLmodel │ │ ├── conda.yaml │ │ └── python_model.pkl │ ├── utils │ └── install_conda.sh │ └── venvs │ ├── launcher │ └── .keep │ ├── mlproject │ └── .keep │ └── spark_inference │ └── .keep ├── rocket-extensions ├── Readme.md ├── old-extensions │ ├── input-lite-xd │ │ ├── pom.xml │ │ └── src │ │ │ └── main │ │ │ └── scala │ │ │ └── com │ │ │ └── stratio │ │ │ └── sparta │ │ │ ├── GeneratorXDLiteInputStepBatch.scala │ │ │ ├── GeneratorXDLiteInputStepStreaming.scala │ │ │ └── MetadataTestXDLiteInputStepBatch.scala │ ├── output-lite-xd │ │ ├── pom.xml │ │ └── src │ │ │ └── main │ │ │ └── scala │ │ │ └── com │ │ │ └── stratio │ │ │ └── sparta │ │ │ └── LoggerXDLiteOutputStep.scala │ 
├── transformation-lite-xd │ │ ├── pom.xml │ │ └── src │ │ │ ├── main │ │ │ └── scala │ │ │ │ └── com │ │ │ │ └── stratio │ │ │ │ └── sparta │ │ │ │ ├── properties │ │ │ │ └── ValidatePropertiesMap.scala │ │ │ │ └── transformations │ │ │ │ ├── repartition │ │ │ │ ├── RepartitionXDLiteTransformStepBatch.scala │ │ │ │ └── RepartitionXDLiteTransformStepStreaming.scala │ │ │ │ └── tokenizer │ │ │ │ ├── TokenizerTransformStepBatch.scala │ │ │ │ └── TokenizerTransformStepStreaming.scala │ │ │ └── test │ │ │ └── scala │ │ │ └── com │ │ │ └── stratio │ │ │ └── sparta │ │ │ └── transformations │ │ │ └── tokenizer │ │ │ └── TokenizerTransformStepBatchTest.scala │ └── udf │ │ ├── pom.xml │ │ └── src │ │ └── main │ │ └── scala │ │ ├── com │ │ └── stratio │ │ │ └── sparta │ │ │ └── SpartaExampleUDFs.scala │ │ └── org │ │ └── apache │ │ └── spark │ │ └── sql │ │ └── sparta │ │ └── udf │ │ └── GetDenseVectorUDF.scala ├── rocket-1.0.0-SDK │ ├── input-lite-xd │ │ ├── pom.xml │ │ └── src │ │ │ └── main │ │ │ └── scala │ │ │ └── com │ │ │ └── stratio │ │ │ └── sparta │ │ │ ├── GeneratorXDLiteInputStepBatch.scala │ │ │ └── GeneratorXDLiteInputStepStreaming.scala │ ├── output-lite-xd │ │ ├── pom.xml │ │ └── src │ │ │ └── main │ │ │ └── scala │ │ │ └── com │ │ │ └── stratio │ │ │ └── sparta │ │ │ └── LoggerXDLiteOutputStep.scala │ ├── packaged-jars │ │ ├── custom-lite-input-xd-1.0.0-SNAPSHOT.jar │ │ ├── custom-lite-output-xd-1.0.0-SNAPSHOT.jar │ │ └── custom-lite-transformation-xd-1.0.0-SNAPSHOT.jar │ └── transformation-lite-xd │ │ ├── pom.xml │ │ └── src │ │ ├── main │ │ └── scala │ │ │ └── com │ │ │ └── stratio │ │ │ └── sparta │ │ │ ├── properties │ │ │ └── ValidatePropertiesMap.scala │ │ │ └── transformations │ │ │ ├── repartition │ │ │ ├── RepartitionXDLiteTransformStepBatch.scala │ │ │ └── RepartitionXDLiteTransformStepStreaming.scala │ │ │ └── tokenizer │ │ │ ├── TokenizerTransformStepBatch.scala │ │ │ └── TokenizerTransformStepStreaming.scala │ │ └── test │ │ └── scala │ │ └── 
com │ │ └── stratio │ │ └── sparta │ │ └── transformations │ │ └── tokenizer │ │ └── TokenizerTransformStepBatchTest.scala ├── rocket-1.1.0-SDK │ └── input-lite-xd │ │ ├── pom.xml │ │ └── src │ │ └── main │ │ └── scala │ │ └── com │ │ └── stratio │ │ └── sparta │ │ ├── GeneratorXDLiteInputStepBatch.scala │ │ ├── GeneratorXDLiteInputStepStreaming.scala │ │ └── MetadataTestXDLiteInputStepBatch.scala ├── rocket-2.2.0-SDK │ ├── input-lite-xd │ │ ├── pom.xml │ │ └── src │ │ │ └── main │ │ │ └── scala │ │ │ └── com │ │ │ └── stratio │ │ │ └── sparta │ │ │ ├── GeneratorXDLiteInputStepBatch.scala │ │ │ ├── GeneratorXDLiteInputStepStreaming.scala │ │ │ ├── MetadataTestXDLiteInputStepBatch.scala │ │ │ ├── ReportLogTestXDLiteInputStepBatch.scala │ │ │ └── ReportLogTestXDLiteInputStepStreaming.scala │ ├── output-lite-xd │ │ ├── pom.xml │ │ └── src │ │ │ └── main │ │ │ └── scala │ │ │ └── com │ │ │ └── stratio │ │ │ └── sparta │ │ │ └── LoggerXDLiteOutputStep.scala │ ├── packaged-jars │ │ ├── custom-lite-input-xd-2.2.0-SNAPSHOT.jar │ │ ├── custom-lite-output-xd-2.2.0-SNAPSHOT.jar │ │ ├── custom-lite-transformation-xd-2.2.0-SNAPSHOT.jar │ │ └── custom-lite-udf-2.2.0-SNAPSHOT.jar │ ├── transformation-lite-xd │ │ ├── pom.xml │ │ └── src │ │ │ ├── main │ │ │ └── scala │ │ │ │ └── com │ │ │ │ └── stratio │ │ │ │ └── sparta │ │ │ │ ├── properties │ │ │ │ └── ValidatePropertiesMap.scala │ │ │ │ └── transformations │ │ │ │ ├── column │ │ │ │ └── AddColumnXDLiteTransformStepBatch.scala │ │ │ │ ├── repartition │ │ │ │ ├── RepartitionXDLiteTransformStepBatch.scala │ │ │ │ └── RepartitionXDLiteTransformStepStreaming.scala │ │ │ │ └── tokenizer │ │ │ │ ├── TokenizerTransformStepBatch.scala │ │ │ │ └── TokenizerTransformStepStreaming.scala │ │ │ └── test │ │ │ └── scala │ │ │ └── com │ │ │ └── stratio │ │ │ └── sparta │ │ │ └── transformations │ │ │ └── tokenizer │ │ │ └── TokenizerTransformStepBatchTest.scala │ └── udf │ │ ├── pom.xml │ │ └── src │ │ └── main │ │ └── scala │ │ ├── com │ │ 
└── stratio │ │ │ └── sparta │ │ │ └── SpartaExampleUDFs.scala │ │ └── org │ │ └── apache │ │ └── spark │ │ └── sql │ │ └── sparta │ │ └── udf │ │ └── GetDenseVectorUDF.scala ├── rocket-2.3.0-SDK │ ├── input-lite-xd │ │ ├── pom.xml │ │ └── src │ │ │ └── main │ │ │ └── scala │ │ │ └── com │ │ │ └── stratio │ │ │ └── sparta │ │ │ ├── GeneratorXDLiteInputStepBatch.scala │ │ │ ├── GeneratorXDLiteInputStepStreaming.scala │ │ │ ├── MetadataTestXDLiteInputStepBatch.scala │ │ │ ├── ReportLogTestXDLiteInputStepBatch.scala │ │ │ └── ReportLogTestXDLiteInputStepStreaming.scala │ ├── output-lite-xd │ │ ├── pom.xml │ │ └── src │ │ │ └── main │ │ │ └── scala │ │ │ └── com │ │ │ └── stratio │ │ │ └── sparta │ │ │ ├── JdbcWithLineageXDLiteOutputStep.scala │ │ │ └── LoggerXDLiteOutputStep.scala │ ├── transformation-lite-xd │ │ ├── pom.xml │ │ └── src │ │ │ ├── main │ │ │ └── scala │ │ │ │ └── com │ │ │ │ └── stratio │ │ │ │ └── sparta │ │ │ │ ├── properties │ │ │ │ └── ValidatePropertiesMap.scala │ │ │ │ └── transformations │ │ │ │ ├── column │ │ │ │ └── AddColumnXDLiteTransformStepBatch.scala │ │ │ │ ├── repartition │ │ │ │ ├── RepartitionXDLiteTransformStepBatch.scala │ │ │ │ └── RepartitionXDLiteTransformStepStreaming.scala │ │ │ │ └── tokenizer │ │ │ │ ├── TokenizerTransformStepBatch.scala │ │ │ │ └── TokenizerTransformStepStreaming.scala │ │ │ └── test │ │ │ └── scala │ │ │ └── com │ │ │ └── stratio │ │ │ └── sparta │ │ │ └── transformations │ │ │ └── tokenizer │ │ │ └── TokenizerTransformStepBatchTest.scala │ └── udf │ │ ├── pom.xml │ │ └── src │ │ └── main │ │ └── scala │ │ ├── com │ │ └── stratio │ │ │ └── sparta │ │ │ └── SpartaExampleUDFs.scala │ │ └── org │ │ └── apache │ │ └── spark │ │ └── sql │ │ └── sparta │ │ └── udf │ │ └── GetDenseVectorUDF.scala └── rocket-3.0.0-SDK │ ├── Readme.md │ ├── input-lite-xd │ ├── pom.xml │ └── src │ │ └── main │ │ └── scala │ │ └── com │ │ └── stratio │ │ └── sparta │ │ ├── GeneratorXDLiteInputStepBatch.scala │ │ ├── 
GeneratorXDLiteInputStepHybrid.scala │ │ ├── GeneratorXDLiteInputStepStreaming.scala │ │ ├── MetadataTestXDLiteInputStepBatch.scala │ │ ├── ReportLogTestXDLiteInputStepBatch.scala │ │ ├── ReportLogTestXDLiteInputStepStreaming.scala │ │ └── StreamGeneratorXDLiteInputStepHybrid.scala │ ├── lineage-qrs │ ├── pom.xml │ └── src │ │ └── main │ │ └── scala │ │ └── com │ │ └── stratio │ │ └── sparta │ │ └── CustomLineageQrs.scala │ ├── output-lite-xd │ ├── pom.xml │ └── src │ │ └── main │ │ └── scala │ │ └── com │ │ └── stratio │ │ └── sparta │ │ ├── JdbcWithLineageXDLiteOutputStep.scala │ │ └── LoggerXDLiteOutputStep.scala │ ├── packaged-jars │ ├── custom-lite-input-xd-3.0.0-SNAPSHOT.jar │ ├── custom-lite-output-xd-3.0.0-SNAPSHOT.jar │ └── custom-lite-transformation-xd-3.0.0-SNAPSHOT.jar │ ├── transformation-lite-xd │ ├── pom.xml │ └── src │ │ ├── main │ │ └── scala │ │ │ └── com │ │ │ └── stratio │ │ │ └── sparta │ │ │ ├── properties │ │ │ └── ValidatePropertiesMap.scala │ │ │ └── transformations │ │ │ ├── column │ │ │ ├── AddColumnXDLiteTransformStepBatch.scala │ │ │ └── AddColumnXDLiteTransformStepHybrid.scala │ │ │ ├── repartition │ │ │ ├── RepartitionXDLiteTransformStepBatch.scala │ │ │ ├── RepartitionXDLiteTransformStepHybrid.scala │ │ │ └── RepartitionXDLiteTransformStepStreaming.scala │ │ │ └── tokenizer │ │ │ ├── TokenizerTransformStepBatch.scala │ │ │ └── TokenizerTransformStepStreaming.scala │ │ └── test │ │ └── scala │ │ └── com │ │ └── stratio │ │ └── sparta │ │ └── transformations │ │ └── tokenizer │ │ └── TokenizerTransformStepBatchTest.scala │ └── udf │ ├── pom.xml │ └── src │ └── main │ └── scala │ ├── com │ └── stratio │ │ └── sparta │ │ └── SpartaExampleUDFs.scala │ └── org │ └── apache │ └── spark │ └── sql │ └── sparta │ └── udf │ └── GetDenseVectorUDF.scala ├── rocket-python-extensions ├── conda-pack-extensions │ ├── conda.yaml │ └── do_conda_pack.sh ├── private-pypi-repository │ ├── Readme.md │ ├── dist │ │ └── rocket_python_examples-0.1.0.tar.gz │ 
├── rocket_python_examples │ │ ├── __init__.py │ │ └── test.py │ └── setup.py └── pyspark-native-extensions │ └── qa-examples │ ├── Readme.md │ ├── same_module_test │ ├── make_packages.sh │ ├── module_build_1 │ │ ├── do_package.sh │ │ └── my_module │ │ │ ├── __init__.py │ │ │ └── user.py │ ├── module_build_2 │ │ ├── do_package.sh │ │ └── my_module │ │ │ ├── __init__.py │ │ │ └── user.py │ ├── user1_module.zip │ └── user2_module.zip │ └── version_test │ ├── v1 │ ├── make_packages.sh │ ├── test_pyfile_egg_from_hdfs │ │ ├── do_package.sh │ │ ├── setup.py │ │ └── test_pyfile_egg_pkg_from_hdfs │ │ │ ├── __init__.py │ │ │ └── test_pyfile_egg.py │ ├── test_pyfile_egg_from_http │ │ ├── do_package.sh │ │ ├── setup.py │ │ └── test_pyfile_egg_pkg_from_http │ │ │ ├── __init__.py │ │ │ └── test_pyfile_egg.py │ ├── test_pyfile_egg_pkg_from_hdfs-0.1.0-py3.7.egg │ ├── test_pyfile_egg_pkg_from_http-0.1.0-py3.7.egg │ ├── test_pyfile_from_hdfs.py │ ├── test_pyfile_from_http.py │ ├── test_pyfile_zip_from_hdfs │ │ ├── do_package.sh │ │ └── test_pyfile_zip_pkg_from_hdfs │ │ │ ├── __init__.py │ │ │ └── test_pyfile_zip.py │ ├── test_pyfile_zip_from_http │ │ ├── do_package.sh │ │ └── test_pyfile_zip_pkg_from_http │ │ │ ├── __init__.py │ │ │ └── test_pyfile_zip.py │ ├── test_pyfile_zip_pkg_from_hdfs.zip │ └── test_pyfile_zip_pkg_from_http.zip │ └── v2 │ ├── make_packages.sh │ ├── test_pyfile_egg_from_hdfs │ ├── do_package.sh │ ├── setup.py │ └── test_pyfile_egg_pkg_from_hdfs │ │ ├── __init__.py │ │ └── test_pyfile_egg.py │ ├── test_pyfile_egg_from_http │ ├── do_package.sh │ ├── setup.py │ └── test_pyfile_egg_pkg_from_http │ │ ├── __init__.py │ │ └── test_pyfile_egg.py │ ├── test_pyfile_egg_pkg_from_hdfs-0.1.0-py3.7.egg │ ├── test_pyfile_egg_pkg_from_http-0.1.0-py3.7.egg │ ├── test_pyfile_from_hdfs.py │ ├── test_pyfile_from_http.py │ ├── test_pyfile_zip_from_hdfs │ ├── do_package.sh │ └── test_pyfile_zip_pkg_from_hdfs │ │ ├── __init__.py │ │ └── test_pyfile_zip.py │ ├── 
test_pyfile_zip_from_http │ ├── do_package.sh │ └── test_pyfile_zip_pkg_from_http │ │ ├── __init__.py │ │ └── test_pyfile_zip.py │ ├── test_pyfile_zip_pkg_from_hdfs.zip │ └── test_pyfile_zip_pkg_from_http.zip ├── rocket-spark-ml-custom-stages ├── rocket-pyspark-ml │ ├── create_pipeline_estimator.py │ ├── create_pipeline_model.py │ ├── dist │ │ └── rocket_pyspark_ml-0.1.0.tar.gz │ ├── readme.md │ ├── rocket_pyspark_ml │ │ ├── __init__.py │ │ ├── simple_custom_estimator.py │ │ ├── simple_custom_transformer.py │ │ └── test.py │ └── setup.py └── rocket-spark-ml │ ├── dist │ └── rocketSparkMl-0.1.0-SNAPSHOT.jar │ ├── pom.xml │ └── src │ ├── main │ └── scala │ │ └── org │ │ └── apache │ │ └── spark │ │ └── ml │ │ └── rocket │ │ └── features │ │ ├── SimpleCustomEstimator.scala │ │ └── SimpleCustomTransformer.scala │ └── test │ └── scala │ └── org │ └── apache │ └── spark │ └── ml │ └── rocket │ ├── RocketSparkMlBeforeAndAfterAll.scala │ ├── RocketSparkMlFunSuite.scala │ └── features │ └── SimpleCustomEstimatorTest.scala ├── scripts ├── README.md ├── input.json ├── schema_convert.py └── sso.py └── sparta-plugins ├── input-lite-xd ├── pom.xml └── src │ └── main │ └── scala │ └── com │ └── stratio │ └── sparta │ ├── GeneratorXDLiteInputStepBatch.scala │ └── GeneratorXDLiteInputStepStreaming.scala ├── output-lite-xd ├── pom.xml └── src │ └── main │ └── scala │ └── com │ └── stratio │ └── sparta │ └── LoggerXDLiteOutputStep.scala ├── transformation-lite-xd ├── pom.xml └── src │ ├── main │ └── scala │ │ └── com │ │ └── stratio │ │ └── sparta │ │ ├── properties │ │ └── ValidatePropertiesMap.scala │ │ └── transformations │ │ ├── repartition │ │ ├── RepartitionXDLiteTransformStepBatch.scala │ │ └── RepartitionXDLiteTransformStepStreaming.scala │ │ └── tokenizer │ │ ├── TokenizerTransformStepBatch.scala │ │ └── TokenizerTransformStepStreaming.scala │ └── test │ └── scala │ └── com │ └── stratio │ └── sparta │ └── transformations │ └── tokenizer │ └── 
TokenizerTransformStepBatchTest.scala └── udf ├── pom.xml └── src └── main └── scala ├── com └── stratio │ └── sparta │ └── SpartaExampleUDFs.scala └── org └── apache └── spark └── sql └── sparta └── udf └── GetDenseVectorUDF.scala /.gitignore: -------------------------------------------------------------------------------- 1 | # use glob syntax. 2 | syntax: glob 3 | *.ser 4 | *.class 5 | *~ 6 | *.bak 7 | #*.off 8 | *.old 9 | 10 | # eclipse conf file 11 | .settings 12 | .classpath 13 | .project 14 | .manager 15 | .scala_dependencies 16 | 17 | # idea 18 | .idea/ 19 | *.iml 20 | 21 | # building 22 | target 23 | build 24 | null 25 | tmp 26 | temp 27 | test-output 28 | build.log 29 | 30 | # other scm 31 | .svn 32 | .CVS 33 | .hg* 34 | 35 | # switch to regexp syntax. 36 | # syntax: regexp 37 | # ^\.pc/ 38 | 39 | #SHITTY output not in target directory 40 | /dependency-reduced-pom.xml 41 | examples/scripts/temp.json 42 | 43 | # Sparkta specifics 44 | logs 45 | /default 46 | checkpoint 47 | node 48 | node_modules 49 | bower_components 50 | 51 | # Documentation autogenerated 52 | javadoc 53 | apidocs 54 | -------------------------------------------------------------------------------- /horovod-images/README.md: -------------------------------------------------------------------------------- 1 | # rocket-distributed-deep-learning 2 | 3 | This directory contains Dockerfiles that generate sample docker images extending Rocket Driver, Rocket Executor and Analytic Intelligence images, by adding an environment with the [Horovod](https://github.com/horovod/horovod/tree/v0.28.1) library compiled with support for Apache Spark, Tensorflow, Keras and PyTorch. 4 | 5 | --- 6 | >**IMPORTANT NOTICE:** 7 | > 8 | >These images **are not intended** to be used in **production environments**. They are intended to be used as a reference for building your own images with the desired libraries and dependencies. 
9 | > 10 | >The provided images have been successfully tested in an environment with nodes providing: 11 | >* Nvidia Tesla T4 GPUs 12 | >* Cuda 12.0 13 | >* cuDNN 8.9.0 14 | >* Nvidia Driver 520.61.05 15 | > 16 | >We do not guarantee that these images will work in environments with a different GPU vendor or different driver versions. 17 | --- 18 | 19 | ## Kyverno policies 20 | In order to deploy Pods with these images, it is necessary to update the Kyverno policies of your namespaces (Rocket and Intelligence) by adding the new image names: 21 | 22 | ### Rocket 23 | _restrict-rocket-images_: 24 | 25 | containers: 26 | - image: >- 27 | */rocket-api:* | */rocket-driver* | */rocket-executor* | 28 | */rocket-ml-prediction-server:* | 29 | */rocket-mleap-microservice:* | 30 | */rocket-mlflow-microservice:* | 31 | */rocket-r-mlflow-microservice:* | */stratio-spark:* 32 | 33 | ### Intelligence 34 | _restrict-intelligence-images_: 35 | 36 | containers: 37 | - image: >- 38 | */intelligence-environment:* | */analytic-environment:* 39 | | */analytic-environment-light:* | 40 | */analytic-environment-horovod-gpu:* | 41 | */intelligence-backup-restore:* | */stratio-spark:* 42 | 43 | Add the pattern matching your image names. 
-------------------------------------------------------------------------------- /horovod-images/mlflow_venv/conda.yaml: -------------------------------------------------------------------------------- 1 | name: rocket-tensorflow-hdfs 2 | 3 | channels: 4 | - conda-forge 5 | - nodefaults 6 | 7 | dependencies: 8 | - python=3.9.7 9 | - pip=21.2.4 10 | - pip: 11 | - mlflow==2.6.0 12 | - tensorflow==2.11.0 13 | - tensorflow-io==0.25.0 -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/mlflow_mlprojects/mlflow-devEnv-archetype/__init__.py -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/clean_venvs_mlflow_runs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | WD="$(dirname "$(readlink -f "$0")")" 4 | VENVS_PATH=$WD/venvs 5 | MLFLOW_RUNS_PATH=$WD/mlflow_runs 6 | 7 | read -p "Executing rm -rf ${MLFLOW_RUNS_PATH}/* - (y/n)?" choice 8 | case "$choice" in 9 | y | Y) 10 | rm -rf "${MLFLOW_RUNS_PATH:?}"/* 11 | rm -rf "${MLFLOW_RUNS_PATH:?}"/.trash 12 | ;; 13 | *) 14 | echo "Skipping" 15 | ;; 16 | esac 17 | 18 | read -p "Executing rm -rf ${VENVS_PATH}/{launcher & mlproject & spark_inference}/* - (y/n)?" 
choice 19 | case "$choice" in 20 | y | Y) 21 | rm -rf "${VENVS_PATH:?}"/launcher/* 22 | rm -rf "${VENVS_PATH:?}"/mlproject/* 23 | rm -rf "${VENVS_PATH:?}"/spark_inference/* 24 | ;; 25 | *) 26 | echo "Skipping" 27 | ;; 28 | esac 29 | 30 | -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/create_vEnv_mlproject_command.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | WD="$(dirname "$(readlink -f "$0")")" 4 | CONDA_YAML_PATH=$WD/mlproject/conda.yaml 5 | VENV_PATH=$WD/venvs/mlproject 6 | 7 | echo "Creating conda environment defined in ${CONDA_YAML_PATH} at ${VENV_PATH}" 8 | 9 | conda env create -f "$CONDA_YAML_PATH" --prefix "$VENV_PATH" 10 | 11 | -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/create_vEnv_spark_inference.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | WD="$(dirname "$(readlink -f "$0")")" 4 | CONDA_YAML_PATH=$WD/spark_inference/conda.yaml 5 | VENV_PATH=$WD/venvs/spark_inference 6 | 7 | echo "Creating conda environment defined in ${CONDA_YAML_PATH} at ${VENV_PATH}" 8 | 9 | conda env create -f "$CONDA_YAML_PATH" --prefix "$VENV_PATH" 10 | 11 | -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/data/train.csv: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/doc/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/mlflow_mlprojects/mlflow-devEnv-archetype/doc/.keep 
-------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/launcher_mlproject_command.py: -------------------------------------------------------------------------------- 1 | import os 2 | import mlflow 3 | from mlflow.tracking import MlflowClient 4 | 5 | # TODO - Importing python script ~ MLproject file: command: "python train.py --train-data..." 6 | from mlproject import train 7 | 8 | # -------------------------------------------------------------------------------- 9 | # Note: This python script must be launched using the virtual environment 10 | # defined in MLproject through conda.yaml file: 11 | # 12 | # Note: This v.env has been pre-created at venvs/mlproject 13 | # -------------------------------------------------------------------------------- 14 | 15 | # Current directory 16 | wd = os.path.abspath(os.path.dirname(__file__)) 17 | 18 | # Reading data ~ Rocket setup 19 | input_csv = os.path.join(wd, 'data', 'train.csv') 20 | 21 | # Creating experiment ~ Mlflow launcher 22 | mlflow.set_tracking_uri("file://{}".format(os.path.join(wd, 'mlflow_runs/'))) 23 | client = MlflowClient( 24 | tracking_uri="file://{}".format(os.path.join(wd, 'mlflow_runs')) 25 | ) 26 | experiment_id = "local" 27 | if not client.get_experiment_by_name(experiment_name): 28 | client.create_experiment(experiment_name) 29 | 30 | # Executing command defined in MLproject file ~ Mlflow launcher 31 | train.main( 32 | [ 33 | "--training-data={}".format(input_csv), 34 | # TODO - all command line arguments that accept python script used in MLproject entrypoint/command 35 | ... 
36 | ] 37 | ) 38 | -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/launcher_spark_inference.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | # => Working directory 5 | wd = os.path.abspath(os.path.dirname(__file__)) 6 | 7 | # => Setting spark environment ~ Rocket integration 8 | if not os.getenv('SPARK_HOME'): 9 | os.environ['SPARK_HOME'] = "XXXXXXXXXXX" 10 | spark_home = os.environ.get('SPARK_HOME', None) 11 | # Add pyspark and py4j to path. 12 | sys.path.insert(0, spark_home + "/python") 13 | sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.9-src.zip')) 14 | 15 | from pyspark.sql import SparkSession 16 | import mlflow 17 | 18 | # => Creating a pyspark session ~ Rocket integration 19 | spark = SparkSession.builder.master("local[*]")\ 20 | .appName("Debugging Spark-Mlflow integration") \ 21 | .config("spark.sql.execution.arrow.pyspark.enabled", "true") \ 22 | .getOrCreate() 23 | 24 | # => Reading data ~ Rocket integration 25 | df = spark.read.csv( 26 | path=os.path.join(wd, 'data', 'train.csv'), 27 | header=True, 28 | inferSchema=True 29 | ) 30 | 31 | # => Mlflow logged model path ~ Rocket integration 32 | modelDirPath = os.path.join(wd, 'spark_inference', 'model') 33 | # · Loading model 34 | loaded_model = mlflow.pyfunc.load_model(modelDirPath) 35 | 36 | 37 | # => Constructing UDF ~ Rocket integration 38 | # · We need input features, output column name and output column type 39 | features = None 40 | output_spark_schema = None 41 | 42 | # Try to use model signature to infer this parameters 43 | if loaded_model.metadata.signature: 44 | # Input features 45 | input_signature = loaded_model.metadata.signature.inputs 46 | features = [s.name for s in input_signature.inputs] 47 | # Output column name & type 48 | output_signature = loaded_model.metadata.signature.outputs 49 | output_spark_schema = 
output_signature.as_spark_schema() 50 | 51 | 52 | # · Input features 53 | if not features: 54 | features = ["XXXXXXXXXX"] # Must be defined manually if your logged model do not incorporate signature 55 | print("Input features for UDF: {}".format(features)) 56 | 57 | # · Output column name & type 58 | if not output_spark_schema: 59 | predictionColumnName = "XXXXXX" # Must be defined manually if your logged model do not incorporate signature 60 | predictionColumnType = "XXXXXX" # Must be defined manually if your logged model do not incorporate signature 61 | else: 62 | print("Spark schema: {}".format(output_spark_schema)) 63 | predictionColumnName = output_spark_schema[0].name 64 | predictionColumnType = output_spark_schema[0].dataType 65 | 66 | print("Prediction column name for UDF: {}".format(predictionColumnName)) 67 | print("Prediction column type for UDF: {}".format(predictionColumnType)) 68 | 69 | prediction_udf = mlflow.pyfunc.spark_udf(spark, modelDirPath, result_type=predictionColumnType) 70 | 71 | # => Making predictions ~ Rocket integration 72 | predictionDf = df.withColumn(predictionColumnName, prediction_udf(*features)) 73 | predictionDf.show() 74 | 75 | -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/mlflow_runs/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/mlflow_mlprojects/mlflow-devEnv-archetype/mlflow_runs/.keep -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/mlproject/MLproject: -------------------------------------------------------------------------------- 1 | name: XXXXXXXX 2 | 3 | conda_env: conda.yaml 4 | 5 | entry_points: 6 | main: 7 | parameters: 8 | training_data: string 9 | 10 | command: python train.py --training-data={training_data} 
-------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/mlproject/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/mlflow_mlprojects/mlflow-devEnv-archetype/mlproject/__init__.py -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/mlproject/conda.yaml: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------- 2 | # Definition file for a python Conda managed virtual environment 3 | # ----------------------------------------------------------------- 4 | 5 | # V.env name 6 | name: XXXXXXXXXX 7 | 8 | # Conda channels (repositories) used to retrieve python packages 9 | channels: 10 | - defaults 11 | 12 | # Dependencies 13 | dependencies: 14 | # From conda repositories (channels) 15 | - python=3.7.6 16 | - pip=20.2.2 # <-- pip: python package manager 17 | - pip: 18 | # Python packages managed with Pip instead of Conda 19 | - mlflow==1.15.0 -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/mlproject/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | import pandas as pd 4 | 5 | 6 | def parse_args(argv): 7 | """ 8 | Parses python command line input arguments (defined in MLproject file at command section) 9 | """ 10 | parser = argparse.ArgumentParser(description='Mlflow example') 11 | parser.add_argument('--training-data', type=str, help='training data set in csv') 12 | return parser.parse_args(argv) 13 | 14 | 15 | def main(argv): 16 | """ Data """ 17 | args = parse_args(argv) # mandatory 18 | pd_data = pd.read_csv(args.training_data) # mandatory 19 | 
20 | """ Tracking """ 21 | with mlflow.start_run() as run: 22 | print('MFlown run {}'.format(run.info)) 23 | 24 | 25 | if __name__ == '__main__': 26 | main(sys.argv[1:]) 27 | -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/spark_inference/conda.yaml: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------------------------------------------- 2 | # Python V. env to be used in pySpark in order to make predictions with a python_flavour MlFlow model 3 | # 4 | # Example: 5 | # name: mlflow-env 6 | # channels: 7 | # - defaults 8 | # - conda-forge 9 | # dependencies: 10 | # - python=3.7.6 11 | # - pip=20.2.2 12 | # - pip: 13 | # - mlflow==1.15.0 14 | # - scikit-learn==0.22.1 15 | # - cloudpickle==2.0.0 16 | # - pyarrow==5.0.0 17 | # 18 | # Note: 19 | # · Pre-requisite: binary distribution of Spark (SPARK_HOME env. 
var pointing to spark directory) 20 | # · pyspark and py4j dependencies are provided at runtime (getting them from Spark binary distribution) 21 | # · pyarrow is necessary in order to use Pandas UDF in Spark --> Mlflow do not include in it MLModel conda.yaml 22 | # ----------------------------------------------------------------------------------------------------------------- 23 | -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/spark_inference/model/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/mlflow_mlprojects/mlflow-devEnv-archetype/spark_inference/model/.keep -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/utils/install_conda.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # · Install conda 4.8.3 4 | CONDA_DIR=/opt/conda 5 | 6 | curl -LO http://repo.continuum.io/miniconda/Miniconda3-py37_4.8.3-Linux-x86_64.sh 7 | mkdir -p $CONDA_DIR 8 | bash Miniconda3-py37_4.8.3-Linux-x86_64.sh -f -b -p $CONDA_DIR 9 | rm Miniconda3-py37_4.8.3-Linux-x86_64.sh 10 | conda install --quiet --yes conda==4.8.3 11 | 12 | # · Configuring conda 13 | conda config --system --set auto_update_conda false 14 | -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/venvs/launcher/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/mlflow_mlprojects/mlflow-devEnv-archetype/venvs/launcher/.keep -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/venvs/mlproject/.keep: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/mlflow_mlprojects/mlflow-devEnv-archetype/venvs/mlproject/.keep -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-archetype/venvs/spark_inference/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/mlflow_mlprojects/mlflow-devEnv-archetype/venvs/spark_inference/.keep -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/mlflow_mlprojects/mlflow-devEnv-custom-model/__init__.py -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/clean_venvs_mlflow_runs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | WD="$(dirname "$(readlink -f "$0")")" 4 | VENVS_PATH=$WD/venvs 5 | MLFLOW_RUNS_PATH=$WD/mlflow_runs 6 | 7 | read -p "Executing rm -rf ${MLFLOW_RUNS_PATH}/* - (y/n)?" choice 8 | case "$choice" in 9 | y | Y) 10 | rm -rf "${MLFLOW_RUNS_PATH:?}"/* 11 | rm -rf "${MLFLOW_RUNS_PATH:?}"/.trash 12 | ;; 13 | *) 14 | echo "Skipping" 15 | ;; 16 | esac 17 | 18 | read -p "Executing rm -rf ${VENVS_PATH}/{launcher & mlproject & spark_inference}/* - (y/n)?" 
choice 19 | case "$choice" in 20 | y | Y) 21 | rm -rf "${VENVS_PATH:?}"/launcher/* 22 | rm -rf "${VENVS_PATH:?}"/mlproject/* 23 | rm -rf "${VENVS_PATH:?}"/spark_inference/* 24 | ;; 25 | *) 26 | echo "Skipping" 27 | ;; 28 | esac 29 | 30 | -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/create_vEnv_mlproject_command.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | WD="$(dirname "$(readlink -f "$0")")" 4 | CONDA_YAML_PATH=$WD/mlproject/conda.yaml 5 | VENV_PATH=$WD/venvs/mlproject 6 | 7 | echo "Creating conda environment defined in ${CONDA_YAML_PATH} at ${VENV_PATH}" 8 | 9 | conda env create -f "$CONDA_YAML_PATH" --prefix "$VENV_PATH" 10 | 11 | -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/create_vEnv_spark_inference.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | WD="$(dirname "$(readlink -f "$0")")" 4 | CONDA_YAML_PATH=$WD/spark_inference/conda.yaml 5 | VENV_PATH=$WD/venvs/spark_inference 6 | 7 | echo "Creating conda environment defined in ${CONDA_YAML_PATH} at ${VENV_PATH}" 8 | 9 | conda env create -f "$CONDA_YAML_PATH" --prefix "$VENV_PATH" 10 | 11 | -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/data/train.csv: -------------------------------------------------------------------------------- 1 | class 2 | a 3 | b 4 | d 5 | d -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/doc/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/mlflow_mlprojects/mlflow-devEnv-custom-model/doc/.keep 
-------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/launcher_mlproject_command.py: -------------------------------------------------------------------------------- 1 | import os 2 | import mlflow 3 | from mlflow.tracking import MlflowClient 4 | from mlproject import train 5 | 6 | # -------------------------------------------------------------------------------- 7 | # Note: This python script must be launched using the virtual environment 8 | # defined in MLproject through conda.yaml file: 9 | # 10 | # Note: This v.env has been pre-created at venvs/mlproject 11 | # -------------------------------------------------------------------------------- 12 | 13 | # Current directory 14 | wd = os.path.abspath(os.path.dirname(__file__)) 15 | 16 | # Reading data ~ Rocket setup 17 | input_csv = os.path.join(wd, 'data', 'train.csv') 18 | 19 | # Creating experiment ~ Mlflow launcher 20 | mlflow.set_tracking_uri("file://{}".format(os.path.join(wd, 'mlflow_runs/'))) 21 | client = MlflowClient( 22 | tracking_uri="file://{}".format(os.path.join(wd, 'mlflow_runs')) 23 | ) 24 | experiment_name = "local" 25 | if not client.get_experiment_by_name(experiment_name): 26 | client.create_experiment(experiment_name) 27 | 28 | 29 | # Executing command defined in MLproject file ~ Mlflow launcher 30 | train.main( 31 | [ 32 | "--training_data={}".format(input_csv), 33 | "--feature_column_name={}".format("class"), 34 | "--prediction_column_name={}".format("prediction") 35 | ] 36 | ) 37 | -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/launcher_spark_inference.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | # => Working directory 5 | wd = os.path.abspath(os.path.dirname(__file__)) 6 | 7 | # => Setting spark environment ~ Rocket integration 8 | if not os.getenv('SPARK_HOME'): 9 
| os.environ['SPARK_HOME'] = "/home/asoriano/workspace/software/stratio-spark-distribution-3.1.1-1.2.0-766b881-bin" 10 | spark_home = os.environ.get('SPARK_HOME', None) 11 | # Add pyspark and py4j to path. 12 | sys.path.insert(0, spark_home + "/python") 13 | sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.9-src.zip')) 14 | 15 | from pyspark.sql import SparkSession 16 | import mlflow 17 | 18 | # => Creating a pyspark session ~ Rocket integration 19 | spark = SparkSession.builder.master("local[*]")\ 20 | .appName("Debugging Spark-Mlflow integration") \ 21 | .config("spark.sql.execution.arrow.pyspark.enabled", "true") \ 22 | .getOrCreate() 23 | 24 | # => Reading data ~ Rocket integration 25 | df = spark.read.csv( 26 | path=os.path.join(wd, 'data', 'train.csv'), 27 | header=True, 28 | inferSchema=True 29 | ) 30 | 31 | # => Mlflow logged model path ~ Rocket integration 32 | modelDirPath = os.path.join(wd, 'spark_inference', 'model') 33 | # · Loading model 34 | loaded_model = mlflow.pyfunc.load_model(modelDirPath) 35 | 36 | 37 | # => Constructing UDF ~ Rocket integration 38 | # · We need input features, output column name and output column type 39 | features = None 40 | output_spark_schema = None 41 | 42 | # Try to use model signature to infer this parameters 43 | if loaded_model.metadata.signature: 44 | # Input features 45 | input_signature = loaded_model.metadata.signature.inputs 46 | features = [s.name for s in input_signature.inputs] 47 | # Output column name & type 48 | output_signature = loaded_model.metadata.signature.outputs 49 | output_spark_schema = output_signature.as_spark_schema() 50 | 51 | 52 | # · Input features 53 | if not features: 54 | features = [] 55 | print("Input features for UDF: {}".format(features)) 56 | 57 | # · Output column name & type 58 | if not output_spark_schema: 59 | predictionColumnName = "prediction" 60 | predictionColumnType = "string" 61 | else: 62 | print("Spark schema: {}".format(output_spark_schema)) 63 | 
predictionColumnName = output_spark_schema[0].name 64 | predictionColumnType = output_spark_schema[0].dataType 65 | 66 | print("Prediction column name for UDF: {}".format(predictionColumnName)) 67 | print("Prediction column type for UDF: {}".format(predictionColumnType)) 68 | 69 | prediction_udf = mlflow.pyfunc.spark_udf(spark, modelDirPath, result_type=predictionColumnType) 70 | 71 | # => Making predictions ~ Rocket integration 72 | predictionDf = df.withColumn(predictionColumnName, prediction_udf(*features)) 73 | predictionDf.show() 74 | 75 | -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/mlflow_runs/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/mlflow_mlprojects/mlflow-devEnv-custom-model/mlflow_runs/.keep -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/mlproject/MLproject: -------------------------------------------------------------------------------- 1 | name: test 2 | 3 | conda_env: conda.yaml 4 | 5 | entry_points: 6 | main: 7 | parameters: 8 | training_data: string 9 | feature_column_name: {type: string, default: class} 10 | prediction_column_name: {type: string, default: prediction} 11 | 12 | output_column_name: {type: string, default: prediction} 13 | output_column_type: {type: string, default: string } 14 | 15 | command: python train.py --training_data={training_data} --feature_column_name={feature_column_name} --prediction_column_name={prediction_column_name} -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/mlproject/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/mlflow_mlprojects/mlflow-devEnv-custom-model/mlproject/__init__.py -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/mlproject/conda.yaml: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------- 2 | # Definition file for a python Conda managed virtual environment 3 | # ----------------------------------------------------------------- 4 | 5 | # V.env name 6 | name: test 7 | 8 | # Conda channels (repositories) used to retrieve python packages 9 | channels: 10 | - defaults 11 | 12 | # Dependencies 13 | dependencies: 14 | # From conda repositories (channels) 15 | - python=3.7.6 16 | - pip=20.2.2 # <-- pip: python package manager 17 | - pip: 18 | # Python packages managed with Pip instead of Conda 19 | - mlflow==1.15.0 -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/mlproject/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | import pandas as pd 4 | import numpy as np 5 | import mlflow.pyfunc 6 | from mlflow.models.signature import infer_signature 7 | from mlflow.pyfunc import PythonModel 8 | 9 | 10 | def parse_args(argv): 11 | """ 12 | Parses python command line input arguments (defined in MLproject file at command section) 13 | """ 14 | parser = argparse.ArgumentParser(description='Mlflow example') 15 | parser.add_argument('--training_data', type=str, help='training data set in csv') 16 | parser.add_argument('--feature_column_name', type=str, help='') 17 | parser.add_argument('--prediction_column_name', type=str, help='') 18 | return parser.parse_args(argv) 19 | 20 | 21 | class CustomModel(PythonModel): 22 | 23 | def __init__(self, 
feature_col_name, prediction_col_name): 24 | self.feature_col_name = feature_col_name 25 | self.prediction_col_name = prediction_col_name 26 | 27 | def dummy_func(self, x): 28 | return "Dummy code - {}".format(str(x)) 29 | 30 | def predict(self, context, model_input): 31 | if isinstance(model_input, pd.DataFrame): 32 | return pd.DataFrame( 33 | np.vectorize(self.dummy_func)(model_input[self.feature_col_name]), columns=[self.prediction_col_name] 34 | ) 35 | else: 36 | raise TypeError("Only DataFrame input types are supported") 37 | 38 | 39 | def main(argv): 40 | """ Data """ 41 | args = parse_args(argv) 42 | pd_data = pd.read_csv(args.training_data) 43 | 44 | # Features 45 | X_train = pd_data[[args.feature_column_name]] 46 | 47 | """ Model """ 48 | model = CustomModel(args.feature_column_name, args.prediction_column_name) 49 | 50 | # Predictions 51 | y_pred = model.predict({}, X_train) 52 | 53 | signature = infer_signature(X_train, y_pred) 54 | print("Signature: {}".format(signature)) 55 | # print("Input as spark schema: {}".format(signature.inputs.as_spark_schema())) 56 | # print("Output as spark schema: {}".format(signature.outputs.as_spark_schema())) 57 | 58 | """ Tracking """ 59 | with mlflow.start_run() as run: 60 | print('MFlown run {}'.format(run.info)) 61 | mlflow.pyfunc.log_model("model", python_model=model, signature=signature) 62 | 63 | 64 | if __name__ == '__main__': 65 | main(sys.argv[1:]) 66 | -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/spark_inference/conda.yaml: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------------------------------------------- 2 | # Python V. 
env to be used in pySpark in order to make predictions with a python_flavour MlFlow model 3 | # 4 | # Example: 5 | # name: mlflow-env 6 | # channels: 7 | # - defaults 8 | # - conda-forge 9 | # dependencies: 10 | # - python=3.7.6 11 | # - pip=20.2.2 12 | # - pip: 13 | # - mlflow==1.15.0 14 | # - scikit-learn==0.22.1 15 | # - cloudpickle==2.0.0 16 | # - pyarrow==5.0.0 17 | # 18 | # Note: 19 | # · Pre-requisite: binary distribution of Spark (SPARK_HOME env. var pointing to spark directory) 20 | # · pyspark and py4j dependencies are provided at runtime (getting them from Spark binary distribution) 21 | # · pyarrow is necessary in order to use Pandas UDF in Spark --> Mlflow do not include in it MLModel conda.yaml 22 | # ----------------------------------------------------------------------------------------------------------------- 23 | 24 | name: mlflow-env 25 | 26 | channels: 27 | - defaults 28 | - conda-forge 29 | 30 | dependencies: 31 | - python=3.7.6 32 | - pip=20.2.2 33 | - pip: 34 | - mlflow==1.15.0 35 | - cloudpickle==2.0.0 36 | - pyarrow==5.0.0 -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/spark_inference/model/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/mlflow_mlprojects/mlflow-devEnv-custom-model/spark_inference/model/.keep -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/spark_inference/model/MLmodel: -------------------------------------------------------------------------------- 1 | artifact_path: model 2 | flavors: 3 | python_function: 4 | cloudpickle_version: 2.0.0 5 | env: conda.yaml 6 | loader_module: mlflow.pyfunc.model 7 | python_model: python_model.pkl 8 | python_version: 3.7.6 9 | run_id: c874e2ead8864acaa377d6252e03277b 10 | signature: 11 | inputs: 
'[{"name": "class", "type": "string"}]' 12 | outputs: '[{"name": "prediction", "type": "string"}]' 13 | utc_time_created: '2021-10-27 08:13:47.929231' 14 | -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/spark_inference/model/conda.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - defaults 3 | - conda-forge 4 | dependencies: 5 | - python=3.7.6 6 | - pip 7 | - pip: 8 | - mlflow 9 | - cloudpickle==2.0.0 10 | name: mlflow-env 11 | -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/spark_inference/model/python_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/mlflow_mlprojects/mlflow-devEnv-custom-model/spark_inference/model/python_model.pkl -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/utils/install_conda.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # · Install conda 4.8.3 4 | CONDA_DIR=/opt/conda 5 | 6 | curl -LO http://repo.continuum.io/miniconda/Miniconda3-py37_4.8.3-Linux-x86_64.sh 7 | mkdir -p $CONDA_DIR 8 | bash Miniconda3-py37_4.8.3-Linux-x86_64.sh -f -b -p $CONDA_DIR 9 | rm Miniconda3-py37_4.8.3-Linux-x86_64.sh 10 | conda install --quiet --yes conda==4.8.3 11 | 12 | # · Configuring conda 13 | conda config --system --set auto_update_conda false 14 | -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/venvs/launcher/.keep: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/mlflow_mlprojects/mlflow-devEnv-custom-model/venvs/launcher/.keep -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/venvs/mlproject/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/mlflow_mlprojects/mlflow-devEnv-custom-model/venvs/mlproject/.keep -------------------------------------------------------------------------------- /mlflow_mlprojects/mlflow-devEnv-custom-model/venvs/spark_inference/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/mlflow_mlprojects/mlflow-devEnv-custom-model/venvs/spark_inference/.keep -------------------------------------------------------------------------------- /rocket-extensions/Readme.md: -------------------------------------------------------------------------------- 1 | # Rocket extensions 2 | 3 | ## UDF 4 | 5 | // TODO 6 | 7 | ## Plugins 8 | 9 | - Legacy versions (Documentation in branches older than 9.5 links it) 10 | - /old-extensions/input-lite-xd 11 | - /old-extensions/output-lite-xd 12 | - /old-extensions/transformation-lite-xd 13 | 14 | **Note: plugins generated with new SDK versions must be included in new folders:** 15 | 16 | - rocket-1.0.0-SDK 17 | - Custom Input: GeneratorXDLiteInputStepBatch & GeneratorXDLiteInputStepStreaming 18 | - Custom Transform: RepartitionXDLiteTransformStepBatch & RepartitionXDLiteTransformStepStreaming 19 | - Custom Transform: TokenizerTransformStepBatch & TokenizerTransformStepStreaming 20 | - Custom Output: LoggerXDLiteOutputStep 21 | 22 | - rocket-1.1.0-SDK 23 | - New functionality: metadata management 24 | - New step: 
MetadataTestXDLiteInputStepBatch 25 | 26 | - rocket-2.2.0-SDK 27 | - New functionality: Execution report logs in custom steps 28 | - New steps: ReportLogTestXDLiteInputStepBatch & ReportLogTestXDLiteInputStepStreaming 29 | - Added reporting in: 30 | - GeneratorXDLiteInputStepBatch & GeneratorXDLiteInputStepStreaming 31 | - TokenizerTransformStepBatch & TokenizerTransformStepStreaming 32 | - LoggerXDLiteOutputStep 33 | 34 | - rocket-2.3.0-SDK 35 | - New functionality: Lineage and QRs definition in custom steps 36 | - New steps: JdbcWithLineageXDLiteOutputStep 37 | 38 | - rocket-3.0.0-SDK 39 | - New functionality: Hybrid custom steps 40 | -------------------------------------------------------------------------------- /rocket-extensions/old-extensions/input-lite-xd/src/main/scala/com/stratio/sparta/GeneratorXDLiteInputStepBatch.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | package com.stratio.sparta 7 | 8 | import com.stratio.sparta.sdk.lite.batch.models._ 9 | import com.stratio.sparta.sdk.lite.validation.ValidationResult 10 | import com.stratio.sparta.sdk.lite.xd.batch._ 11 | import org.apache.spark.sql._ 12 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 13 | import org.apache.spark.sql.crossdata.XDSession 14 | import org.apache.spark.sql.types._ 15 | 16 | class GeneratorXDLiteInputStepBatch( 17 | xdSession: XDSession, 18 | properties: Map[String, String] 19 | ) 20 | extends LiteCustomXDBatchInput(xdSession, properties) { 21 | 22 | lazy val stringSchema = StructType(Seq(StructField("raw", StringType))) 23 | lazy val rawData: Option[String] = properties.get("raw").map(_.toString) 24 | 25 | 26 | override def validate(): ValidationResult = { 27 | var validation = ValidationResult(valid = true, messages = Seq.empty) 28 | 29 | if (rawData.isEmpty) { 30 | validation = ValidationResult( 31 | valid = false, 32 | messages = validation.messages :+ "Test data must be set inside the Option properties with an option key named 'raw'") 33 | } 34 | 35 | if (rawData.map(_.trim).forall(_.isEmpty)) { 36 | validation = ValidationResult( 37 | valid = false, 38 | messages = validation.messages :+ "Generated data cannot be an empty string") 39 | } 40 | validation 41 | } 42 | 43 | override def init(): ResultBatchData = { 44 | val register = Seq(new GenericRowWithSchema(Array("test-data"), stringSchema).asInstanceOf[Row]) 45 | val defaultRDD = xdSession.sparkContext.parallelize(register) 46 | 47 | ResultBatchData(defaultRDD, Option(stringSchema)) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /rocket-extensions/old-extensions/input-lite-xd/src/main/scala/com/stratio/sparta/GeneratorXDLiteInputStepStreaming.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. 
All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta 7 | 8 | import org.apache.spark.sql._ 9 | import org.apache.spark.streaming.dstream.DStream 10 | import org.apache.spark.rdd.RDD 11 | import org.apache.spark.sql.functions._ 12 | import com.stratio.sparta.sdk.lite.xd.streaming._ 13 | import com.stratio.sparta.sdk.lite.streaming.models._ 14 | import org.apache.spark.sql.crossdata.XDSession 15 | 16 | import scala.util.{Failure, Success, Try} 17 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 18 | import org.apache.spark.sql.types._ 19 | import org.apache.spark.streaming.StreamingContext 20 | 21 | import scala.collection.mutable 22 | 23 | class GeneratorXDLiteInputStepStreaming( 24 | xdSession: XDSession, 25 | streamingContext: StreamingContext, 26 | properties: Map[String, String] 27 | ) 28 | extends LiteCustomXDStreamingInput(xdSession, streamingContext, properties) { 29 | 30 | lazy val stringSchema = StructType(Seq(StructField("raw", StringType))) 31 | 32 | override def init(): ResultStreamingData = { 33 | val dataQueue = new mutable.Queue[RDD[Row]]() 34 | val register = Seq(new GenericRowWithSchema(Array("test-data"), stringSchema).asInstanceOf[Row]) 35 | dataQueue += xdSession.sparkContext.parallelize(register) 36 | val stream = streamingContext.queueStream(dataQueue) 37 | 38 | ResultStreamingData(stream, Option(stringSchema)) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- 
/rocket-extensions/old-extensions/output-lite-xd/src/main/scala/com/stratio/sparta/LoggerXDLiteOutputStep.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta 7 | 8 | import com.stratio.sparta.sdk.lite.common.models.OutputOptions 9 | import com.stratio.sparta.sdk.lite.xd.common._ 10 | import org.apache.spark.sql._ 11 | import org.apache.spark.sql.crossdata.XDSession 12 | 13 | import scala.util.{Failure, Success, Try} 14 | 15 | class LoggerXDLiteOutputStep( 16 | xdSession: XDSession, 17 | properties: Map[String, String] 18 | ) 19 | extends LiteCustomXDOutput(xdSession, properties) { 20 | 21 | lazy val metadataEnabled = properties.get("metadataEnabled") match { 22 | case Some(value: String) => Try(value.toBoolean) match { 23 | case Success(v) => v 24 | case Failure(ex) => throw new IllegalStateException(s"$value not parseable to boolean", ex) 25 | } 26 | case None => throw new IllegalStateException("'metadataEnabled' key must be defined in the Option properties") 27 | } 28 | 29 | override def save(data: DataFrame, outputOptions: OutputOptions): Unit = { 30 | val tableName = outputOptions.tableName.getOrElse{ 31 | logger.error("Table name not defined") 32 | throw new NoSuchElementException("tableName not found in options")} 33 | 34 | if (metadataEnabled){ 35 | logger.info(s"Table name: $tableName") 36 | logger.info(s"Save mode is set to ${outputOptions.saveMode}") 37 | } 38 | data.foreach{ row => 39 | 
println(row.mkString(",")) 40 | } 41 | } 42 | 43 | override def save(data: DataFrame, saveMode: String, saveOptions: Map[String, String]): Unit = () 44 | } 45 | -------------------------------------------------------------------------------- /rocket-extensions/old-extensions/transformation-lite-xd/src/main/scala/com/stratio/sparta/properties/ValidatePropertiesMap.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta.properties 7 | 8 | class ValidatePropertiesMap [K, V](val m: Map[K, V]){ 9 | def getString(key: K): String = 10 | m.get(key) match { 11 | case Some(value: String) => value 12 | case Some(value) => value.toString 13 | case None => 14 | throw new IllegalStateException(s"$key is mandatory") 15 | } 16 | 17 | def notBlank(option: Option[String]): Boolean = 18 | option.map(_.trim).forall(_.isEmpty) 19 | } 20 | 21 | class NotBlankOption(s: Option[String]) { 22 | def notBlank: Option[String] = s.map(_.trim).filterNot(_.isEmpty) 23 | } 24 | 25 | object ValidatingPropertyMap{ 26 | implicit def map2ValidatingPropertyMap[K, V](m: Map[K, V]): ValidatePropertiesMap[K, V] = 27 | new ValidatePropertiesMap[K, V](m) 28 | 29 | implicit def option2NotBlankOption(s: Option[String]): NotBlankOption = new NotBlankOption(s) 30 | } -------------------------------------------------------------------------------- 
/rocket-extensions/old-extensions/transformation-lite-xd/src/main/scala/com/stratio/sparta/transformations/repartition/RepartitionXDLiteTransformStepBatch.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta.transformations.repartition 7 | 8 | import java.io.{Serializable => JSerializable} 9 | 10 | import com.stratio.sparta.sdk.lite.batch.models._ 11 | import com.stratio.sparta.sdk.lite.xd.batch._ 12 | import org.apache.spark.sql.crossdata.XDSession 13 | 14 | class RepartitionXDLiteTransformStepBatch( 15 | xdSession: XDSession, 16 | properties: Map[String, String] 17 | ) extends LiteCustomXDBatchTransform(xdSession, properties) { 18 | 19 | override def transform(inputData: Map[String, ResultBatchData]): OutputBatchTransformData = { 20 | val inputStream = inputData.head._2.data 21 | 22 | OutputBatchTransformData(inputStream.repartition(5)) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /rocket-extensions/old-extensions/transformation-lite-xd/src/main/scala/com/stratio/sparta/transformations/repartition/RepartitionXDLiteTransformStepStreaming.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 
3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta.transformations.repartition 7 | 8 | import com.stratio.sparta.sdk.lite.streaming.models._ 9 | import com.stratio.sparta.sdk.lite.xd.streaming._ 10 | import org.apache.spark.sql.crossdata.XDSession 11 | import org.apache.spark.streaming.StreamingContext 12 | 13 | class RepartitionXDLiteTransformStepStreaming( 14 | xdSession: XDSession, 15 | streamingContext: StreamingContext, 16 | properties: Map[String, String] 17 | ) extends LiteCustomXDStreamingTransform(xdSession, streamingContext, properties) { 18 | 19 | override def transform(inputData: Map[String, ResultStreamingData]): OutputStreamingTransformData = { 20 | val newStream = inputData.head._2.data.transform { rdd => 21 | rdd.repartition(5) 22 | } 23 | 24 | OutputStreamingTransformData(newStream) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /rocket-extensions/old-extensions/transformation-lite-xd/src/test/scala/com/stratio/sparta/transformations/tokenizer/TokenizerTransformStepBatchTest.scala: -------------------------------------------------------------------------------- 1 | package com.stratio.sparta.transformations.tokenizer 2 | 3 | import com.stratio.sparta.sdk.lite.batch.models.ResultBatchData 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.sql.Row 6 | import org.apache.spark.sql.crossdata.XDSession 7 | import org.apache.spark.sql.types.{StringType, StructField, StructType} 8 | import org.apache.spark.{SparkConf, SparkContext} 9 | import org.junit.runner.RunWith 10 | import 
org.scalatest.junit.JUnitRunner 11 | import org.scalatest.{BeforeAndAfterAll, FlatSpec, Matchers} 12 | 13 | @RunWith(classOf[JUnitRunner]) 14 | class TokenizerTransformStepBatchTest extends FlatSpec with Matchers with BeforeAndAfterAll { 15 | val sc = SparkContext.getOrCreate(new SparkConf().setAppName("test").setMaster("local[1]")) 16 | val xdSession: XDSession = XDSession.builder().config(sc.getConf).create("dummyUser") 17 | 18 | val names = "jose,perez" 19 | val inputField = "raw" 20 | val inputSchema = StructType(Seq(StructField(inputField, StringType))) 21 | val dataIn: RDD[Row] = sc.parallelize(Seq(Row(names))) 22 | 23 | val properties = Map( 24 | "charPattern" -> ",", 25 | "inputField" -> "raw", 26 | "outputField1" -> "firstName", 27 | "outputField2" -> "lastName" 28 | ) 29 | 30 | val inBatch = ResultBatchData(dataIn, Option(inputSchema)) 31 | val tokenizer = new TokenizerTransformStepBatch(xdSession, properties) 32 | 33 | "TokenizerTransformStepBatch" should "tokenize the data in and return two values" in { 34 | val result = tokenizer.transform(Map("custom-transform" -> inBatch)).data.first().toSeq 35 | 36 | result.size shouldBe 2 37 | } 38 | } -------------------------------------------------------------------------------- /rocket-extensions/old-extensions/udf/src/main/scala/com/stratio/sparta/SpartaExampleUDFs.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | 7 | package com.stratio.sparta 8 | 9 | import com.stratio.sparta.sdk.lite.common.SpartaUDF 10 | import org.apache.spark.sql.catalyst.ScalaReflection 11 | import org.apache.spark.sql.expressions.UserDefinedFunction 12 | import org.apache.spark.sql.types.StringType 13 | 14 | import scala.util.Try 15 | 16 | case class ToUpperCaseUDF() extends SpartaUDF { 17 | 18 | val name = "uppercaseSparta" 19 | 20 | val upper: String => String = _.toUpperCase 21 | 22 | val userDefinedFunction: UserDefinedFunction = 23 | UserDefinedFunction(upper , StringType, Option(Seq(StringType))) 24 | } 25 | 26 | case class ConcatUDF() extends SpartaUDF { 27 | 28 | val name = "concatSparta" 29 | 30 | val upper: (String, String) => String = { case (str1, str2) => 31 | s"$str1/$str2" 32 | } 33 | 34 | val userDefinedFunction: UserDefinedFunction = 35 | UserDefinedFunction(upper , StringType, Option(Seq(StringType, StringType))) 36 | } 37 | 38 | case class ToUpperCaseWithReflectionUDF() extends SpartaUDF { 39 | 40 | val name = "upperCaseReflect" 41 | 42 | val upper: String => String = _.toUpperCase 43 | 44 | val userDefinedFunction: UserDefinedFunction = { 45 | val inputTypes = Try(ScalaReflection.schemaFor(ScalaReflection.localTypeOf[String]).dataType :: Nil).toOption 46 | UserDefinedFunction(upper , ScalaReflection.schemaFor(ScalaReflection.localTypeOf[String]).dataType, inputTypes) 47 | } 48 | } -------------------------------------------------------------------------------- /rocket-extensions/old-extensions/udf/src/main/scala/org/apache/spark/sql/sparta/udf/GetDenseVectorUDF.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 
3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | 7 | package org.apache.spark.sql.sparta.udf 8 | 9 | import com.stratio.sparta.sdk.lite.common.SpartaUDF 10 | import org.apache.spark.sql.expressions.UserDefinedFunction 11 | import org.apache.spark.sql.types.{DoubleType, IntegerType} 12 | import org.apache.spark.ml.linalg.VectorUDT 13 | 14 | case object VectorUDT extends VectorUDT 15 | 16 | case class GetDenseVectorUDF() extends SpartaUDF { 17 | 18 | val name = "get_vector_ith_element" 19 | 20 | val getVectorElement = (vector: org.apache.spark.ml.linalg.Vector, num: Int) => vector(num) 21 | 22 | val userDefinedFunction: UserDefinedFunction = 23 | UserDefinedFunction(getVectorElement , DoubleType, Option(Seq(VectorUDT, IntegerType))) 24 | } 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-1.0.0-SDK/input-lite-xd/src/main/scala/com/stratio/sparta/GeneratorXDLiteInputStepBatch.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | package com.stratio.sparta 7 | 8 | import com.stratio.sparta.sdk.lite.batch.models._ 9 | import com.stratio.sparta.sdk.lite.validation.ValidationResult 10 | import com.stratio.sparta.sdk.lite.xd.batch._ 11 | import org.apache.spark.sql._ 12 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 13 | import org.apache.spark.sql.crossdata.XDSession 14 | import org.apache.spark.sql.types._ 15 | 16 | class GeneratorXDLiteInputStepBatch( 17 | xdSession: XDSession, 18 | properties: Map[String, String] 19 | ) 20 | extends LiteCustomXDBatchInput(xdSession, properties) { 21 | 22 | lazy val stringSchema = StructType(Seq(StructField("raw", StringType))) 23 | lazy val rawData: Option[String] = properties.get("raw").map(_.toString) 24 | 25 | 26 | override def validate(): ValidationResult = { 27 | var validation = ValidationResult(valid = true, messages = Seq.empty) 28 | 29 | if (rawData.isEmpty) { 30 | validation = ValidationResult( 31 | valid = false, 32 | messages = validation.messages :+ "Test data must be set inside the Option properties with an option key named 'raw'") 33 | } 34 | 35 | if (rawData.map(_.trim).forall(_.isEmpty)) { 36 | validation = ValidationResult( 37 | valid = false, 38 | messages = validation.messages :+ "Generated data cannot be an empty string") 39 | } 40 | validation 41 | } 42 | 43 | override def init(): ResultBatchData = { 44 | val register = Seq(new GenericRowWithSchema(Array(rawData.getOrElse("test-data")), stringSchema).asInstanceOf[Row]) 45 | val defaultRDD = xdSession.sparkContext.parallelize(register) 46 | 47 | ResultBatchData(defaultRDD, Option(stringSchema)) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-1.0.0-SDK/input-lite-xd/src/main/scala/com/stratio/sparta/GeneratorXDLiteInputStepStreaming.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en 
España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta 7 | 8 | import org.apache.spark.sql._ 9 | import org.apache.spark.streaming.dstream.DStream 10 | import org.apache.spark.rdd.RDD 11 | import org.apache.spark.sql.functions._ 12 | import com.stratio.sparta.sdk.lite.xd.streaming._ 13 | import com.stratio.sparta.sdk.lite.streaming.models._ 14 | import com.stratio.sparta.sdk.lite.validation.ValidationResult 15 | import org.apache.spark.sql.crossdata.XDSession 16 | 17 | import scala.util.{Failure, Success, Try} 18 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 19 | import org.apache.spark.sql.types._ 20 | import org.apache.spark.streaming.StreamingContext 21 | 22 | import scala.collection.mutable 23 | 24 | class GeneratorXDLiteInputStepStreaming( 25 | xdSession: XDSession, 26 | streamingContext: StreamingContext, 27 | properties: Map[String, String] 28 | ) 29 | extends LiteCustomXDStreamingInput(xdSession, streamingContext, properties) { 30 | 31 | lazy val stringSchema = StructType(Seq(StructField("raw", StringType))) 32 | lazy val rawData: Option[String] = properties.get("raw").map(_.toString) 33 | 34 | 35 | override def validate(): ValidationResult = { 36 | var validation = ValidationResult(valid = true, messages = Seq.empty) 37 | 38 | if (rawData.isEmpty) { 39 | validation = ValidationResult( 40 | valid = false, 41 | messages = validation.messages :+ "Test data must be set inside the Option properties with an option key named 'raw'") 42 | } 43 | 44 | if (rawData.map(_.trim).forall(_.isEmpty)) { 45 | validation = 
ValidationResult( 46 | valid = false, 47 | messages = validation.messages :+ "Generated data cannot be an empty string") 48 | } 49 | validation 50 | } 51 | 52 | override def init(): ResultStreamingData = { 53 | val dataQueue = new mutable.Queue[RDD[Row]]() 54 | val register = Seq(new GenericRowWithSchema(Array(rawData.getOrElse("test-data")), stringSchema).asInstanceOf[Row]) 55 | dataQueue += xdSession.sparkContext.parallelize(register) 56 | val stream = streamingContext.queueStream(dataQueue) 57 | 58 | ResultStreamingData(stream, Option(stringSchema)) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-1.0.0-SDK/output-lite-xd/src/main/scala/com/stratio/sparta/LoggerXDLiteOutputStep.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | package com.stratio.sparta 7 | 8 | import com.stratio.sparta.sdk.lite.common.models.OutputOptions 9 | import com.stratio.sparta.sdk.lite.xd.common._ 10 | import org.apache.spark.sql._ 11 | import org.apache.spark.sql.crossdata.XDSession 12 | 13 | import scala.util.{Failure, Success, Try} 14 | 15 | class LoggerXDLiteOutputStep( 16 | xdSession: XDSession, 17 | properties: Map[String, String] 18 | ) 19 | extends LiteCustomXDOutput(xdSession, properties) { 20 | 21 | lazy val metadataEnabled = properties.get("metadataEnabled") match { 22 | case Some(value: String) => Try(value.toBoolean) match { 23 | case Success(v) => v 24 | case Failure(ex) => throw new IllegalStateException(s"$value not parseable to boolean", ex) 25 | } 26 | case None => throw new IllegalStateException("'metadataEnabled' key must be defined in the Option properties") 27 | } 28 | 29 | override def save(data: DataFrame, outputOptions: OutputOptions): Unit = { 30 | val tableName = outputOptions.tableName.getOrElse{ 31 | logger.error("Table name not defined") 32 | throw new NoSuchElementException("tableName not found in options")} 33 | 34 | if (metadataEnabled){ 35 | logger.info(s"Table name: $tableName") 36 | logger.info(s"Save mode is set to ${outputOptions.saveMode}") 37 | } 38 | data.foreach{ row => 39 | println(row.mkString(",")) 40 | } 41 | } 42 | 43 | override def save(data: DataFrame, saveMode: String, saveOptions: Map[String, String]): Unit = () 44 | } 45 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-1.0.0-SDK/packaged-jars/custom-lite-input-xd-1.0.0-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-extensions/rocket-1.0.0-SDK/packaged-jars/custom-lite-input-xd-1.0.0-SNAPSHOT.jar -------------------------------------------------------------------------------- 
/rocket-extensions/rocket-1.0.0-SDK/packaged-jars/custom-lite-output-xd-1.0.0-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-extensions/rocket-1.0.0-SDK/packaged-jars/custom-lite-output-xd-1.0.0-SNAPSHOT.jar -------------------------------------------------------------------------------- /rocket-extensions/rocket-1.0.0-SDK/packaged-jars/custom-lite-transformation-xd-1.0.0-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-extensions/rocket-1.0.0-SDK/packaged-jars/custom-lite-transformation-xd-1.0.0-SNAPSHOT.jar -------------------------------------------------------------------------------- /rocket-extensions/rocket-1.0.0-SDK/transformation-lite-xd/src/main/scala/com/stratio/sparta/properties/ValidatePropertiesMap.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | package com.stratio.sparta.properties 7 | 8 | class ValidatePropertiesMap [K, V](val m: Map[K, V]){ 9 | def getString(key: K): String = 10 | m.get(key) match { 11 | case Some(value: String) => value 12 | case Some(value) => value.toString 13 | case None => 14 | throw new IllegalStateException(s"$key is mandatory") 15 | } 16 | 17 | def notBlank(option: Option[String]): Boolean = // true iff defined and non-blank after trimming (was inverted: forall(_.isEmpty)) 18 | option.map(_.trim).exists(_.nonEmpty) 19 | } 20 | 21 | class NotBlankOption(s: Option[String]) { 22 | def notBlank: Option[String] = s.map(_.trim).filterNot(_.isEmpty) 23 | } 24 | 25 | object ValidatingPropertyMap{ 26 | implicit def map2ValidatingPropertyMap[K, V](m: Map[K, V]): ValidatePropertiesMap[K, V] = 27 | new ValidatePropertiesMap[K, V](m) 28 | 29 | implicit def option2NotBlankOption(s: Option[String]): NotBlankOption = new NotBlankOption(s) 30 | } -------------------------------------------------------------------------------- /rocket-extensions/rocket-1.0.0-SDK/transformation-lite-xd/src/main/scala/com/stratio/sparta/transformations/repartition/RepartitionXDLiteTransformStepBatch.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | package com.stratio.sparta.transformations.repartition 7 | 8 | import java.io.{Serializable => JSerializable} 9 | 10 | import com.stratio.sparta.sdk.lite.batch.models._ 11 | import com.stratio.sparta.sdk.lite.xd.batch._ 12 | import org.apache.spark.sql.crossdata.XDSession 13 | 14 | class RepartitionXDLiteTransformStepBatch( 15 | xdSession: XDSession, 16 | properties: Map[String, String] 17 | ) extends LiteCustomXDBatchTransform(xdSession, properties) { 18 | 19 | override def transform(inputData: Map[String, ResultBatchData]): OutputBatchTransformData = { 20 | val inputStream = inputData.head._2.data 21 | 22 | OutputBatchTransformData(inputStream.repartition(5)) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-1.0.0-SDK/transformation-lite-xd/src/main/scala/com/stratio/sparta/transformations/repartition/RepartitionXDLiteTransformStepStreaming.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | package com.stratio.sparta.transformations.repartition 7 | 8 | import com.stratio.sparta.sdk.lite.streaming.models._ 9 | import com.stratio.sparta.sdk.lite.xd.streaming._ 10 | import org.apache.spark.sql.crossdata.XDSession 11 | import org.apache.spark.streaming.StreamingContext 12 | 13 | class RepartitionXDLiteTransformStepStreaming( 14 | xdSession: XDSession, 15 | streamingContext: StreamingContext, 16 | properties: Map[String, String] 17 | ) extends LiteCustomXDStreamingTransform(xdSession, streamingContext, properties) { 18 | 19 | override def transform(inputData: Map[String, ResultStreamingData]): OutputStreamingTransformData = { 20 | val newStream = inputData.head._2.data.transform { rdd => 21 | rdd.repartition(5) 22 | } 23 | 24 | OutputStreamingTransformData(newStream) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-1.0.0-SDK/transformation-lite-xd/src/test/scala/com/stratio/sparta/transformations/tokenizer/TokenizerTransformStepBatchTest.scala: -------------------------------------------------------------------------------- 1 | package com.stratio.sparta.transformations.tokenizer 2 | 3 | import com.stratio.sparta.sdk.lite.batch.models.ResultBatchData 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.sql.Row 6 | import org.apache.spark.sql.crossdata.XDSession 7 | import org.apache.spark.sql.types.{StringType, StructField, StructType} 8 | import org.apache.spark.{SparkConf, SparkContext} 9 | import org.junit.runner.RunWith 10 | import org.scalatest.junit.JUnitRunner 11 | import org.scalatest.{BeforeAndAfterAll, FlatSpec, Matchers} 12 | 13 | @RunWith(classOf[JUnitRunner]) 14 | class TokenizerTransformStepBatchTest extends FlatSpec with Matchers with BeforeAndAfterAll { 15 | val sc = SparkContext.getOrCreate(new SparkConf().setAppName("test").setMaster("local[1]")) 16 | val xdSession: XDSession = XDSession.builder().config(sc.getConf).create("dummyUser") 17 | 
18 | val names = "jose,perez" 19 | val inputField = "raw" 20 | val inputSchema = StructType(Seq(StructField(inputField, StringType))) 21 | val dataIn: RDD[Row] = sc.parallelize(Seq(Row(names))) 22 | 23 | val properties = Map( 24 | "charPattern" -> ",", 25 | "inputField" -> "raw", 26 | "outputField1" -> "firstName", 27 | "outputField2" -> "lastName" 28 | ) 29 | 30 | val inBatch = ResultBatchData(dataIn, Option(inputSchema)) 31 | val tokenizer = new TokenizerTransformStepBatch(xdSession, properties) 32 | 33 | "TokenizerTransformStepBatch" should "tokenize the data in and return two values" in { 34 | val result = tokenizer.transform(Map("custom-transform" -> inBatch)).data.first().toSeq 35 | 36 | result.size shouldBe 2 37 | } 38 | } -------------------------------------------------------------------------------- /rocket-extensions/rocket-1.1.0-SDK/input-lite-xd/src/main/scala/com/stratio/sparta/GeneratorXDLiteInputStepBatch.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | package com.stratio.sparta 7 | 8 | import com.stratio.sparta.sdk.lite.batch.models._ 9 | import com.stratio.sparta.sdk.lite.validation.ValidationResult 10 | import com.stratio.sparta.sdk.lite.xd.batch._ 11 | import org.apache.spark.sql._ 12 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 13 | import org.apache.spark.sql.crossdata.XDSession 14 | import org.apache.spark.sql.types._ 15 | 16 | class GeneratorXDLiteInputStepBatch( 17 | xdSession: XDSession, 18 | properties: Map[String, String] 19 | ) 20 | extends LiteCustomXDBatchInput(xdSession, properties) { 21 | 22 | lazy val stringSchema = StructType(Seq(StructField("raw", StringType))) 23 | lazy val rawData: Option[String] = properties.get("raw").map(_.toString) 24 | 25 | 26 | override def validate(): ValidationResult = { 27 | var validation = ValidationResult(valid = true, messages = Seq.empty) 28 | 29 | if (rawData.isEmpty) { 30 | validation = ValidationResult( 31 | valid = false, 32 | messages = validation.messages :+ "Test data must be set inside the Option properties with an option key named 'raw'") 33 | } 34 | 35 | if (rawData.map(_.trim).forall(_.isEmpty)) { 36 | validation = ValidationResult( 37 | valid = false, 38 | messages = validation.messages :+ "Generated data cannot be an empty string") 39 | } 40 | validation 41 | } 42 | 43 | override def init(): ResultBatchData = { 44 | val register = Seq(new GenericRowWithSchema(Array(rawData.getOrElse("test-data")), stringSchema).asInstanceOf[Row]) 45 | val defaultRDD = xdSession.sparkContext.parallelize(register) 46 | 47 | ResultBatchData(defaultRDD, Option(stringSchema)) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-1.1.0-SDK/input-lite-xd/src/main/scala/com/stratio/sparta/GeneratorXDLiteInputStepStreaming.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en 
España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta 7 | 8 | import org.apache.spark.sql._ 9 | import org.apache.spark.streaming.dstream.DStream 10 | import org.apache.spark.rdd.RDD 11 | import org.apache.spark.sql.functions._ 12 | import com.stratio.sparta.sdk.lite.xd.streaming._ 13 | import com.stratio.sparta.sdk.lite.streaming.models._ 14 | import com.stratio.sparta.sdk.lite.validation.ValidationResult 15 | import org.apache.spark.sql.crossdata.XDSession 16 | 17 | import scala.util.{Failure, Success, Try} 18 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 19 | import org.apache.spark.sql.types._ 20 | import org.apache.spark.streaming.StreamingContext 21 | 22 | import scala.collection.mutable 23 | 24 | class GeneratorXDLiteInputStepStreaming( 25 | xdSession: XDSession, 26 | streamingContext: StreamingContext, 27 | properties: Map[String, String] 28 | ) 29 | extends LiteCustomXDStreamingInput(xdSession, streamingContext, properties) { 30 | 31 | lazy val stringSchema = StructType(Seq(StructField("raw", StringType))) 32 | lazy val rawData: Option[String] = properties.get("raw").map(_.toString) 33 | 34 | 35 | override def validate(): ValidationResult = { 36 | var validation = ValidationResult(valid = true, messages = Seq.empty) 37 | 38 | if (rawData.isEmpty) { 39 | validation = ValidationResult( 40 | valid = false, 41 | messages = validation.messages :+ "Test data must be set inside the Option properties with an option key named 'raw'") 42 | } 43 | 44 | if (rawData.map(_.trim).forall(_.isEmpty)) { 45 | validation = 
ValidationResult( 46 | valid = false, 47 | messages = validation.messages :+ "Generated data cannot be an empty string") 48 | } 49 | validation 50 | } 51 | 52 | override def init(): ResultStreamingData = { 53 | val dataQueue = new mutable.Queue[RDD[Row]]() 54 | val register = Seq(new GenericRowWithSchema(Array(rawData.getOrElse("test-data")), stringSchema).asInstanceOf[Row]) 55 | dataQueue += xdSession.sparkContext.parallelize(register) 56 | val stream = streamingContext.queueStream(dataQueue) 57 | 58 | ResultStreamingData(stream, Option(stringSchema)) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-2.2.0-SDK/input-lite-xd/src/main/scala/com/stratio/sparta/GeneratorXDLiteInputStepBatch.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | package com.stratio.sparta 7 | 8 | import com.stratio.sparta.sdk.lite.batch.models._ 9 | import com.stratio.sparta.sdk.lite.validation.ValidationResult 10 | import com.stratio.sparta.sdk.lite.xd.batch._ 11 | import org.apache.spark.sql._ 12 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 13 | import org.apache.spark.sql.crossdata.XDSession 14 | import org.apache.spark.sql.types._ 15 | 16 | class GeneratorXDLiteInputStepBatch( 17 | xdSession: XDSession, 18 | properties: Map[String, String] 19 | ) 20 | extends LiteCustomXDBatchInput(xdSession, properties) { 21 | 22 | lazy val stringSchema = StructType(Seq(StructField("raw", StringType))) 23 | lazy val rawData: Option[String] = properties.get("raw").map(_.toString) 24 | 25 | 26 | override def validate(): ValidationResult = { 27 | var validation = ValidationResult(valid = true, messages = Seq.empty, warnings = Seq.empty) 28 | 29 | if (rawData.isEmpty) { 30 | validation = ValidationResult( 31 | valid = false, 32 | messages = validation.messages :+ "Test data must be set inside the Option properties with an option key named 'raw'") 33 | } 34 | 35 | if (rawData.map(_.trim).forall(_.isEmpty)) { 36 | validation = ValidationResult( 37 | valid = false, 38 | messages = validation.messages :+ "Generated data cannot be an empty string") 39 | } 40 | validation 41 | } 42 | 43 | override def init(): ResultBatchData = { 44 | val register = Seq(new GenericRowWithSchema(Array("test-data"), stringSchema).asInstanceOf[Row]) 45 | val defaultRDD = xdSession.sparkContext.parallelize(register) 46 | 47 | // · Reporting messages 48 | reportInfoLog(phase="init", s"Generated data: $register") 49 | 50 | ResultBatchData(defaultRDD, Option(stringSchema)) 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-2.2.0-SDK/input-lite-xd/src/main/scala/com/stratio/sparta/GeneratorXDLiteInputStepStreaming.scala: 
/**
 * Streaming input step that generates a one-row queue stream from the 'raw'
 * option property.
 *
 * @param xdSession        Crossdata session provided by the runtime
 * @param streamingContext Spark streaming context provided by the runtime
 * @param properties       step options; requires a non-blank value under the key "raw"
 */
class GeneratorXDLiteInputStepStreaming(
                                         xdSession: XDSession,
                                         streamingContext: StreamingContext,
                                         properties: Map[String, String]
                                       )
  extends LiteCustomXDStreamingInput(xdSession, streamingContext, properties) {

  // Single-column schema of the generated records.
  lazy val stringSchema = StructType(Seq(StructField("raw", StringType)))
  // Payload configured through the step options.
  lazy val rawData: Option[String] = properties.get("raw").map(_.toString)

  /** Checks that 'raw' is present and non-blank; one message per failed check. */
  override def validate(): ValidationResult = {
    val failures = Seq(
      rawData.isEmpty ->
        "Test data must be set inside the Option properties with an option key named 'raw'",
      rawData.map(_.trim).forall(_.isEmpty) ->
        "Generated data cannot be an empty string"
    ).collect { case (failed, message) if failed => message }

    ValidationResult(valid = failures.isEmpty, messages = failures, warnings = Seq.empty)
  }

  /** Builds a queue stream holding a single one-row RDD with the configured payload. */
  override def init(): ResultStreamingData = {
    val record: Row = new GenericRowWithSchema(Array(rawData.getOrElse("test-data")), stringSchema)
    val register = Seq(record)
    val dataQueue = mutable.Queue(xdSession.sparkContext.parallelize(register))
    val stream = streamingContext.queueStream(dataQueue)

    // · Reporting messages
    reportInfoLog(phase = "init", s"Generated data: $register")

    ResultStreamingData(stream, Option(stringSchema))
  }
}
/** Batch input step used to exercise the info/warn/error report-log SDK hooks. */
class ReportLogTestXDLiteInputStepBatch(
                                         xdSession: XDSession,
                                         properties: Map[String, String]
                                       )
  extends LiteCustomXDBatchInput(xdSession, properties) {

  // Single-column schema of the generated record.
  lazy val stringSchema = StructType(Seq(StructField("raw", StringType)))
  // Payload configured through the step options.
  lazy val rawData: Option[String] = properties.get("raw").map(_.toString)

  /** Checks that 'raw' is present and non-blank; one message per failed check. */
  override def validate(): ValidationResult = {
    val failures = Seq(
      rawData.isEmpty ->
        "Test data must be set inside the Option properties with an option key named 'raw'",
      rawData.map(_.trim).forall(_.isEmpty) ->
        "Generated data cannot be an empty string"
    ).collect { case (failed, message) if failed => message }

    ValidationResult(valid = failures.isEmpty, messages = failures, warnings = Seq.empty)
  }

  /** Emits one row and reports the same message at all three supported log levels. */
  override def init(): ResultBatchData = {
    val record: Row = new GenericRowWithSchema(Array(rawData.getOrElse("test-data")), stringSchema)
    val register = Seq(record)

    // · Reporting messages at every level — the purpose of this test step
    reportInfoLog(phase = "init", s"Generated data: $register")
    reportWarnLog(phase = "init", s"Generated data: $register")
    reportErrorLog(phase = "init", s"Generated data: $register")

    ResultBatchData(xdSession.sparkContext.parallelize(register), Option(stringSchema))
  }
}
/** Streaming input step used to exercise the info/warn/error report-log SDK hooks. */
class ReportLogTestXDLiteInputStepStreaming(
                                             xdSession: XDSession,
                                             streamingContext: StreamingContext,
                                             properties: Map[String, String]
                                           )
  extends LiteCustomXDStreamingInput(xdSession, streamingContext, properties) {

  // Single-column schema of the generated records.
  lazy val stringSchema = StructType(Seq(StructField("raw", StringType)))
  // Payload configured through the step options.
  lazy val rawData: Option[String] = properties.get("raw").map(_.toString)

  /** Checks that 'raw' is present and non-blank; one message per failed check. */
  override def validate(): ValidationResult = {
    val failures = Seq(
      rawData.isEmpty ->
        "Test data must be set inside the Option properties with an option key named 'raw'",
      rawData.map(_.trim).forall(_.isEmpty) ->
        "Generated data cannot be an empty string"
    ).collect { case (failed, message) if failed => message }

    ValidationResult(valid = failures.isEmpty, messages = failures, warnings = Seq.empty)
  }

  /** Emits one queued RDD and reports the same message at all three log levels. */
  override def init(): ResultStreamingData = {
    val record: Row = new GenericRowWithSchema(Array(rawData.getOrElse("test-data")), stringSchema)
    val register = Seq(record)
    val dataQueue = mutable.Queue(xdSession.sparkContext.parallelize(register))
    val stream = streamingContext.queueStream(dataQueue)

    // · Reporting messages at every level — the purpose of this test step
    reportInfoLog(phase = "init", s"Generated data: $register")
    reportWarnLog(phase = "init", s"Generated data: $register")
    reportErrorLog(phase = "init", s"Generated data: $register")

    ResultStreamingData(stream, Option(stringSchema))
  }
}
/**
 * Output step that prints every row to stdout and, optionally, reports table
 * metadata through the SDK report-log hooks.
 *
 * @param xdSession  Crossdata session provided by the runtime
 * @param properties step options; requires the boolean key "metadataEnabled"
 */
class LoggerXDLiteOutputStep(
                              xdSession: XDSession,
                              properties: Map[String, String]
                            )
  extends LiteCustomXDOutput(xdSession, properties) {

  // Fails fast with a clear message when the mandatory flag is missing or unparseable.
  lazy val metadataEnabled = properties.get("metadataEnabled") match {
    case Some(value: String) => Try(value.toBoolean) match {
      case Success(v) => v
      case Failure(ex) => throw new IllegalStateException(s"$value not parseable to boolean", ex)
    }
    case None => throw new IllegalStateException("'metadataEnabled' key must be defined in the Option properties")
  }

  override def save(data: DataFrame, outputOptions: OutputOptions): Unit = {
    val tableName = outputOptions.tableName.getOrElse {
      logger.error("Table name not defined")
      throw new NoSuchElementException("tableName not found in options")
    }

    if (metadataEnabled) {
      val info1 = s"Table name: $tableName"
      logger.info(info1)
      // BUGFIX: these reports are emitted during save, not init — the phase
      // label previously said "init" (copy-paste from the input steps).
      reportInfoLog(phase = "save", msg = info1)

      val info2 = s"Save mode is set to ${outputOptions.saveMode}"
      logger.info(info2)
      reportInfoLog(phase = "save", msg = info2)
    }
    // NOTE(review): foreach runs on the executors, so these println calls land
    // in executor stdout rather than the driver log — confirm that is intended.
    data.foreach { row =>
      println(row.mkString(","))
    }
  }

  override def save(data: DataFrame, saveMode: String, saveOptions: Map[String, String]): Unit = ()
}
https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-extensions/rocket-2.2.0-SDK/packaged-jars/custom-lite-input-xd-2.2.0-SNAPSHOT.jar -------------------------------------------------------------------------------- /rocket-extensions/rocket-2.2.0-SDK/packaged-jars/custom-lite-output-xd-2.2.0-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-extensions/rocket-2.2.0-SDK/packaged-jars/custom-lite-output-xd-2.2.0-SNAPSHOT.jar -------------------------------------------------------------------------------- /rocket-extensions/rocket-2.2.0-SDK/packaged-jars/custom-lite-transformation-xd-2.2.0-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-extensions/rocket-2.2.0-SDK/packaged-jars/custom-lite-transformation-xd-2.2.0-SNAPSHOT.jar -------------------------------------------------------------------------------- /rocket-extensions/rocket-2.2.0-SDK/packaged-jars/custom-lite-udf-2.2.0-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-extensions/rocket-2.2.0-SDK/packaged-jars/custom-lite-udf-2.2.0-SNAPSHOT.jar -------------------------------------------------------------------------------- /rocket-extensions/rocket-2.2.0-SDK/transformation-lite-xd/src/main/scala/com/stratio/sparta/properties/ValidatePropertiesMap.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 
/**
 * Map wrapper adding validated, string-typed access to step property maps.
 *
 * @param m the wrapped property map
 */
class ValidatePropertiesMap[K, V](val m: Map[K, V]) {

  /**
   * Returns the value under `key` rendered as a String.
   *
   * @throws IllegalStateException when the key is absent
   */
  def getString(key: K): String =
    m.get(key) match {
      case Some(value: String) => value
      case Some(value) => value.toString
      case None =>
        throw new IllegalStateException(s"$key is mandatory")
    }

  /**
   * True when the option holds a non-blank string.
   *
   * BUGFIX: the previous implementation, `option.map(_.trim).forall(_.isEmpty)`,
   * returned `true` exactly when the option was undefined or blank — the inverse
   * of what the name promises (and of NotBlankOption.notBlank below).
   */
  def notBlank(option: Option[String]): Boolean =
    option.exists(_.trim.nonEmpty)
}

/** Enrichment turning a blank-or-empty Option[String] into None. */
class NotBlankOption(s: Option[String]) {
  /** Trims the content and drops it entirely when nothing remains. */
  def notBlank: Option[String] = s.map(_.trim).filterNot(_.isEmpty)
}

/** Implicit conversions exposing the two enrichments above. */
object ValidatingPropertyMap {
  implicit def map2ValidatingPropertyMap[K, V](m: Map[K, V]): ValidatePropertiesMap[K, V] =
    new ValidatePropertiesMap[K, V](m)

  implicit def option2NotBlankOption(s: Option[String]): NotBlankOption = new NotBlankOption(s)
}
/** Batch transform that appends a constant integer column named "newCol". */
class AddColumnXDLiteTransformStepBatch(
                                         xdSession: XDSession,
                                         properties: Map[String, String]
                                       ) extends LiteCustomXDBatchTransform(xdSession, properties) {

  override def transform(inputData: Map[String, ResultBatchData]): OutputBatchTransformData = {
    // Only the first input step is considered; fall back to an empty schema.
    val firstInput = inputData.head._2
    val inputSchema = firstInput.schema.getOrElse(new StructType())

    // Lift to a DataFrame, append the literal column, and hand back rdd + schema.
    val enriched = xdSession
      .createDataFrame(firstInput.data, inputSchema)
      .withColumn("newCol", lit(2))

    OutputBatchTransformData(enriched.rdd, Option(enriched.schema))
  }
}
/** Batch transform that forces the input RDD into exactly 5 partitions. */
class RepartitionXDLiteTransformStepBatch(
                                           xdSession: XDSession,
                                           properties: Map[String, String]
                                         ) extends LiteCustomXDBatchTransform(xdSession, properties) {

  override def transform(inputData: Map[String, ResultBatchData]): OutputBatchTransformData =
    // Only the first input step is considered; no schema is forwarded.
    OutputBatchTransformData(inputData.head._2.data.repartition(5))
}
/** Streaming transform that forces every micro-batch RDD into exactly 5 partitions. */
class RepartitionXDLiteTransformStepStreaming(
                                               xdSession: XDSession,
                                               streamingContext: StreamingContext,
                                               properties: Map[String, String]
                                             ) extends LiteCustomXDStreamingTransform(xdSession, streamingContext, properties) {

  override def transform(inputData: Map[String, ResultStreamingData]): OutputStreamingTransformData =
    // Only the first input step is considered; repartition each batch as it arrives.
    OutputStreamingTransformData(inputData.head._2.data.transform(_.repartition(5)))
}
18 | val names = "jose,perez" 19 | val inputField = "raw" 20 | val inputSchema = StructType(Seq(StructField(inputField, StringType))) 21 | val dataIn: RDD[Row] = sc.parallelize(Seq(Row(names))) 22 | 23 | val properties = Map( 24 | "charPattern" -> ",", 25 | "inputField" -> "raw", 26 | "outputField1" -> "firstName", 27 | "outputField2" -> "lastName" 28 | ) 29 | 30 | val inBatch = ResultBatchData(dataIn, Option(inputSchema)) 31 | val tokenizer = new TokenizerTransformStepBatch(xdSession, properties) 32 | 33 | "TokenizerTransformStepBatch" should "tokenize the data in and return two values" in { 34 | val result = tokenizer.transform(Map("custom-transform" -> inBatch)).data.first().toSeq 35 | 36 | result.size shouldBe 2 37 | } 38 | } -------------------------------------------------------------------------------- /rocket-extensions/rocket-2.2.0-SDK/udf/src/main/scala/com/stratio/sparta/SpartaExampleUDFs.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
/** UDF exposing String.toUpperCase under the SQL name "uppercaseSparta". */
case class ToUpperCaseUDF() extends SpartaUDF {

  val name = "uppercaseSparta"

  // Kept as a named function value so the logic is testable apart from Spark.
  val upper: String => String = value => value.toUpperCase

  val userDefinedFunction: UserDefinedFunction = udf(upper)
}

/** UDF joining two strings with a '/' separator, exposed as "concatSparta". */
case class ConcatUDF() extends SpartaUDF {

  val name = "concatSparta"

  val concat: (String, String) => String = (left, right) => s"$left/$right"

  val userDefinedFunction: UserDefinedFunction = udf(concat)
}
// Concrete, importable instance of Spark ML's VectorUDT (the imported class is its supertype).
case object VectorUDT extends VectorUDT

/** UDF returning the i-th element of an ML vector, exposed as "get_vector_ith_element". */
case class GetDenseVectorUDF() extends SpartaUDF {

  val name = "get_vector_ith_element"

  // Vector.apply handles both dense and sparse representations.
  val getVectorElement = (vector: org.apache.spark.ml.linalg.Vector, index: Int) => vector(index)

  val userDefinedFunction: UserDefinedFunction = udf(getVectorElement)
}
/**
 * Batch input step that generates a single-row RDD from the 'raw' option property.
 *
 * @param xdSession  Crossdata session provided by the runtime
 * @param properties step options; requires a non-blank value under the key "raw"
 */
class GeneratorXDLiteInputStepBatch(
                                     xdSession: XDSession,
                                     properties: Map[String, String]
                                   )
  extends LiteCustomXDBatchInput(xdSession, properties) {

  // Single-column schema of the generated record.
  lazy val stringSchema = StructType(Seq(StructField("raw", StringType)))
  // Payload configured through the step options.
  lazy val rawData: Option[String] = properties.get("raw").map(_.toString)

  /**
   * Validates that the 'raw' property is present and not blank.
   * Each failed check appends its own message.
   */
  override def validate(): ValidationResult = {
    var validation = ValidationResult(valid = true, messages = Seq.empty, warnings = Seq.empty)

    if (rawData.isEmpty) {
      validation = ValidationResult(
        valid = false,
        messages = validation.messages :+ "Test data must be set inside the Option properties with an option key named 'raw'")
    }

    // BUGFIX: use exists instead of forall so a *missing* 'raw' no longer also
    // reports the misleading "empty string" message on top of the missing-key one.
    if (rawData.exists(_.trim.isEmpty)) {
      validation = ValidationResult(
        valid = false,
        messages = validation.messages :+ "Generated data cannot be an empty string")
    }
    validation
  }

  /**
   * Builds a one-row RDD carrying the configured raw value.
   * BUGFIX: previously the hard-coded literal "test-data" was always emitted and
   * the validated 'raw' property was ignored, unlike the streaming counterpart
   * and the ReportLog steps which use rawData.getOrElse("test-data").
   */
  override def init(): ResultBatchData = {
    val register = Seq(new GenericRowWithSchema(Array(rawData.getOrElse("test-data")), stringSchema).asInstanceOf[Row])
    val defaultRDD = xdSession.sparkContext.parallelize(register)

    // · Reporting messages
    reportInfoLog(phase = "init", s"Generated data: $register")

    ResultBatchData(defaultRDD, Option(stringSchema))
  }
}
/** Batch input step used to exercise the info/warn/error report-log SDK hooks. */
class ReportLogTestXDLiteInputStepBatch(
                                         xdSession: XDSession,
                                         properties: Map[String, String]
                                       )
  extends LiteCustomXDBatchInput(xdSession, properties) {

  // Single-column schema of the generated record.
  lazy val stringSchema = StructType(Seq(StructField("raw", StringType)))
  // Payload configured through the step options.
  lazy val rawData: Option[String] = properties.get("raw").map(_.toString)

  /** Checks that 'raw' is present and non-blank; one message per failed check. */
  override def validate(): ValidationResult = {
    val failures = Seq(
      rawData.isEmpty ->
        "Test data must be set inside the Option properties with an option key named 'raw'",
      rawData.map(_.trim).forall(_.isEmpty) ->
        "Generated data cannot be an empty string"
    ).collect { case (failed, message) if failed => message }

    ValidationResult(valid = failures.isEmpty, messages = failures, warnings = Seq.empty)
  }

  /** Emits one row and reports the same message at all three supported log levels. */
  override def init(): ResultBatchData = {
    val record: Row = new GenericRowWithSchema(Array(rawData.getOrElse("test-data")), stringSchema)
    val register = Seq(record)

    // · Reporting messages at every level — the purpose of this test step
    reportInfoLog(phase = "init", s"Generated data: $register")
    reportWarnLog(phase = "init", s"Generated data: $register")
    reportErrorLog(phase = "init", s"Generated data: $register")

    ResultBatchData(xdSession.sparkContext.parallelize(register), Option(stringSchema))
  }
}
/** Streaming input step used to exercise the info/warn/error report-log SDK hooks. */
class ReportLogTestXDLiteInputStepStreaming(
                                             xdSession: XDSession,
                                             streamingContext: StreamingContext,
                                             properties: Map[String, String]
                                           )
  extends LiteCustomXDStreamingInput(xdSession, streamingContext, properties) {

  // Single-column schema of the generated records.
  lazy val stringSchema = StructType(Seq(StructField("raw", StringType)))
  // Payload configured through the step options.
  lazy val rawData: Option[String] = properties.get("raw").map(_.toString)

  /** Checks that 'raw' is present and non-blank; one message per failed check. */
  override def validate(): ValidationResult = {
    val failures = Seq(
      rawData.isEmpty ->
        "Test data must be set inside the Option properties with an option key named 'raw'",
      rawData.map(_.trim).forall(_.isEmpty) ->
        "Generated data cannot be an empty string"
    ).collect { case (failed, message) if failed => message }

    ValidationResult(valid = failures.isEmpty, messages = failures, warnings = Seq.empty)
  }

  /** Emits one queued RDD and reports the same message at all three log levels. */
  override def init(): ResultStreamingData = {
    val record: Row = new GenericRowWithSchema(Array(rawData.getOrElse("test-data")), stringSchema)
    val register = Seq(record)
    val dataQueue = mutable.Queue(xdSession.sparkContext.parallelize(register))
    val stream = streamingContext.queueStream(dataQueue)

    // · Reporting messages at every level — the purpose of this test step
    reportInfoLog(phase = "init", s"Generated data: $register")
    reportWarnLog(phase = "init", s"Generated data: $register")
    reportErrorLog(phase = "init", s"Generated data: $register")

    ResultStreamingData(stream, Option(stringSchema))
  }
}
/**
 * Example JDBC output step that also publishes data-lineage metadata extracted
 * from the step options.
 *
 * @param xdSession  Crossdata session provided by the runtime
 * @param properties step options; requires "url", optionally "service", "path",
 *                   "resource", "datastoreType", "driver" and "jdbc_*" keys
 */
class JdbcWithLineageXDLiteOutputStep(
                                       xdSession: XDSession,
                                       properties: Map[String, String]
                                     )
  extends LiteCustomXDOutput(xdSession, properties) {

  // Mandatory JDBC connection url.
  lazy val url = properties.getOrElse("url", throw new NoSuchElementException("The url property is mandatory"))

  // Lineage options, usually extracted from 'url' or other properties such as 'dbtable'
  override def lineageService(): Option[String] = properties.get("service")
  override def lineagePath(): Option[String] = properties.get("path")
  // If empty it will be populated by the system with the writer tableName
  override def lineageResource(): Option[String] = properties.get("resource")
  override def lineageDatastoreType(): Option[String] = properties.get("datastoreType")

  override def save(data: DataFrame, outputOptions: OutputOptions): Unit = {
    val tableName = outputOptions.tableName.getOrElse {
      logger.error("Table name not defined")
      throw new NoSuchElementException("tableName not found in options")
    }

    // Collect the JDBC driver options: "jdbc_"-prefixed keys (prefix stripped) plus "driver".
    val jdbcProperties = new Properties()
    properties
      .filterKeys(key => key.startsWith("jdbc_") || key.equals("driver"))
      // BUGFIX: stripPrefix removes only the leading "jdbc_"; replaceAll removed
      // the token anywhere inside the key.
      .foreach { case (key, value) => jdbcProperties.put(key.stripPrefix("jdbc_"), value) }

    // BUGFIX: these are informational traces, not failures — log at info level.
    logger.info(s"Connecting with table $tableName")
    logger.info(s"Connecting with properties $jdbcProperties")

    // The real write is intentionally commented out in this example step;
    // count() just forces evaluation of the incoming DataFrame.
    //data.write
    //  .mode(SaveMode.Append)
    //  .jdbc(url = url, table = tableName, connectionProperties = jdbcProperties)
    data.count()
  }

  override def save(data: DataFrame, saveMode: String, saveOptions: Map[String, String]): Unit = ()
}
logger.info(info1) 37 | // · Reporting messages 38 | reportInfoLog(phase="init", msg = info1) 39 | 40 | val info2 = s"Save mode is set to ${outputOptions.saveMode}" 41 | logger.info(info2) 42 | // · Reporting messages 43 | reportInfoLog(phase="init", msg = info2) 44 | } 45 | data.foreach{ row => 46 | println(row.mkString(",")) 47 | } 48 | } 49 | 50 | override def save(data: DataFrame, saveMode: String, saveOptions: Map[String, String]): Unit = () 51 | } 52 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-2.3.0-SDK/transformation-lite-xd/src/main/scala/com/stratio/sparta/properties/ValidatePropertiesMap.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | package com.stratio.sparta.properties 7 | 8 | class ValidatePropertiesMap [K, V](val m: Map[K, V]){ 9 | def getString(key: K): String = 10 | m.get(key) match { 11 | case Some(value: String) => value 12 | case Some(value) => value.toString 13 | case None => 14 | throw new IllegalStateException(s"$key is mandatory") 15 | } 16 | 17 | def notBlank(option: Option[String]): Boolean = 18 | option.map(_.trim).exists(_.nonEmpty) /* FIX: was 'forall(_.isEmpty)', which is vacuously true for None and true for blank strings - the exact inverse of what the name 'notBlank' promises (and of NotBlankOption.notBlank below) */ 19 | } 20 | 21 | class NotBlankOption(s: Option[String]) { 22 | def notBlank: Option[String] = s.map(_.trim).filterNot(_.isEmpty) 23 | } 24 | 25 | object ValidatingPropertyMap{ 26 | implicit def map2ValidatingPropertyMap[K, V](m: Map[K, V]): ValidatePropertiesMap[K, V] = 27 | new ValidatePropertiesMap[K, V](m) 28 | 29 | implicit def option2NotBlankOption(s: Option[String]): NotBlankOption = new NotBlankOption(s) 30 | } -------------------------------------------------------------------------------- /rocket-extensions/rocket-2.3.0-SDK/transformation-lite-xd/src/main/scala/com/stratio/sparta/transformations/column/AddColumnXDLiteTransformStepBatch.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España.
5 | */ 6 | package com.stratio.sparta.transformations.column 7 | 8 | import com.stratio.sparta.sdk.lite.batch.models._ 9 | import com.stratio.sparta.sdk.lite.xd.batch._ 10 | import org.apache.spark.sql.crossdata.XDSession 11 | import org.apache.spark.sql.functions.lit 12 | import org.apache.spark.sql.types.StructType 13 | 14 | class AddColumnXDLiteTransformStepBatch( 15 | xdSession: XDSession, 16 | properties: Map[String, String] 17 | ) extends LiteCustomXDBatchTransform(xdSession, properties) { 18 | 19 | override def transform(inputData: Map[String, ResultBatchData]): OutputBatchTransformData = { 20 | // Get input data and schema 21 | val inputStream = inputData.head._2.data 22 | val inputSchema = inputData.head._2.schema.getOrElse(new StructType()) 23 | 24 | // Convert to DataFrame and make modifications 25 | val df = xdSession.createDataFrame(inputStream, inputSchema) 26 | val dfWithColumn = df.withColumn("newCol", lit(2)) 27 | 28 | // Return the transformed data 29 | OutputBatchTransformData(dfWithColumn.rdd, Option(dfWithColumn.schema)) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-2.3.0-SDK/transformation-lite-xd/src/main/scala/com/stratio/sparta/transformations/repartition/RepartitionXDLiteTransformStepBatch.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | package com.stratio.sparta.transformations.repartition 7 | 8 | import java.io.{Serializable => JSerializable} 9 | 10 | import com.stratio.sparta.sdk.lite.batch.models._ 11 | import com.stratio.sparta.sdk.lite.xd.batch._ 12 | import org.apache.spark.sql.crossdata.XDSession 13 | 14 | class RepartitionXDLiteTransformStepBatch( 15 | xdSession: XDSession, 16 | properties: Map[String, String] 17 | ) extends LiteCustomXDBatchTransform(xdSession, properties) { 18 | 19 | override def transform(inputData: Map[String, ResultBatchData]): OutputBatchTransformData = { 20 | val inputStream = inputData.head._2.data 21 | 22 | OutputBatchTransformData(inputStream.repartition(5)) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-2.3.0-SDK/transformation-lite-xd/src/main/scala/com/stratio/sparta/transformations/repartition/RepartitionXDLiteTransformStepStreaming.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | package com.stratio.sparta.transformations.repartition 7 | 8 | import com.stratio.sparta.sdk.lite.streaming.models._ 9 | import com.stratio.sparta.sdk.lite.xd.streaming._ 10 | import org.apache.spark.sql.crossdata.XDSession 11 | import org.apache.spark.streaming.StreamingContext 12 | 13 | class RepartitionXDLiteTransformStepStreaming( 14 | xdSession: XDSession, 15 | streamingContext: StreamingContext, 16 | properties: Map[String, String] 17 | ) extends LiteCustomXDStreamingTransform(xdSession, streamingContext, properties) { 18 | 19 | override def transform(inputData: Map[String, ResultStreamingData]): OutputStreamingTransformData = { 20 | val newStream = inputData.head._2.data.transform { rdd => 21 | rdd.repartition(5) 22 | } 23 | 24 | OutputStreamingTransformData(newStream) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-2.3.0-SDK/transformation-lite-xd/src/test/scala/com/stratio/sparta/transformations/tokenizer/TokenizerTransformStepBatchTest.scala: -------------------------------------------------------------------------------- 1 | package com.stratio.sparta.transformations.tokenizer 2 | 3 | import com.stratio.sparta.sdk.lite.batch.models.ResultBatchData 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.sql.Row 6 | import org.apache.spark.sql.crossdata.XDSession 7 | import org.apache.spark.sql.types.{StringType, StructField, StructType} 8 | import org.apache.spark.{SparkConf, SparkContext} 9 | import org.junit.runner.RunWith 10 | import org.scalatest.junit.JUnitRunner 11 | import org.scalatest.{BeforeAndAfterAll, FlatSpec, Matchers} 12 | 13 | @RunWith(classOf[JUnitRunner]) 14 | class TokenizerTransformStepBatchTest extends FlatSpec with Matchers with BeforeAndAfterAll { 15 | val sc = SparkContext.getOrCreate(new SparkConf().setAppName("test").setMaster("local[1]")) 16 | val xdSession: XDSession = XDSession.builder().config(sc.getConf).create("dummyUser") 17 | 
18 | val names = "jose,perez" 19 | val inputField = "raw" 20 | val inputSchema = StructType(Seq(StructField(inputField, StringType))) 21 | val dataIn: RDD[Row] = sc.parallelize(Seq(Row(names))) 22 | 23 | val properties = Map( 24 | "charPattern" -> ",", 25 | "inputField" -> "raw", 26 | "outputField1" -> "firstName", 27 | "outputField2" -> "lastName" 28 | ) 29 | 30 | val inBatch = ResultBatchData(dataIn, Option(inputSchema)) 31 | val tokenizer = new TokenizerTransformStepBatch(xdSession, properties) 32 | 33 | "TokenizerTransformStepBatch" should "tokenize the data in and return two values" in { 34 | val result = tokenizer.transform(Map("custom-transform" -> inBatch)).data.first().toSeq 35 | 36 | result.size shouldBe 2 37 | } 38 | } -------------------------------------------------------------------------------- /rocket-extensions/rocket-2.3.0-SDK/udf/src/main/scala/com/stratio/sparta/SpartaExampleUDFs.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | 7 | package com.stratio.sparta 8 | 9 | import com.stratio.sparta.sdk.lite.common.SpartaUDF 10 | import org.apache.spark.sql.expressions.UserDefinedFunction 11 | import org.apache.spark.sql.functions.udf 12 | 13 | case class ToUpperCaseUDF() extends SpartaUDF { 14 | 15 | val name = "uppercaseSparta" 16 | 17 | val upper: String => String = _.toUpperCase 18 | 19 | val userDefinedFunction: UserDefinedFunction = udf(upper) 20 | 21 | } 22 | 23 | case class ConcatUDF() extends SpartaUDF { 24 | 25 | val name = "concatSparta" 26 | 27 | val concat: (String, String) => String = { case (str1, str2) => 28 | s"$str1/$str2" 29 | } 30 | 31 | val userDefinedFunction: UserDefinedFunction = udf(concat) 32 | } -------------------------------------------------------------------------------- /rocket-extensions/rocket-2.3.0-SDK/udf/src/main/scala/org/apache/spark/sql/sparta/udf/GetDenseVectorUDF.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | 7 | package org.apache.spark.sql.sparta.udf 8 | 9 | import com.stratio.sparta.sdk.lite.common.SpartaUDF 10 | import org.apache.spark.sql.expressions.UserDefinedFunction 11 | import org.apache.spark.ml.linalg.VectorUDT 12 | import org.apache.spark.sql.functions.udf 13 | 14 | case object VectorUDT extends VectorUDT 15 | 16 | case class GetDenseVectorUDF() extends SpartaUDF { 17 | 18 | val name = "get_vector_ith_element" 19 | 20 | val getVectorElement = (vector: org.apache.spark.ml.linalg.Vector, num: Int) => vector(num) 21 | 22 | val userDefinedFunction: UserDefinedFunction = udf(getVectorElement) 23 | } 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/Readme.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Inputs classes 4 | 5 | ### Batch 6 | 7 | com.stratio.sparta.GeneratorXDLiteInputStepBatch 8 | 9 | com.stratio.sparta.MetadataTestXDLiteInputStepBatch 10 | 11 | com.stratio.sparta.ReportLogTestXDLiteInputStepBatch 12 | 13 | ### Streaming 14 | 15 | com.stratio.sparta.GeneratorXDLiteInputStepStreaming 16 | 17 | com.stratio.sparta.ReportLogTestXDLiteInputStepStreaming 18 | 19 | ### Hybrid 20 | 21 | com.stratio.sparta.GeneratorXDLiteInputStepHybrid 22 | 23 | com.stratio.sparta.StreamGeneratorXDLiteInputStepHybrid 24 | 25 | 26 | ## Transformer classes 27 | 28 | ### Batch 29 | 30 | com.stratio.sparta.transformations.column.AddColumnXDLiteTransformStepBatch 31 | 32 | com.stratio.sparta.transformations.repartition.RepartitionXDLiteTransformStepBatch 33 | 34 | com.stratio.sparta.transformations.tokenizer.TokenizerTransformStepBatch 35 | 36 | ### Streaming 37 | 38 | com.stratio.sparta.transformations.repartition.RepartitionXDLiteTransformStepStreaming 39 | 40 | com.stratio.sparta.transformations.tokenizer.TokenizerTransformStepStreaming 41 | 42 | ### Hybrid 43 | 44 | 
com.stratio.sparta.transformations.column.AddColumnXDLiteTransformStepHybrid 45 | 46 | com.stratio.sparta.transformations.repartition.RepartitionXDLiteTransformStepHybrid 47 | 48 | 49 | ## Output classes 50 | 51 | com.stratio.sparta.LoggerXDLiteOutputStep 52 | 53 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/input-lite-xd/src/main/scala/com/stratio/sparta/GeneratorXDLiteInputStepBatch.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | package com.stratio.sparta 7 | 8 | import com.stratio.sparta.sdk.lite.batch.models._ 9 | import com.stratio.sparta.sdk.lite.validation.ValidationResult 10 | import com.stratio.sparta.sdk.lite.xd.batch._ 11 | import org.apache.spark.sql._ 12 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 13 | import org.apache.spark.sql.crossdata.XDSession 14 | import org.apache.spark.sql.types._ 15 | 16 | class GeneratorXDLiteInputStepBatch( 17 | xdSession: XDSession, 18 | properties: Map[String, String] 19 | ) 20 | extends LiteCustomXDBatchInput(xdSession, properties) { 21 | 22 | lazy val stringSchema = StructType(Seq(StructField("raw", StringType))) 23 | lazy val rawData: Option[String] = properties.get("raw").map(_.toString) 24 | 25 | 26 | override def validate(): ValidationResult = { 27 | var validation = ValidationResult(valid = true, messages = Seq.empty, warnings = Seq.empty) 28 | 29 | if (rawData.isEmpty) { 30 | validation = ValidationResult( 31 | valid = false, 32 | warnings = validation.warnings :+ "Test data must be set inside the Option properties with an option key named 'raw'") /* FIX: accumulate into 'warnings' (was 'validation.messages', which is always empty here and dropped previously gathered warnings) */ 33 | } 34 | 35 | if (rawData.exists(_.trim.isEmpty)) { /* FIX: was 'rawData.map(_.trim).forall(_.isEmpty)' - forall is vacuously true on None, so a missing 'raw' key also emitted this misleading blank-string warning and overwrote the previous one */ 36 | validation = ValidationResult( 37 | valid = false, 38 | warnings = validation.warnings :+ "Generated data cannot be an empty string") 39 | } 40 | validation 41 | } 42 | 43 | override def init(): ResultBatchData = { 44 | val register = Seq(new GenericRowWithSchema(Array(rawData.getOrElse("test-data")), stringSchema).asInstanceOf[Row]) 45 | val defaultRDD = xdSession.sparkContext.parallelize(register) 46 | 47 | // · Reporting messages 48 | reportInfoLog(phase="init", s"Generated data: $register") 49 | 50 | ResultBatchData(defaultRDD, Option(stringSchema)) 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/input-lite-xd/src/main/scala/com/stratio/sparta/GeneratorXDLiteInputStepHybrid.scala:
-------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta 7 | 8 | import com.stratio.sparta.sdk.lite.batch.models._ 9 | import com.stratio.sparta.sdk.lite.hybrid.models.ResultHybridData 10 | import com.stratio.sparta.sdk.lite.validation.ValidationResult 11 | import com.stratio.sparta.sdk.lite.xd.hybrid._ 12 | import org.apache.spark.sql._ 13 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 14 | import org.apache.spark.sql.crossdata.XDSession 15 | import org.apache.spark.sql.types._ 16 | 17 | class GeneratorXDLiteInputStepHybrid( 18 | xdSession: XDSession, 19 | properties: Map[String, String] 20 | ) 21 | extends LiteCustomXDHybridInput(xdSession, properties) { 22 | 23 | lazy val stringSchema = StructType(Seq(StructField("raw", StringType))) 24 | lazy val rawData: Option[String] = properties.get("raw").map(_.toString) 25 | 26 | 27 | override def validate(): ValidationResult = { 28 | var validation = ValidationResult(valid = true, messages = Seq.empty, warnings = Seq.empty) 29 | 30 | if (rawData.isEmpty) { 31 | validation = ValidationResult( 32 | valid = false, 33 | warnings = validation.messages :+ "Test data must be set inside the Option properties with an option key named 'raw'") 34 | } 35 | 36 | if (rawData.map(_.trim).forall(_.isEmpty)) { 37 | validation = ValidationResult( 38 | valid = false, 39 | warnings = validation.messages :+ "Generated data cannot be an empty string") 40 | } 41 | 
validation 42 | } 43 | 44 | override def init(): ResultHybridData = { 45 | val register = Seq(new GenericRowWithSchema(Array(rawData.getOrElse("test-data")), stringSchema).asInstanceOf[Row]) 46 | val defaultRDD = xdSession.sparkContext.parallelize(register) 47 | val dataFrame = xdSession.createDataFrame(defaultRDD, stringSchema) 48 | 49 | // · Reporting messages 50 | reportInfoLog(phase="init", s"Generated data: $register") 51 | 52 | ResultHybridData(dataFrame) 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/input-lite-xd/src/main/scala/com/stratio/sparta/ReportLogTestXDLiteInputStepBatch.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | package com.stratio.sparta 7 | 8 | import com.stratio.sparta.sdk.lite.batch.models._ 9 | import com.stratio.sparta.sdk.lite.validation.ValidationResult 10 | import com.stratio.sparta.sdk.lite.xd.batch._ 11 | import org.apache.spark.sql._ 12 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 13 | import org.apache.spark.sql.crossdata.XDSession 14 | import org.apache.spark.sql.types._ 15 | 16 | class ReportLogTestXDLiteInputStepBatch( 17 | xdSession: XDSession, 18 | properties: Map[String, String] 19 | ) 20 | extends LiteCustomXDBatchInput(xdSession, properties) { 21 | 22 | lazy val stringSchema = StructType(Seq(StructField("raw", StringType))) 23 | lazy val rawData: Option[String] = properties.get("raw").map(_.toString) 24 | 25 | 26 | override def validate(): ValidationResult = { 27 | var validation = ValidationResult(valid = true, messages = Seq.empty, warnings = Seq.empty) 28 | 29 | if (rawData.isEmpty) { 30 | validation = ValidationResult( 31 | valid = false, 32 | warnings = validation.warnings :+ "Test data must be set inside the Option properties with an option key named 'raw'") /* FIX: accumulate into 'warnings' (was 'validation.messages', which is always empty here and dropped previously gathered warnings) */ 33 | } 34 | 35 | if (rawData.exists(_.trim.isEmpty)) { /* FIX: was 'rawData.map(_.trim).forall(_.isEmpty)' - forall is vacuously true on None, so a missing 'raw' key also emitted this misleading blank-string warning and overwrote the previous one */ 36 | validation = ValidationResult( 37 | valid = false, 38 | warnings = validation.warnings :+ "Generated data cannot be an empty string") 39 | } 40 | validation 41 | } 42 | 43 | override def init(): ResultBatchData = { 44 | val register = Seq(new GenericRowWithSchema(Array(rawData.getOrElse("test-data")), stringSchema).asInstanceOf[Row]) 45 | // · Reporting messages 46 | reportInfoLog(phase="init", s"Generated data: $register") 47 | reportWarnLog(phase="init", s"Generated data: $register") 48 | reportErrorLog(phase="init", s"Generated data: $register") 49 | 50 | val defaultRDD = xdSession.sparkContext.parallelize(register) 51 | 52 | ResultBatchData(defaultRDD, Option(stringSchema)) 53 | } 54 | } 55 | --------------------------------------------------------------------------------
/rocket-extensions/rocket-3.0.0-SDK/input-lite-xd/src/main/scala/com/stratio/sparta/ReportLogTestXDLiteInputStepStreaming.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta 7 | 8 | import com.stratio.sparta.sdk.lite.streaming.models._ 9 | import com.stratio.sparta.sdk.lite.validation.ValidationResult 10 | import com.stratio.sparta.sdk.lite.xd.streaming._ 11 | import org.apache.spark.rdd.RDD 12 | import org.apache.spark.sql._ 13 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 14 | import org.apache.spark.sql.crossdata.XDSession 15 | import org.apache.spark.sql.types._ 16 | import org.apache.spark.streaming.StreamingContext 17 | 18 | import scala.collection.mutable 19 | 20 | class ReportLogTestXDLiteInputStepStreaming( 21 | xdSession: XDSession, 22 | streamingContext: StreamingContext, 23 | properties: Map[String, String] 24 | ) 25 | extends LiteCustomXDStreamingInput(xdSession, streamingContext, properties) { 26 | 27 | lazy val stringSchema = StructType(Seq(StructField("raw", StringType))) 28 | lazy val rawData: Option[String] = properties.get("raw").map(_.toString) 29 | 30 | override def validate(): ValidationResult = { 31 | var validation = ValidationResult(valid = true, messages = Seq.empty, warnings = Seq.empty) 32 | 33 | if (rawData.isEmpty) { 34 | validation = ValidationResult( 35 | valid = false, 36 | warnings = validation.messages :+ "Test data must be set inside the Option 
properties with an option key named 'raw'") 37 | } 38 | 39 | if (rawData.map(_.trim).forall(_.isEmpty)) { 40 | validation = ValidationResult( 41 | valid = false, 42 | warnings = validation.messages :+ "Generated data cannot be an empty string") 43 | } 44 | validation 45 | } 46 | 47 | override def init(): ResultStreamingData = { 48 | val dataQueue = new mutable.Queue[RDD[Row]]() 49 | val register = Seq(new GenericRowWithSchema(Array(rawData.getOrElse("test-data")), stringSchema).asInstanceOf[Row]) 50 | dataQueue += xdSession.sparkContext.parallelize(register) 51 | val stream = streamingContext.queueStream(dataQueue) 52 | 53 | // · Reporting messages 54 | reportInfoLog(phase="init", s"Generated data: $register") 55 | reportWarnLog(phase="init", s"Generated data: $register") 56 | reportErrorLog(phase="init", s"Generated data: $register") 57 | 58 | ResultStreamingData(stream, Option(stringSchema)) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/input-lite-xd/src/main/scala/com/stratio/sparta/StreamGeneratorXDLiteInputStepHybrid.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | package com.stratio.sparta 7 | 8 | import com.stratio.sparta.sdk.lite.hybrid.models.ResultHybridData 9 | import com.stratio.sparta.sdk.lite.validation.ValidationResult 10 | import com.stratio.sparta.sdk.lite.xd.hybrid._ 11 | import org.apache.spark.sql._ 12 | import org.apache.spark.sql.catalyst.encoders.RowEncoder 13 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 14 | import org.apache.spark.sql.crossdata.XDSession 15 | import org.apache.spark.sql.types._ 16 | 17 | class StreamGeneratorXDLiteInputStepHybrid( 18 | xdSession: XDSession, 19 | properties: Map[String, String] 20 | ) 21 | extends LiteCustomXDHybridInput(xdSession, properties) { 22 | 23 | lazy val rowsPerSecond: Option[String] = properties.get("rowsPerSecond").map(_.toString) 24 | 25 | override def init(): ResultHybridData = { 26 | val dataFrame: Dataset[Row] = xdSession.readStream /* FIX: was 'xDSession', an undefined identifier (the constructor parameter is 'xdSession', used everywhere else in this class) - the original does not compile */ 27 | .format("rate") 28 | .option("rowsPerSecond", rowsPerSecond.getOrElse("1")) 29 | .load() 30 | 31 | ResultHybridData(dataFrame) 32 | } 33 | 34 | // This method is used in order to provide an equivalent Batch Dataframe for debugging purposes 35 | def debugInit(): Option[DataFrame] = { 36 | import xdSession.implicits._ 37 | 38 | Option(Seq( 39 | (8, "Lazarillo de Tormes"), 40 | (64, "Codex Seraphinianus"), 41 | (27, "Divina Commedia") 42 | ).toDF("price", "book")) 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/output-lite-xd/src/main/scala/com/stratio/sparta/JdbcWithLineageXDLiteOutputStep.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved.
3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta 7 | 8 | import com.stratio.sparta.sdk.lite.common.models.OutputOptions 9 | import com.stratio.sparta.sdk.lite.xd.common._ 10 | import org.apache.spark.sql._ 11 | import org.apache.spark.sql.crossdata.XDSession 12 | 13 | import java.util.Properties 14 | import scala.util.{Failure, Success, Try} 15 | 16 | class JdbcWithLineageXDLiteOutputStep( 17 | xdSession: XDSession, 18 | properties: Map[String, String] 19 | ) 20 | extends LiteCustomXDOutput(xdSession, properties) { 21 | 22 | lazy val url = properties.getOrElse("url", throw new NoSuchElementException("The url property is mandatory")) 23 | 24 | 25 | override def save(data: DataFrame, outputOptions: OutputOptions): Unit = { 26 | val tableName = outputOptions.tableName.getOrElse{ 27 | logger.error("Table name not defined") 28 | throw new NoSuchElementException("tableName not found in options") 29 | } 30 | 31 | val jdbcProperties = new Properties() 32 | 33 | properties 34 | .filterKeys(key => key.startsWith("jdbc_") || key.equals("driver")) 35 | .foreach{ case (key, value) => jdbcProperties.put(key.replaceAll("jdbc_", ""), value) } 36 | 37 | logger.error(s"Connecting with table $tableName") 38 | logger.error(s"Connecting with properties $jdbcProperties") 39 | 40 | data.write 41 | .mode(SaveMode.Append) 42 | .jdbc(url = url, table = tableName, connectionProperties = jdbcProperties) 43 | } 44 | 45 | override def save(data: DataFrame, saveMode: String, saveOptions: Map[String, String]): Unit = () 46 | } 47 | 
-------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/output-lite-xd/src/main/scala/com/stratio/sparta/LoggerXDLiteOutputStep.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta 7 | 8 | import com.stratio.sparta.sdk.lite.common.models.OutputOptions 9 | import com.stratio.sparta.sdk.lite.xd.common._ 10 | import org.apache.spark.sql._ 11 | import org.apache.spark.sql.crossdata.XDSession 12 | 13 | import scala.util.{Failure, Success, Try} 14 | 15 | class LoggerXDLiteOutputStep( 16 | xdSession: XDSession, 17 | properties: Map[String, String] 18 | ) 19 | extends LiteCustomXDOutput(xdSession, properties) { 20 | 21 | lazy val metadataEnabled = properties.get("metadataEnabled") match { 22 | case Some(value: String) => Try(value.toBoolean) match { 23 | case Success(v) => v 24 | case Failure(ex) => throw new IllegalStateException(s"$value not parseable to boolean", ex) 25 | } 26 | case None => throw new IllegalStateException("'metadataEnabled' key must be defined in the Option properties") 27 | } 28 | 29 | override def save(data: DataFrame, outputOptions: OutputOptions): Unit = { 30 | val tableName = outputOptions.tableName.getOrElse{ 31 | logger.error("Table name not defined") 32 | throw new NoSuchElementException("tableName not found in options")} 33 | 34 | if (metadataEnabled){ 35 | val info1 = s"Table name: $tableName" 36 | logger.info(info1) 37 | 
// · Reporting messages 38 | reportInfoLog(phase="init", msg = info1) 39 | 40 | val info2 = s"Save mode is set to ${outputOptions.saveMode}" 41 | logger.info(info2) 42 | // · Reporting messages 43 | reportInfoLog(phase="init", msg = info2) 44 | } 45 | data.foreach{ row => 46 | println(row.mkString(",")) 47 | } 48 | } 49 | 50 | override def save(data: DataFrame, saveMode: String, saveOptions: Map[String, String]): Unit = () 51 | } 52 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/packaged-jars/custom-lite-input-xd-3.0.0-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-extensions/rocket-3.0.0-SDK/packaged-jars/custom-lite-input-xd-3.0.0-SNAPSHOT.jar -------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/packaged-jars/custom-lite-output-xd-3.0.0-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-extensions/rocket-3.0.0-SDK/packaged-jars/custom-lite-output-xd-3.0.0-SNAPSHOT.jar -------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/packaged-jars/custom-lite-transformation-xd-3.0.0-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-extensions/rocket-3.0.0-SDK/packaged-jars/custom-lite-transformation-xd-3.0.0-SNAPSHOT.jar -------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/transformation-lite-xd/src/main/scala/com/stratio/sparta/properties/ValidatePropertiesMap.scala: 
-------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta.properties 7 | 8 | class ValidatePropertiesMap [K, V](val m: Map[K, V]){ 9 | def getString(key: K): String = 10 | m.get(key) match { 11 | case Some(value: String) => value 12 | case Some(value) => value.toString 13 | case None => 14 | throw new IllegalStateException(s"$key is mandatory") 15 | } 16 | 17 | def notBlank(option: Option[String]): Boolean = // true iff a value is present and non-blank after trimming 18 | option.exists(_.trim.nonEmpty) // fix: forall(_.isEmpty) returned true for None/blank values — the inverse of "notBlank"; now consistent with NotBlankOption.notBlank below 19 | } 20 | 21 | class NotBlankOption(s: Option[String]) { 22 | def notBlank: Option[String] = s.map(_.trim).filterNot(_.isEmpty) 23 | } 24 | 25 | object ValidatingPropertyMap{ 26 | implicit def map2ValidatingPropertyMap[K, V](m: Map[K, V]): ValidatePropertiesMap[K, V] = 27 | new ValidatePropertiesMap[K, V](m) 28 | 29 | implicit def option2NotBlankOption(s: Option[String]): NotBlankOption = new NotBlankOption(s) 30 | } -------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/transformation-lite-xd/src/main/scala/com/stratio/sparta/transformations/column/AddColumnXDLiteTransformStepBatch.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 
3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta.transformations.column 7 | 8 | import com.stratio.sparta.sdk.lite.batch.models._ 9 | import com.stratio.sparta.sdk.lite.xd.batch._ 10 | import org.apache.spark.sql.crossdata.XDSession 11 | import org.apache.spark.sql.functions.lit 12 | import org.apache.spark.sql.types.StructType 13 | 14 | class AddColumnXDLiteTransformStepBatch( 15 | xdSession: XDSession, 16 | properties: Map[String, String] 17 | ) extends LiteCustomXDBatchTransform(xdSession, properties) { 18 | 19 | override def transform(inputData: Map[String, ResultBatchData]): OutputBatchTransformData = { 20 | // Get input data and schema 21 | val inputStream = inputData.head._2.data 22 | val inputSchema = inputData.head._2.schema.getOrElse(new StructType()) 23 | 24 | // Convert to DataFrame and make modifications 25 | val df = xdSession.createDataFrame(inputStream, inputSchema) 26 | val dfWithColumn = df.withColumn("newCol", lit(2)) 27 | 28 | // Return the transformed data 29 | OutputBatchTransformData(dfWithColumn.rdd, Option(dfWithColumn.schema)) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/transformation-lite-xd/src/main/scala/com/stratio/sparta/transformations/column/AddColumnXDLiteTransformStepHybrid.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 
3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta.transformations.column 7 | 8 | import com.stratio.sparta.sdk.lite.hybrid.models.{OutputHybridTransformData, ResultHybridData} 9 | import com.stratio.sparta.sdk.lite.xd.hybrid.LiteCustomXDHybridTransform 10 | import org.apache.spark.sql.crossdata.XDSession 11 | import org.apache.spark.sql.functions.lit 12 | 13 | class AddColumnXDLiteTransformStepHybrid( 14 | xdSession: XDSession, 15 | properties: Map[String, String] 16 | ) extends LiteCustomXDHybridTransform(xdSession, properties) { 17 | 18 | override def transform(inputData: Map[String, ResultHybridData]): OutputHybridTransformData = { 19 | // Get input data and schema 20 | val inputStream = inputData.head._2.data 21 | 22 | val dfWithColumn = inputStream.withColumn("newCol", lit(2)) 23 | 24 | // Return the transformed data 25 | OutputHybridTransformData(dfWithColumn) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/transformation-lite-xd/src/main/scala/com/stratio/sparta/transformations/repartition/RepartitionXDLiteTransformStepBatch.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 
3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta.transformations.repartition 7 | 8 | import java.io.{Serializable => JSerializable} 9 | 10 | import com.stratio.sparta.sdk.lite.batch.models._ 11 | import com.stratio.sparta.sdk.lite.xd.batch._ 12 | import org.apache.spark.sql.crossdata.XDSession 13 | 14 | class RepartitionXDLiteTransformStepBatch( 15 | xdSession: XDSession, 16 | properties: Map[String, String] 17 | ) extends LiteCustomXDBatchTransform(xdSession, properties) { 18 | 19 | override def transform(inputData: Map[String, ResultBatchData]): OutputBatchTransformData = { 20 | val inputStream = inputData.head._2.data 21 | 22 | OutputBatchTransformData(inputStream.repartition(properties.get("partitions").map(_.trim.toInt).getOrElse(5))) // generalized: partition count is now configurable via the optional "partitions" property, defaulting to the former hard-coded 5 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/transformation-lite-xd/src/main/scala/com/stratio/sparta/transformations/repartition/RepartitionXDLiteTransformStepHybrid.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | package com.stratio.sparta.transformations.repartition 7 | 8 | import com.stratio.sparta.sdk.lite.hybrid.models.{OutputHybridTransformData, ResultHybridData} 9 | import com.stratio.sparta.sdk.lite.xd.hybrid.LiteCustomXDHybridTransform 10 | import org.apache.spark.sql.crossdata.XDSession 11 | 12 | class RepartitionXDLiteTransformStepHybrid( 13 | xdSession: XDSession, 14 | properties: Map[String, String] 15 | ) extends LiteCustomXDHybridTransform(xdSession, properties) { 16 | 17 | override def transform(inputData: Map[String, ResultHybridData]): OutputHybridTransformData = { 18 | val inputStream = inputData.head._2.data 19 | 20 | OutputHybridTransformData(inputStream.repartition(5)) 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/transformation-lite-xd/src/main/scala/com/stratio/sparta/transformations/repartition/RepartitionXDLiteTransformStepStreaming.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | package com.stratio.sparta.transformations.repartition 7 | 8 | import com.stratio.sparta.sdk.lite.streaming.models._ 9 | import com.stratio.sparta.sdk.lite.xd.streaming._ 10 | import org.apache.spark.sql.crossdata.XDSession 11 | import org.apache.spark.streaming.StreamingContext 12 | 13 | class RepartitionXDLiteTransformStepStreaming( 14 | xdSession: XDSession, 15 | streamingContext: StreamingContext, 16 | properties: Map[String, String] 17 | ) extends LiteCustomXDStreamingTransform(xdSession, streamingContext, properties) { 18 | 19 | override def transform(inputData: Map[String, ResultStreamingData]): OutputStreamingTransformData = { 20 | val newStream = inputData.head._2.data.transform { rdd => 21 | rdd.repartition(5) 22 | } 23 | 24 | OutputStreamingTransformData(newStream) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/transformation-lite-xd/src/test/scala/com/stratio/sparta/transformations/tokenizer/TokenizerTransformStepBatchTest.scala: -------------------------------------------------------------------------------- 1 | package com.stratio.sparta.transformations.tokenizer 2 | 3 | import com.stratio.sparta.sdk.lite.batch.models.ResultBatchData 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.sql.Row 6 | import org.apache.spark.sql.crossdata.XDSession 7 | import org.apache.spark.sql.types.{StringType, StructField, StructType} 8 | import org.apache.spark.{SparkConf, SparkContext} 9 | import org.junit.runner.RunWith 10 | import org.scalatest.junit.JUnitRunner 11 | import org.scalatest.{BeforeAndAfterAll, FlatSpec, Matchers} 12 | 13 | @RunWith(classOf[JUnitRunner]) 14 | class TokenizerTransformStepBatchTest extends FlatSpec with Matchers with BeforeAndAfterAll { 15 | val sc = SparkContext.getOrCreate(new SparkConf().setAppName("test").setMaster("local[1]")) 16 | val xdSession: XDSession = XDSession.builder().config(sc.getConf).create("dummyUser") 17 | 
18 | val names = "jose,perez" 19 | val inputField = "raw" 20 | val inputSchema = StructType(Seq(StructField(inputField, StringType))) 21 | val dataIn: RDD[Row] = sc.parallelize(Seq(Row(names))) 22 | 23 | val properties = Map( 24 | "charPattern" -> ",", 25 | "inputField" -> "raw", 26 | "outputField1" -> "firstName", 27 | "outputField2" -> "lastName" 28 | ) 29 | 30 | val inBatch = ResultBatchData(dataIn, Option(inputSchema)) 31 | val tokenizer = new TokenizerTransformStepBatch(xdSession, properties) 32 | 33 | "TokenizerTransformStepBatch" should "tokenize the data in and return two values" in { 34 | val result = tokenizer.transform(Map("custom-transform" -> inBatch)).data.first().toSeq 35 | 36 | result.size shouldBe 2 37 | } 38 | } -------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/udf/src/main/scala/com/stratio/sparta/SpartaExampleUDFs.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | 7 | package com.stratio.sparta 8 | 9 | import com.stratio.sparta.sdk.lite.common.SpartaUDF 10 | import org.apache.spark.sql.expressions.UserDefinedFunction 11 | import org.apache.spark.sql.functions.udf 12 | 13 | case class ToUpperCaseUDF() extends SpartaUDF { 14 | 15 | val name = "uppercaseSparta" 16 | 17 | val upper: String => String = _.toUpperCase 18 | 19 | val userDefinedFunction: UserDefinedFunction = udf(upper) 20 | 21 | } 22 | 23 | case class ConcatUDF() extends SpartaUDF { 24 | 25 | val name = "concatSparta" 26 | 27 | val concat: (String, String) => String = { case (str1, str2) => 28 | s"$str1/$str2" 29 | } 30 | 31 | val userDefinedFunction: UserDefinedFunction = udf(concat) 32 | } -------------------------------------------------------------------------------- /rocket-extensions/rocket-3.0.0-SDK/udf/src/main/scala/org/apache/spark/sql/sparta/udf/GetDenseVectorUDF.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
5 | */ 6 | 7 | package org.apache.spark.sql.sparta.udf 8 | 9 | import com.stratio.sparta.sdk.lite.common.SpartaUDF 10 | import org.apache.spark.sql.expressions.UserDefinedFunction 11 | import org.apache.spark.ml.linalg.VectorUDT 12 | import org.apache.spark.sql.functions.udf 13 | 14 | case object VectorUDT extends VectorUDT 15 | 16 | case class GetDenseVectorUDF() extends SpartaUDF { 17 | 18 | val name = "get_vector_ith_element" 19 | 20 | val getVectorElement = (vector: org.apache.spark.ml.linalg.Vector, num: Int) => vector(num) 21 | 22 | val userDefinedFunction: UserDefinedFunction = udf(getVectorElement) 23 | } 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /rocket-python-extensions/conda-pack-extensions/conda.yaml: -------------------------------------------------------------------------------- 1 | name: conda-pack-env 2 | 3 | channels: 4 | - conda-forge 5 | - nodefaults 6 | 7 | dependencies: 8 | - python=3.9.7 -------------------------------------------------------------------------------- /rocket-python-extensions/conda-pack-extensions/do_conda_pack.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Current script directory 4 | DIR="$(dirname "$(readlink -f "$0")")" 5 | cd $DIR 6 | 7 | # Parsing input parameters 8 | while [[ $# -gt 0 ]]; do 9 | case "$1" in 10 | -n) 11 | env_name="$2" 12 | ;; 13 | *) 14 | printf "***************************\n" 15 | printf "* Error: Invalid argument.*\n" 16 | printf "***************************\n" 17 | exit 1 18 | esac 19 | shift 20 | shift 21 | done 22 | 23 | env_name=${env_name:-"my_env"} 24 | echo "Conda environment name: $env_name" 25 | 26 | # Conda yaml path 27 | conda_yaml_path="$DIR/conda.yaml" 28 | 29 | # Creating target directories 30 | target_dir="$DIR/target" 31 | target_conda_dir="$target_dir/$env_name" 32 | mkdir $target_dir 33 | 34 | # Creating conda env 35 | conda env create -f 
$conda_yaml_path -p $target_conda_dir 36 | 37 | # Packaging conda env 38 | packaged_conda_env="$target_dir/$env_name.tar.gz" 39 | 40 | conda pack -p $target_conda_dir -o $packaged_conda_env 41 | 42 | # Removing conda env 43 | conda env remove -p $target_conda_dir -------------------------------------------------------------------------------- /rocket-python-extensions/private-pypi-repository/Readme.md: -------------------------------------------------------------------------------- 1 | 2 | # Empaquetarlo en tar.gz 3 | 4 | python3 setup.py sdist 5 | 6 | # Usarlo en un workflow 7 | 8 | Conda.yaml 9 | 10 | ``` 11 | name: rocket-custom 12 | 13 | channels: 14 | - defaults 15 | 16 | dependencies: 17 | - python=3.7.6 18 | - pip=20.2.2 19 | - pip: 20 | - mlflow==1.15.0 21 | - pyspark==3.1.1 22 | - pyarrow==5.0.0 23 | - scikit-learn==0.22.1 24 | - rocket-python-examples==0.1.0 25 | ``` 26 | 27 | En un step de PySpark: 28 | 29 | ``` 30 | from pyspark.sql import * 31 | from pyspark.sql.functions import * 32 | from pyspark.sql.types import * 33 | 34 | def pyspark_transform(spark, df, param_dict): 35 | 36 | from rocket_python_examples.test import dummy_func 37 | 38 | convertUDF = udf(lambda z: dummy_func(z)) 39 | 40 | return df.withColumn("driverPython", lit(dummy_func("python"))).withColumn("executorPython", convertUDF(df["class"])) 41 | ``` 42 | 43 | -------------------------------------------------------------------------------- /rocket-python-extensions/private-pypi-repository/dist/rocket_python_examples-0.1.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/private-pypi-repository/dist/rocket_python_examples-0.1.0.tar.gz -------------------------------------------------------------------------------- /rocket-python-extensions/private-pypi-repository/rocket_python_examples/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/private-pypi-repository/rocket_python_examples/__init__.py -------------------------------------------------------------------------------- /rocket-python-extensions/private-pypi-repository/rocket_python_examples/test.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | def dummy_func(x): 4 | return "Dummy code - {}".format(str(x)) 5 | -------------------------------------------------------------------------------- /rocket-python-extensions/private-pypi-repository/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.6 2 | # 3 | # © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 4 | # 5 | # This software – including all its source code – contains proprietary 6 | # information of Stratio Big Data Inc., Sucursal en España and 7 | # may not be revealed, sold, transferred, modified, distributed or 8 | # otherwise made available, licensed or sublicensed to third parties; 9 | # nor reverse engineered, disassembled or decompiled, without express 10 | # written authorization from Stratio Big Data Inc., Sucursal en España. 
11 | # 12 | 13 | import os 14 | 15 | from setuptools import setup 16 | 17 | pjoin = os.path.join 18 | 19 | here = os.path.abspath(os.path.dirname(__file__)) 20 | 21 | packages = [] 22 | for d, _, _ in os.walk('rocket_python_examples'): 23 | if os.path.exists(pjoin(d, '__init__.py')): 24 | packages.append(d.replace(os.path.sep, '.')) 25 | 26 | 27 | def setup_package(): 28 | metadata = dict( 29 | name='rocket_python_examples', 30 | packages=packages, 31 | description="""Rocket python examples""", 32 | long_description="Stratio Rocket - python examples", 33 | author="Stratio Rocket", 34 | platforms="Linux", 35 | install_requires=[], 36 | version="0.1.0", 37 | keywords=['Rocket', 'Python'], 38 | classifiers=['Programming Language :: Python :: 3.7'], 39 | ) 40 | 41 | setup(**metadata) 42 | 43 | 44 | if __name__ == '__main__': 45 | setup_package() 46 | -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/same_module_test/make_packages.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Current script directory 4 | DIR="$(dirname "$(readlink -f "$0")")" 5 | cd $DIR 6 | 7 | ./module_build_1/do_package.sh 8 | ./module_build_2/do_package.sh -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/same_module_test/module_build_1/do_package.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Current script directory 4 | DIR="$(dirname "$(readlink -f "$0")")" 5 | cd $DIR 6 | 7 | # Packaging - Zipping module 8 | zip -r user1_module.zip my_module 9 | 10 | # Move packaged artifact to parent folder 11 | mv $DIR/user1_module.zip $DIR/../. 
-------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/same_module_test/module_build_1/my_module/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/same_module_test/module_build_1/my_module/__init__.py -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/same_module_test/module_build_1/my_module/user.py: -------------------------------------------------------------------------------- 1 | 2 | def user_func(x): 3 | return f"Hello '{x}' I'm user1" 4 | -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/same_module_test/module_build_2/do_package.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Current script directory 4 | DIR="$(dirname "$(readlink -f "$0")")" 5 | cd $DIR 6 | 7 | # Packaging - Zipping module 8 | zip -r user2_module.zip my_module 9 | 10 | # Move packaged artifact to parent folder 11 | mv $DIR/user2_module.zip $DIR/../. 
-------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/same_module_test/module_build_2/my_module/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/same_module_test/module_build_2/my_module/__init__.py -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/same_module_test/module_build_2/my_module/user.py: -------------------------------------------------------------------------------- 1 | 2 | def user_func(x): 3 | return f"Hello '{x}' I'm user2" 4 | -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/same_module_test/user1_module.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/same_module_test/user1_module.zip -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/same_module_test/user2_module.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/same_module_test/user2_module.zip -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/make_packages.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Current script directory 4 | DIR="$(dirname "$(readlink -f "$0")")" 5 | cd $DIR 6 | 7 | ./test_pyfile_egg_from_hdfs/do_package.sh 8 | ./test_pyfile_egg_from_http/do_package.sh 9 | ./test_pyfile_zip_from_hdfs/do_package.sh 10 | ./test_pyfile_zip_from_http/do_package.sh -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_egg_from_hdfs/do_package.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Current script directory 4 | DIR="$(dirname "$(readlink -f "$0")")" 5 | cd $DIR 6 | 7 | # Packaging 8 | python3 setup.py sdist bdist_egg 9 | 10 | # Move packaged artifact to parent folder 11 | mv $DIR/dist/test_pyfile_egg_pkg_from_hdfs-0.1.0-py*.egg $DIR/../. 12 | 13 | # Remove folders created during packaging 14 | rm -rf build 15 | rm -rf dist 16 | rm -rf test_pyfile_egg_pkg_from_hdfs.egg-info -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_egg_from_hdfs/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='test_pyfile_egg_pkg_from_hdfs', 5 | version='0.1.0', 6 | description='A short description', 7 | long_description='A long description', 8 | author='Rocket', 9 | author_email='rocket@stratio.com', 10 | packages=find_packages(exclude=['*tests*']), 11 | ) 12 | -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_egg_from_hdfs/test_pyfile_egg_pkg_from_hdfs/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_egg_from_hdfs/test_pyfile_egg_pkg_from_hdfs/__init__.py -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_egg_from_hdfs/test_pyfile_egg_pkg_from_hdfs/test_pyfile_egg.py: -------------------------------------------------------------------------------- 1 | 2 | def func_test_pyfile_egg(x): 3 | return f"Hello '{x}' from egg pkg placed in Hdfs - V1" 4 | -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_egg_from_http/do_package.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Current script directory 4 | DIR="$(dirname "$(readlink -f "$0")")" 5 | cd $DIR 6 | 7 | # Packaging 8 | python3 setup.py sdist bdist_egg 9 | 10 | # Move packaged artifact to parent folder 11 | mv $DIR/dist/test_pyfile_egg_pkg_from_http-0.1.0-py*.egg $DIR/../. 
12 | 13 | # Remove folders created during packaging 14 | rm -rf build 15 | rm -rf dist 16 | rm -rf test_pyfile_egg_pkg_from_http.egg-info -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_egg_from_http/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='test_pyfile_egg_pkg_from_http', 5 | version='0.1.0', 6 | description='A short description', 7 | long_description='A long description', 8 | author='Rocket', 9 | author_email='rocket@stratio.com', 10 | packages=find_packages(exclude=['*tests*']), 11 | ) 12 | -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_egg_from_http/test_pyfile_egg_pkg_from_http/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_egg_from_http/test_pyfile_egg_pkg_from_http/__init__.py -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_egg_from_http/test_pyfile_egg_pkg_from_http/test_pyfile_egg.py: -------------------------------------------------------------------------------- 1 | 2 | def func_test_pyfile_egg(x): 3 | return f"Hello '{x}' from egg pkg placed in a Http server - V1" 4 | -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_egg_pkg_from_hdfs-0.1.0-py3.7.egg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_egg_pkg_from_hdfs-0.1.0-py3.7.egg -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_egg_pkg_from_http-0.1.0-py3.7.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_egg_pkg_from_http-0.1.0-py3.7.egg -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_from_hdfs.py: -------------------------------------------------------------------------------- 1 | 2 | def func_test_pyfile_script(x): 3 | return f"Hello '{x}' from python script placed in Hdfs - V1" 4 | -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_from_http.py: -------------------------------------------------------------------------------- 1 | 2 | def func_test_pyfile_script(x): 3 | return f"Hello '{x}' from python script placed in a Http server - V1" 4 | -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_zip_from_hdfs/do_package.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Current script directory 4 | DIR="$(dirname "$(readlink -f "$0")")" 5 | cd $DIR 6 | 7 | # Packaging - 
Zipping module 8 | zip -r test_pyfile_zip_pkg_from_hdfs.zip test_pyfile_zip_pkg_from_hdfs 9 | 10 | # Move packaged artifact to parent folder 11 | mv $DIR/test_pyfile_zip_pkg_from_hdfs.zip $DIR/../. -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_zip_from_hdfs/test_pyfile_zip_pkg_from_hdfs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_zip_from_hdfs/test_pyfile_zip_pkg_from_hdfs/__init__.py -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_zip_from_hdfs/test_pyfile_zip_pkg_from_hdfs/test_pyfile_zip.py: -------------------------------------------------------------------------------- 1 | 2 | def func_test_pyfile_zip(x): 3 | return f"Hello '{x}' from python zip pkg placed in Hdfs - V1" 4 | -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_zip_from_http/do_package.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Current script directory 4 | DIR="$(dirname "$(readlink -f "$0")")" 5 | cd $DIR 6 | 7 | # Packaging - Zipping module 8 | zip -r test_pyfile_zip_pkg_from_http.zip test_pyfile_zip_pkg_from_http 9 | 10 | # Move packaged artifact to parent folder 11 | mv $DIR/test_pyfile_zip_pkg_from_http.zip $DIR/../. 
-------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_zip_from_http/test_pyfile_zip_pkg_from_http/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_zip_from_http/test_pyfile_zip_pkg_from_http/__init__.py -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_zip_from_http/test_pyfile_zip_pkg_from_http/test_pyfile_zip.py: -------------------------------------------------------------------------------- 1 | 2 | def func_test_pyfile_zip(x): 3 | return f"Hello '{x}' from python zip pkg placed in a Http server - V1" 4 | -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_zip_pkg_from_hdfs.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_zip_pkg_from_hdfs.zip -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_zip_pkg_from_http.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v1/test_pyfile_zip_pkg_from_http.zip 
-------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/make_packages.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Current script directory 4 | DIR="$(dirname "$(readlink -f "$0")")" 5 | cd $DIR 6 | 7 | ./test_pyfile_egg_from_hdfs/do_package.sh 8 | ./test_pyfile_egg_from_http/do_package.sh 9 | ./test_pyfile_zip_from_hdfs/do_package.sh 10 | ./test_pyfile_zip_from_http/do_package.sh -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_egg_from_hdfs/do_package.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Current script directory 4 | DIR="$(dirname "$(readlink -f "$0")")" 5 | cd $DIR 6 | 7 | # Packaging 8 | python3 setup.py sdist bdist_egg 9 | 10 | # Move packaged artifact to parent folder 11 | mv $DIR/dist/test_pyfile_egg_pkg_from_hdfs-0.1.0-py*.egg $DIR/../. 
12 | 13 | # Remove folders created during packaging 14 | rm -rf build 15 | rm -rf dist 16 | rm -rf test_pyfile_egg_pkg_from_hdfs.egg-info -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_egg_from_hdfs/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='test_pyfile_egg_pkg_from_hdfs', 5 | version='0.1.0', 6 | description='A short description', 7 | long_description='A long description', 8 | author='Rocket', 9 | author_email='rocket@stratio.com', 10 | packages=find_packages(exclude=['*tests*']), 11 | ) 12 | -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_egg_from_hdfs/test_pyfile_egg_pkg_from_hdfs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_egg_from_hdfs/test_pyfile_egg_pkg_from_hdfs/__init__.py -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_egg_from_hdfs/test_pyfile_egg_pkg_from_hdfs/test_pyfile_egg.py: -------------------------------------------------------------------------------- 1 | 2 | def func_test_pyfile_egg(x): 3 | return f"Hello '{x}' from egg pkg placed in Hdfs - V2" 4 | -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_egg_from_http/do_package.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Current script directory 4 | DIR="$(dirname "$(readlink -f "$0")")" 5 | cd $DIR 6 | 7 | # Packaging 8 | python3 setup.py sdist bdist_egg 9 | 10 | # Move packaged artifact to parent folder 11 | mv $DIR/dist/test_pyfile_egg_pkg_from_http-0.1.0-py*.egg $DIR/../. 12 | 13 | # Remove folders created during packaging 14 | rm -rf build 15 | rm -rf dist 16 | rm -rf test_pyfile_egg_pkg_from_http.egg-info -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_egg_from_http/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='test_pyfile_egg_pkg_from_http', 5 | version='0.1.0', 6 | description='A short description', 7 | long_description='A long description', 8 | author='Rocket', 9 | author_email='rocket@stratio.com', 10 | packages=find_packages(exclude=['*tests*']), 11 | ) 12 | -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_egg_from_http/test_pyfile_egg_pkg_from_http/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_egg_from_http/test_pyfile_egg_pkg_from_http/__init__.py -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_egg_from_http/test_pyfile_egg_pkg_from_http/test_pyfile_egg.py: -------------------------------------------------------------------------------- 1 | 
2 | def func_test_pyfile_egg(x): 3 | return f"Hello '{x}' from egg pkg placed in a Http server - V2" 4 | -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_egg_pkg_from_hdfs-0.1.0-py3.7.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_egg_pkg_from_hdfs-0.1.0-py3.7.egg -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_egg_pkg_from_http-0.1.0-py3.7.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_egg_pkg_from_http-0.1.0-py3.7.egg -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_from_hdfs.py: -------------------------------------------------------------------------------- 1 | 2 | def func_test_pyfile_script(x): 3 | return f"Hello '{x}' from python script placed in Hdfs - V2" 4 | -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_from_http.py: -------------------------------------------------------------------------------- 1 | 2 | def func_test_pyfile_script(x): 3 | return f"Hello '{x}' from python script placed in a Http server - V2" 4 | -------------------------------------------------------------------------------- 
/rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_zip_from_hdfs/do_package.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Current script directory 4 | DIR="$(dirname "$(readlink -f "$0")")" 5 | cd $DIR 6 | 7 | # Packaging - Zipping module 8 | zip -r test_pyfile_zip_pkg_from_hdfs.zip test_pyfile_zip_pkg_from_hdfs 9 | 10 | # Move packaged artifact to parent folder 11 | mv $DIR/test_pyfile_zip_pkg_from_hdfs.zip $DIR/../. -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_zip_from_hdfs/test_pyfile_zip_pkg_from_hdfs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_zip_from_hdfs/test_pyfile_zip_pkg_from_hdfs/__init__.py -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_zip_from_hdfs/test_pyfile_zip_pkg_from_hdfs/test_pyfile_zip.py: -------------------------------------------------------------------------------- 1 | 2 | def func_test_pyfile_zip(x): 3 | return f"Hello '{x}' from python zip pkg placed in Hdfs - V2" 4 | -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_zip_from_http/do_package.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Current script directory 4 | DIR="$(dirname "$(readlink -f "$0")")" 5 | cd $DIR 6 | 7 | # Packaging - Zipping module 8 | zip -r test_pyfile_zip_pkg_from_http.zip 
test_pyfile_zip_pkg_from_http 9 | 10 | # Move packaged artifact to parent folder 11 | mv $DIR/test_pyfile_zip_pkg_from_http.zip $DIR/../. -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_zip_from_http/test_pyfile_zip_pkg_from_http/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_zip_from_http/test_pyfile_zip_pkg_from_http/__init__.py -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_zip_from_http/test_pyfile_zip_pkg_from_http/test_pyfile_zip.py: -------------------------------------------------------------------------------- 1 | 2 | def func_test_pyfile_zip(x): 3 | return f"Hello '{x}' from python zip pkg placed in a Http server - V2" 4 | -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_zip_pkg_from_hdfs.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_zip_pkg_from_hdfs.zip -------------------------------------------------------------------------------- /rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_zip_pkg_from_http.zip: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-python-extensions/pyspark-native-extensions/qa-examples/version_test/v2/test_pyfile_zip_pkg_from_http.zip -------------------------------------------------------------------------------- /rocket-spark-ml-custom-stages/rocket-pyspark-ml/create_pipeline_estimator.py: -------------------------------------------------------------------------------- 1 | from pyspark.ml import Pipeline 2 | from pyspark.sql import SparkSession 3 | 4 | from rocket_pyspark_ml.simple_custom_estimator import NormalDeviation 5 | 6 | spark = SparkSession.builder \ 7 | .master("local") \ 8 | .appName("test") \ 9 | .getOrCreate() 10 | 11 | df = spark.sparkContext.parallelize([(1, 2.0), (2, 3.0), (3, 0.0), (4, 99.0)]).toDF(["id", "x"]) 12 | 13 | normal_deviation = NormalDeviation().setInputCol("x").setCenteredThreshold(1.0) 14 | 15 | pipeline = Pipeline(stages=[normal_deviation]) 16 | 17 | model = pipeline.fit(df) 18 | 19 | out_df = model.transform(df) 20 | out_df.show() 21 | 22 | pipeline.write().overwrite().save("/tmp/my_custom_pipeline") 23 | model.write().overwrite().save("/tmp/my_custom_model_from_custom_pipeline") 24 | -------------------------------------------------------------------------------- /rocket-spark-ml-custom-stages/rocket-pyspark-ml/create_pipeline_model.py: -------------------------------------------------------------------------------- 1 | from pyspark.ml import Pipeline, PipelineModel 2 | from pyspark.ml.classification import LogisticRegression 3 | from pyspark.ml.feature import HashingTF, Tokenizer 4 | from pyspark.sql import SparkSession 5 | 6 | from rocket_pyspark_ml.simple_custom_transformer import LiteralColumnAdder 7 | 8 | spark = SparkSession.builder \ 9 | .master("local") \ 10 | .appName("test") \ 11 | .getOrCreate() 12 | 13 | # Prepare training documents from a list of (id, text, label) tuples. 
14 | df = spark.createDataFrame([ 15 | (0, "a b c d e spark", 1.0), 16 | (1, "b d", 0.0), 17 | (2, "spark f g h", 1.0), 18 | (3, "hadoop mapreduce", 3.0) 19 | ], ["id", "text", "label"]) 20 | 21 | # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. 22 | tokenizer = Tokenizer(inputCol="text", outputCol="words") 23 | hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000) 24 | lr = LogisticRegression(maxIter=10, regParam=0.001) 25 | 26 | # Custom transformer 27 | custom = LiteralColumnAdder() 28 | 29 | 30 | sub_pipeline = Pipeline(stages=[custom, tokenizer, hashingTF, lr]) 31 | model = sub_pipeline.fit(df) 32 | 33 | model.write().overwrite().save("/tmp/my_custom_model") 34 | 35 | loaded_model = PipelineModel.load("/tmp/my_custom_model") 36 | 37 | loaded_model.transform(df).show() -------------------------------------------------------------------------------- /rocket-spark-ml-custom-stages/rocket-pyspark-ml/dist/rocket_pyspark_ml-0.1.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-spark-ml-custom-stages/rocket-pyspark-ml/dist/rocket_pyspark_ml-0.1.0.tar.gz -------------------------------------------------------------------------------- /rocket-spark-ml-custom-stages/rocket-pyspark-ml/readme.md: -------------------------------------------------------------------------------- 1 | 2 | ### Python package with custom PySpark estimator and transformer 3 | 4 | #### How to upload to Nexus 5 | 6 | Upload → utilidad twine 7 | 8 | pip instal twine 9 | 10 | Fichero de configuración para apuntar a repositorio externo: 11 | 12 | gedit ~/.pypirc 13 | 14 | [distutils] 15 | index-servers = pypi 16 | [pypi] 17 | repository: https://nexus.s000001.xray.labs.stratio.com/repository/rocket-pip-internal/ 18 | username: admin 19 | password: 1234 20 | 21 | Por linea de 
comandos: 22 | 23 | twine upload XXX.tar.gz --cert ~/workspace/entornos/xray/ca.crt --verbose 24 | 25 | -------------------------------------------------------------------------------- /rocket-spark-ml-custom-stages/rocket-pyspark-ml/rocket_pyspark_ml/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-spark-ml-custom-stages/rocket-pyspark-ml/rocket_pyspark_ml/__init__.py -------------------------------------------------------------------------------- /rocket-spark-ml-custom-stages/rocket-pyspark-ml/rocket_pyspark_ml/simple_custom_transformer.py: -------------------------------------------------------------------------------- 1 | from pyspark import keyword_only 2 | from pyspark.ml import Transformer 3 | from pyspark.ml.param.shared import * 4 | from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable 5 | from pyspark.sql import DataFrame 6 | from pyspark.sql.functions import lit 7 | 8 | 9 | class HasLiteralValue(Params): 10 | literalValue = Param( 11 | Params._dummy(), "literalValue", "literalValue", typeConverter=TypeConverters.toFloat 12 | ) 13 | 14 | def __init__(self): 15 | super(HasLiteralValue, self).__init__() 16 | self._setDefault(literalValue=1.0) 17 | 18 | def setLiteralValue(self, value): 19 | return self._set(literalValue=value) 20 | 21 | def getLiteralValue(self): 22 | return self.getOrDefault(self.literalValue) 23 | 24 | 25 | class LiteralColumnAdder(Transformer, HasLiteralValue, HasOutputCol, DefaultParamsReadable, DefaultParamsWritable): 26 | """ 27 | A custom Transformer which drops all columns that have at least one of the 28 | words from the banned_list in the name. 
29 | """ 30 | 31 | @keyword_only 32 | def __init__(self): 33 | super(LiteralColumnAdder, self).__init__() 34 | 35 | def _transform(self, df: DataFrame) -> DataFrame: 36 | return df.withColumn(self.getOutputCol(), lit(self.getLiteralValue())) 37 | -------------------------------------------------------------------------------- /rocket-spark-ml-custom-stages/rocket-pyspark-ml/rocket_pyspark_ml/test.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | def dummy_func(x): 4 | return "Dummy code - {}".format(str(x)) 5 | -------------------------------------------------------------------------------- /rocket-spark-ml-custom-stages/rocket-pyspark-ml/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.6 2 | # 3 | # © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 4 | # 5 | # This software – including all its source code – contains proprietary 6 | # information of Stratio Big Data Inc., Sucursal en España and 7 | # may not be revealed, sold, transferred, modified, distributed or 8 | # otherwise made available, licensed or sublicensed to third parties; 9 | # nor reverse engineered, disassembled or decompiled, without express 10 | # written authorization from Stratio Big Data Inc., Sucursal en España. 
11 | # 12 | 13 | import os 14 | 15 | from setuptools import setup 16 | 17 | pjoin = os.path.join 18 | 19 | here = os.path.abspath(os.path.dirname(__file__)) 20 | 21 | packages = [] 22 | for d, _, _ in os.walk('rocket_pyspark_ml'): 23 | if os.path.exists(pjoin(d, '__init__.py')): 24 | packages.append(d.replace(os.path.sep, '.')) 25 | 26 | 27 | def setup_package(): 28 | metadata = dict( 29 | name='rocket_pyspark_ml', 30 | packages=packages, 31 | description="""Rocket Pyspark ml""", 32 | long_description="Stratio Rocket - PySpark ml custom stages", 33 | author="Stratio Rocket", 34 | platforms="Linux", 35 | install_requires=[], 36 | version="0.1.0", 37 | keywords=['Rocket', 'PySpark', "ML"], 38 | classifiers=['Programming Language :: Python :: 3.7'], 39 | ) 40 | 41 | setup(**metadata) 42 | 43 | 44 | if __name__ == '__main__': 45 | setup_package() 46 | -------------------------------------------------------------------------------- /rocket-spark-ml-custom-stages/rocket-spark-ml/dist/rocketSparkMl-0.1.0-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/rocket-examples/45faf6ee9cc9c74373cf9bc7114f3016c594c763/rocket-spark-ml-custom-stages/rocket-spark-ml/dist/rocketSparkMl-0.1.0-SNAPSHOT.jar -------------------------------------------------------------------------------- /rocket-spark-ml-custom-stages/rocket-spark-ml/src/main/scala/org/apache/spark/ml/rocket/features/SimpleCustomTransformer.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.rocket.features 2 | 3 | import org.apache.spark.ml.Transformer 4 | import org.apache.spark.ml.param.{Param, ParamMap} 5 | import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.types._ 8 | import org.apache.spark.sql.{DataFrame, Dataset} 9 | 10 | /* 11 | * © 2017 Stratio 
Big Data Inc., Sucursal en España. All rights reserved. 12 | * 13 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 14 | */ 15 | 16 | 17 | 18 | class ConfigurableWordCount( 19 | override val uid: String 20 | ) extends Transformer with DefaultParamsWritable { 21 | 22 | def this() = this(Identifiable.randomUID("configurablewordcount")) 23 | 24 | // ----------------- 25 | // Parameters 26 | // ----------------- 27 | 28 | final val inputCol = new Param[String](this, "inputCol", "The input column") 29 | final val outputCol = new Param[String](this, "outputCol", "The output column") 30 | 31 | def setInputCol(value: String): this.type = set(inputCol, value) 32 | 33 | def setOutputCol(value: String): this.type = set(outputCol, value) 34 | 35 | override def transformSchema(schema: StructType): StructType = { 36 | // Check that the input type is a string 37 | val idx = schema.fieldIndex($(inputCol)) 38 | val field = schema.fields(idx) 39 | if (field.dataType != StringType) { 40 | throw new Exception(s"Input type ${field.dataType} did not match input type StringType") 41 | } 42 | // Add the return field 43 | schema.add(StructField($(outputCol), IntegerType, false)) 44 | } 45 | 46 | def transform(df: Dataset[_]): DataFrame = { 47 | val wordcount = udf { in: String => in.split(" ").length } 48 | df.select(col("*"), wordcount(df.col($(inputCol))).as($(outputCol))) 49 | } 50 | 51 | def copy(extra: ParamMap): ConfigurableWordCount = { 52 | defaultCopy(extra) 53 | } 54 | } 55 | 56 | object ConfigurableWordCount extends DefaultParamsReadable[ConfigurableWordCount]{ 57 | 58 | override def load(path: String): ConfigurableWordCount = 
super.load(path) 59 | } 60 | -------------------------------------------------------------------------------- /rocket-spark-ml-custom-stages/rocket-spark-ml/src/test/scala/org/apache/spark/ml/rocket/RocketSparkMlBeforeAndAfterAll.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | 7 | package org.apache.spark.ml.rocket 8 | 9 | import org.apache.spark.SparkContext 10 | import org.apache.spark.sql.SparkSession 11 | import org.scalatest.{BeforeAndAfterAll, Suite} 12 | 13 | 14 | trait RocketSparkMlBeforeAndAfterAll extends BeforeAndAfterAll { self: Suite => 15 | 16 | @transient var spark: SparkSession = _ 17 | @transient var sc: SparkContext = _ 18 | //@transient var sqlContext: SQLContext = _ 19 | 20 | override def beforeAll() { 21 | super.beforeAll() 22 | val sparkMasterIp = System.getProperty("spark.master", "local[2]") 23 | spark = SparkSession 24 | .builder().master(sparkMasterIp) 25 | .appName("RocketSparkMlUnitTest") 26 | .getOrCreate() 27 | sc = spark.sparkContext 28 | } 29 | 30 | override def afterAll() { 31 | if(spark != null) 32 | spark.stop() 33 | super.afterAll() 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /rocket-spark-ml-custom-stages/rocket-spark-ml/src/test/scala/org/apache/spark/ml/rocket/RocketSparkMlFunSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. 
All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | 7 | package org.apache.spark.ml.rocket 8 | 9 | import org.scalatest.{FunSuite, Outcome} 10 | 11 | 12 | abstract class RocketSparkMlFunSuite extends FunSuite with RocketSparkMlBeforeAndAfterAll { 13 | 14 | /** 15 | * Log the suite name and the test name before and after each test. 16 | * 17 | * Subclasses should never override this method. If they wish to run 18 | * custom code before and after each test, they should mix in the 19 | * {{org.scalatest.BeforeAndAfter}} trait instead. 20 | */ 21 | 22 | final protected override def withFixture(test: NoArgTest): Outcome = { 23 | val testName = test.text 24 | val suiteName = this.getClass.getName 25 | val shortSuiteName = suiteName.replaceAll("com.stratio.intelligence", "c.s.i") 26 | try { 27 | print(s"\n\n===== TEST OUTPUT FOR $shortSuiteName: '$testName' =====\n") 28 | test() 29 | 30 | } finally { 31 | print(s"\n\n===== FINISHED $shortSuiteName: '$testName' =====\n") 32 | } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /rocket-spark-ml-custom-stages/rocket-spark-ml/src/test/scala/org/apache/spark/ml/rocket/features/SimpleCustomEstimatorTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 
3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | 7 | package org.apache.spark.ml.rocket.features 8 | 9 | import org.apache.spark.ml.{Pipeline, PipelineModel} 10 | import org.apache.spark.ml.classification.LogisticRegression 11 | import org.apache.spark.ml.feature.{HashingTF, Tokenizer} 12 | import org.apache.spark.ml.rocket.RocketSparkMlFunSuite 13 | import org.junit.runner.RunWith 14 | import org.scalatestplus.junit.JUnitRunner 15 | 16 | @RunWith(classOf[JUnitRunner]) 17 | class SimpleCustomEstimatorTest extends RocketSparkMlFunSuite{ 18 | 19 | test("Example"){ 20 | 21 | // => Prepare training documents from a list of (id, text, label) tuples. 
#!/usr/bin/env python3
"""Convert a Rocket debug (Spark JSON) schema into a Scala or PySpark schema literal."""

import argparse
import json


def true_str(language: str) -> str:
    # Nullable flag literal: Scala 'true' vs Python 'True'.
    return 'true' if language == 'scala' else 'True'


def other_type(type_str: str, language: str) -> str:
    # Primitive type, e.g. 'string' -> 'StringType' (Scala) or 'StringType()' (Python).
    return f'{type_str.capitalize()}Type{"()" if language == "python" else ""}'


def to_array(fields: list, language: str) -> str:
    # Field collection literal: Seq(...) for Scala, [...] for Python.
    return f'Seq({",".join(fields)})' if language == 'scala' else f'[{",".join(fields)}]'


def struct_type(fields: list, language: str, name: str = None) -> str:
    """Render a struct: a named StructField wrapping a StructType when `name`
    is given, a bare StructType otherwise (root / array-element structs)."""
    array_str = to_array(fields, language)
    if name is not None:
        return f'StructField("{name}", StructType({array_str}), {true_str(language)})'
    return f'StructType({array_str})'


def obtain_schema(node: dict, language: str) -> str:
    """Recursively render one schema node.

    `node` is either the root struct ({"type": "struct", "fields": [...]}) or a
    field entry ({"name": ..., "type": <str or dict>, ...}).

    FIX: replaced `type(x) == dict` with `isinstance`, and made an unsupported
    nested complex type raise ValueError instead of silently returning None
    (which previously leaked the string "None" into the generated schema).
    """
    internal_node = node.get('type')
    if internal_node == 'struct':
        # Root (or array-element) struct: render its fields without a wrapping name.
        fields = [obtain_schema(elem, language)
                  for elem in node.get('fields', [])]
        return struct_type(fields, language)
    if isinstance(internal_node, dict):
        # Named field whose type is complex: a nested struct or an array.
        if internal_node.get('type') == 'struct':
            fields = [obtain_schema(elem, language)
                      for elem in internal_node.get('fields', [])]
            return struct_type(fields, language, node.get("name"))
        if internal_node.get('type') == 'array':
            element_type = internal_node.get('elementType')
            if isinstance(element_type, dict):
                # Complex element type (e.g. array of structs): recurse.
                type_str = obtain_schema(element_type, language)
            else:
                type_str = other_type(element_type, language)
            return f'StructField("{node.get("name")}", ArrayType({type_str}), {true_str(language)})'
        raise ValueError(f'Unsupported complex type: {internal_node.get("type")!r}')
    # Named field with a primitive type.
    return f'StructField("{node.get("name")}", {other_type(internal_node, language)}, {true_str(language)})'


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Convert Rocket debug schema to Scala/Python schema')
    parser.add_argument(
        "--input",
        type=str,
        help="Input file containing debug schema",
        required=True
    )
    parser.add_argument(
        "--language",
        type=str,
        default='scala',
        help="Language of the schema, must be 'scala' or 'python'",
        choices=['python', 'scala']
    )
    args = parser.parse_args()
    with open(args.input) as file:
        print(obtain_schema(json.load(file), args.language))
#!/usr/bin/env python3
"""Obtain the SSO cookies needed to talk to Marathon by scripting the GoSec SSO login flow."""

import argparse
import requests
from bs4 import BeautifulSoup
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Disable InsecureRequestWarning from the logs: requests are deliberately made
# with verify=False (internal endpoints with self-signed certificates).
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# Only these cookies from the login response are needed for Marathon requests.
needed_cookies = ["user", "mesosphere_server_id"]


def login_sso(url, username, password):
    """
    Function that simulates the login in to sparta endpoint flow with SSO to obtain a valid
    cookie that will be used to make requests to Marathon
    """
    # First request to mesos master to be redirected to gosec sso login
    # page and be given a session cookie
    r = requests.Session()
    first_response = r.get(url, verify=False)
    callback_url = first_response.url

    # Parse response body for hidden tags needed in the data of our login post request.
    body = first_response.text
    all_tags = BeautifulSoup(body, "lxml").find_all("input", type="hidden")
    tags_to_find = ['lt', 'execution']
    # FIX: use tag.get('name') so hidden inputs without a 'name' attribute are
    # skipped instead of raising KeyError.
    hidden_tags = [tag.attrs for tag in all_tags if tag.get('name') in tags_to_find]
    data = {tag['name']: tag['value'] for tag in hidden_tags}

    # Add the rest of needed fields and login credentials in the data of
    # our login post request and send it
    data.update({
        '_eventId': 'submit',
        'submit': 'LOGIN',
        'username': username,
        'password': password,
        'tenant': 'NONE'
    })
    login_response = r.post(callback_url, data=data, verify=False)
    return login_response


def get_cookies(url, usr, passwd):
    """Yield 'name=value' strings for the SSO cookies listed in needed_cookies."""
    for (k, v) in login_sso(url, usr, passwd).request._cookies.items():
        if k in needed_cookies:
            yield "{}={}".format(k, v)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Obtain SSO cookies.')
    parser.add_argument(
        "--user",
        dest="user",
        type=str,
        help="User to login in SSO",
        nargs="?"
    )
    parser.add_argument(
        "--password",
        dest="password",
        type=str,
        help="Password to login in SSO",
        nargs="?"
    )
    parser.add_argument(
        "--url",
        dest="url",
        type=str,
        # FIX: help text was a copy-paste of the --password help.
        help="URL of the SSO-protected endpoint",
        nargs="?"
    )
    args = parser.parse_args()
    cookies = "; ".join(get_cookies(args.url, args.user, args.password))
    print("Cookie: {}".format(cookies))
5 | */ 6 | package com.stratio.sparta 7 | 8 | import com.stratio.sparta.sdk.lite.batch.models._ 9 | import com.stratio.sparta.sdk.lite.validation.ValidationResult 10 | import com.stratio.sparta.sdk.lite.xd.batch._ 11 | import org.apache.spark.sql._ 12 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 13 | import org.apache.spark.sql.crossdata.XDSession 14 | import org.apache.spark.sql.types._ 15 | 16 | class GeneratorXDLiteInputStepBatch( 17 | xdSession: XDSession, 18 | properties: Map[String, String] 19 | ) 20 | extends LiteCustomXDBatchInput(xdSession, properties) { 21 | 22 | lazy val stringSchema = StructType(Seq(StructField("raw", StringType))) 23 | lazy val rawData: Option[String] = properties.get("raw").map(_.toString) 24 | 25 | 26 | override def validate(): ValidationResult = { 27 | var validation = ValidationResult(valid = true, messages = Seq.empty) 28 | 29 | if (rawData.isEmpty) { 30 | validation = ValidationResult( 31 | valid = false, 32 | messages = validation.messages :+ "Test data must be set inside the Option properties with an option key named 'raw'") 33 | } 34 | 35 | if (rawData.map(_.trim).forall(_.isEmpty)) { 36 | validation = ValidationResult( 37 | valid = false, 38 | messages = validation.messages :+ "Generated data cannot be an empty string") 39 | } 40 | validation 41 | } 42 | 43 | override def init(): ResultBatchData = { 44 | val register = Seq(new GenericRowWithSchema(Array("test-data"), stringSchema).asInstanceOf[Row]) 45 | val defaultRDD = xdSession.sparkContext.parallelize(register) 46 | 47 | ResultBatchData(defaultRDD, Option(stringSchema)) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /sparta-plugins/input-lite-xd/src/main/scala/com/stratio/sparta/GeneratorXDLiteInputStepStreaming.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 
3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta 7 | 8 | import org.apache.spark.sql._ 9 | import org.apache.spark.streaming.dstream.DStream 10 | import org.apache.spark.rdd.RDD 11 | import org.apache.spark.sql.functions._ 12 | import com.stratio.sparta.sdk.lite.xd.streaming._ 13 | import com.stratio.sparta.sdk.lite.streaming.models._ 14 | import org.apache.spark.sql.crossdata.XDSession 15 | 16 | import scala.util.{Failure, Success, Try} 17 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 18 | import org.apache.spark.sql.types._ 19 | import org.apache.spark.streaming.StreamingContext 20 | 21 | import scala.collection.mutable 22 | 23 | class GeneratorXDLiteInputStepStreaming( 24 | xdSession: XDSession, 25 | streamingContext: StreamingContext, 26 | properties: Map[String, String] 27 | ) 28 | extends LiteCustomXDStreamingInput(xdSession, streamingContext, properties) { 29 | 30 | lazy val stringSchema = StructType(Seq(StructField("raw", StringType))) 31 | 32 | override def init(): ResultStreamingData = { 33 | val dataQueue = new mutable.Queue[RDD[Row]]() 34 | val register = Seq(new GenericRowWithSchema(Array("test-data"), stringSchema).asInstanceOf[Row]) 35 | dataQueue += xdSession.sparkContext.parallelize(register) 36 | val stream = streamingContext.queueStream(dataQueue) 37 | 38 | ResultStreamingData(stream, Option(stringSchema)) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /sparta-plugins/output-lite-xd/src/main/scala/com/stratio/sparta/LoggerXDLiteOutputStep.scala: 
-------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta 7 | 8 | import com.stratio.sparta.sdk.lite.common.models.OutputOptions 9 | import com.stratio.sparta.sdk.lite.xd.common._ 10 | import org.apache.spark.sql._ 11 | import org.apache.spark.sql.crossdata.XDSession 12 | 13 | import scala.util.{Failure, Success, Try} 14 | 15 | class LoggerXDLiteOutputStep( 16 | xdSession: XDSession, 17 | properties: Map[String, String] 18 | ) 19 | extends LiteCustomXDOutput(xdSession, properties) { 20 | 21 | lazy val metadataEnabled = properties.get("metadataEnabled") match { 22 | case Some(value: String) => Try(value.toBoolean) match { 23 | case Success(v) => v 24 | case Failure(ex) => throw new IllegalStateException(s"$value not parseable to boolean", ex) 25 | } 26 | case None => throw new IllegalStateException("'metadataEnabled' key must be defined in the Option properties") 27 | } 28 | 29 | override def save(data: DataFrame, outputOptions: OutputOptions): Unit = { 30 | val tableName = outputOptions.tableName.getOrElse{ 31 | logger.error("Table name not defined") 32 | throw new NoSuchElementException("tableName not found in options")} 33 | 34 | if (metadataEnabled){ 35 | logger.info(s"Table name: $tableName") 36 | logger.info(s"Save mode is set to ${outputOptions.saveMode}") 37 | } 38 | data.foreach{ row => 39 | println(row.mkString(",")) 40 | } 41 | } 42 | 43 | override def save(data: DataFrame, saveMode: String, saveOptions: 
Map[String, String]): Unit = () 44 | } 45 | -------------------------------------------------------------------------------- /sparta-plugins/transformation-lite-xd/src/main/scala/com/stratio/sparta/properties/ValidatePropertiesMap.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta.properties 7 | 8 | class ValidatePropertiesMap [K, V](val m: Map[K, V]){ 9 | def getString(key: K): String = 10 | m.get(key) match { 11 | case Some(value: String) => value 12 | case Some(value) => value.toString 13 | case None => 14 | throw new IllegalStateException(s"$key is mandatory") 15 | } 16 | 17 | def notBlank(option: Option[String]): Boolean = 18 | option.map(_.trim).forall(_.isEmpty) 19 | } 20 | 21 | class NotBlankOption(s: Option[String]) { 22 | def notBlank: Option[String] = s.map(_.trim).filterNot(_.isEmpty) 23 | } 24 | 25 | object ValidatingPropertyMap{ 26 | implicit def map2ValidatingPropertyMap[K, V](m: Map[K, V]): ValidatePropertiesMap[K, V] = 27 | new ValidatePropertiesMap[K, V](m) 28 | 29 | implicit def option2NotBlankOption(s: Option[String]): NotBlankOption = new NotBlankOption(s) 30 | } -------------------------------------------------------------------------------- /sparta-plugins/transformation-lite-xd/src/main/scala/com/stratio/sparta/transformations/repartition/RepartitionXDLiteTransformStepBatch.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 
Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | package com.stratio.sparta.transformations.repartition 7 | 8 | import java.io.{Serializable => JSerializable} 9 | 10 | import com.stratio.sparta.sdk.lite.batch.models._ 11 | import com.stratio.sparta.sdk.lite.xd.batch._ 12 | import org.apache.spark.sql.crossdata.XDSession 13 | 14 | class RepartitionXDLiteTransformStepBatch( 15 | xdSession: XDSession, 16 | properties: Map[String, String] 17 | ) extends LiteCustomXDBatchTransform(xdSession, properties) { 18 | 19 | override def transform(inputData: Map[String, ResultBatchData]): OutputBatchTransformData = { 20 | val inputStream = inputData.head._2.data 21 | 22 | OutputBatchTransformData(inputStream.repartition(5)) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /sparta-plugins/transformation-lite-xd/src/main/scala/com/stratio/sparta/transformations/repartition/RepartitionXDLiteTransformStepStreaming.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 
package com.stratio.sparta.transformations.repartition

import com.stratio.sparta.sdk.lite.streaming.models._
import com.stratio.sparta.sdk.lite.xd.streaming._
import org.apache.spark.sql.crossdata.XDSession
import org.apache.spark.streaming.StreamingContext

/**
 * Streaming transformation that repartitions every micro-batch of its (single) input.
 *
 * IMPROVEMENT: the partition count was hard-coded to 5; it is now read from the
 * "partitions" option, defaulting to 5 so existing configurations are unchanged
 * (and matching the batch variant of this step).
 */
class RepartitionXDLiteTransformStepStreaming(
                                               xdSession: XDSession,
                                               streamingContext: StreamingContext,
                                               properties: Map[String, String]
                                             ) extends LiteCustomXDStreamingTransform(xdSession, streamingContext, properties) {

  // Target partition count; a non-integer value fails fast with NumberFormatException.
  lazy val partitions: Int = properties.get("partitions").map(_.trim.toInt).getOrElse(5)

  override def transform(inputData: Map[String, ResultStreamingData]): OutputStreamingTransformData = {
    // A single upstream input is expected; repartition each RDD of its stream.
    val newStream = inputData.head._2.data.transform { rdd =>
      rdd.repartition(partitions)
    }

    OutputStreamingTransformData(newStream)
  }
}
5 | */ 6 | 7 | package com.stratio.sparta 8 | 9 | import com.stratio.sparta.sdk.lite.common.SpartaUDF 10 | import org.apache.spark.sql.catalyst.ScalaReflection 11 | import org.apache.spark.sql.expressions.UserDefinedFunction 12 | import org.apache.spark.sql.types.StringType 13 | 14 | import scala.util.Try 15 | 16 | case class ToUpperCaseUDF() extends SpartaUDF { 17 | 18 | val name = "uppercaseSparta" 19 | 20 | val upper: String => String = _.toUpperCase 21 | 22 | val userDefinedFunction: UserDefinedFunction = 23 | UserDefinedFunction(upper , StringType, Option(Seq(StringType))) 24 | } 25 | 26 | case class ConcatUDF() extends SpartaUDF { 27 | 28 | val name = "concatSparta" 29 | 30 | val upper: (String, String) => String = { case (str1, str2) => 31 | s"$str1/$str2" 32 | } 33 | 34 | val userDefinedFunction: UserDefinedFunction = 35 | UserDefinedFunction(upper , StringType, Option(Seq(StringType, StringType))) 36 | } 37 | 38 | case class ToUpperCaseWithReflectionUDF() extends SpartaUDF { 39 | 40 | val name = "upperCaseReflect" 41 | 42 | val upper: String => String = _.toUpperCase 43 | 44 | val userDefinedFunction: UserDefinedFunction = { 45 | val inputTypes = Try(ScalaReflection.schemaFor(ScalaReflection.localTypeOf[String]).dataType :: Nil).toOption 46 | UserDefinedFunction(upper , ScalaReflection.schemaFor(ScalaReflection.localTypeOf[String]).dataType, inputTypes) 47 | } 48 | } -------------------------------------------------------------------------------- /sparta-plugins/udf/src/main/scala/org/apache/spark/sql/sparta/udf/GetDenseVectorUDF.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2017 Stratio Big Data Inc., Sucursal en España. All rights reserved. 
3 | * 4 | * This software – including all its source code – contains proprietary information of Stratio Big Data Inc., Sucursal en España and may not be revealed, sold, transferred, modified, distributed or otherwise made available, licensed or sublicensed to third parties; nor reverse engineered, disassembled or decompiled, without express written authorization from Stratio Big Data Inc., Sucursal en España. 5 | */ 6 | 7 | package org.apache.spark.sql.sparta.udf 8 | 9 | import com.stratio.sparta.sdk.lite.common.SpartaUDF 10 | import org.apache.spark.sql.expressions.UserDefinedFunction 11 | import org.apache.spark.sql.types.{DoubleType, IntegerType} 12 | import org.apache.spark.ml.linalg.VectorUDT 13 | 14 | case object VectorUDT extends VectorUDT 15 | 16 | case class GetDenseVectorUDF() extends SpartaUDF { 17 | 18 | val name = "get_vector_ith_element" 19 | 20 | val getVectorElement = (vector: org.apache.spark.ml.linalg.Vector, num: Int) => vector(num) 21 | 22 | val userDefinedFunction: UserDefinedFunction = 23 | UserDefinedFunction(getVectorElement , DoubleType, Option(Seq(VectorUDT, IntegerType))) 24 | } 25 | 26 | 27 | 28 | --------------------------------------------------------------------------------