├── .gitignore ├── .gitmodules ├── .travis.yml ├── LICENSE ├── Makefile ├── README.md ├── build.sbt ├── log4j.properties ├── project ├── assembly.sbt └── plugins.sbt ├── python ├── .gitignore ├── pyspark_elastic │ ├── __init__.py │ ├── context.py │ ├── rdd.py │ ├── tests.py │ ├── types.py │ └── util.py └── setup.py ├── sbin ├── local.sh └── released.sh ├── src └── main │ └── scala │ ├── pyspark_elastic │ └── PythonHelper.scala │ └── pyspark_util └── version.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # maven / sbt 2 | target 3 | .cache 4 | bin 5 | .sp-creds.txt 6 | 7 | # python 8 | *.pyc 9 | venv* 10 | 11 | # eclipse 12 | .classpath 13 | .project 14 | .pydevproject 15 | .settings 16 | 17 | #testing 18 | lib 19 | .ccm 20 | metastore_db 21 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "pyspark-util"] 2 | path = pyspark-util 3 | url = https://github.com/TargetHolding/pyspark-util.git 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | 3 | scala: 4 | - "2.10.5" 5 | 6 | jdk: 7 | - oraclejdk7 8 | 9 | env: 10 | - SPARK_VERSION=1.4.1 SPARK_PACKAGE_TYPE=hadoop2.6 11 | - SPARK_VERSION=1.5.2 SPARK_PACKAGE_TYPE=hadoop2.6 12 | - SPARK_VERSION=1.6.1 SPARK_PACKAGE_TYPE=hadoop2.6 13 | 14 | addons: 15 | apt: 16 | packages: 17 | - build-essential 18 | - python-dev 19 | - python-pip 20 | - libev4 21 | - libev-dev 22 | 23 | install: 24 | - PYTHONUSERBASE=$HOME/.local pip install --user elasticsearch-dsl 25 | 26 | services: 27 | - elasticsearch 28 | 29 | script: make clean dist test-travis 30 | 31 | sudo: false 32 | 33 | cache: 34 | directories: 35 | - $HOME/.m2 36 | - $HOME/.ivy2 37 | - $HOME/.sbt 38 | - $HOME/.local 39 | - $HOME/.cache/pip 40 | 41 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 
30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
203 | 204 | 205 | THIRD-PARTY DEPENDENCIES 206 | ======================== 207 | Convenience copies of some third-party dependencies are distributed with 208 | Apache Cassandra as Java jar files in lib/. Licensing information for 209 | these files can be found in the lib/licenses directory. 210 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SHELL = /bin/bash 2 | VERSION = $(shell cat version.txt) 3 | 4 | .PHONY: clean clean-pyc clean-dist dist test-travis 5 | 6 | 7 | 8 | clean: clean-dist clean-pyc 9 | 10 | clean-pyc: 11 | find . -name '*.pyc' -exec rm -f {} + 12 | find . -name '*.pyo' -exec rm -f {} + 13 | find . -name '*~' -exec rm -f {} + 14 | find . -name '__pycache__' -exec rm -fr {} + 15 | 16 | clean-dist: 17 | rm -rf target 18 | rm -rf python/build/ 19 | rm -rf python/*.egg-info 20 | 21 | 22 | 23 | install-venv: 24 | test -d venv || virtualenv venv 25 | 26 | install-elastic-driver: install-venv 27 | venv/bin/pip install elasticsearch-dsl 28 | 29 | install-elastic: 30 | export URL=https://download.elasticsearch.org/elasticsearch/release/org/elasticsearch/distribution/tar/elasticsearch/2.1.0/elasticsearch-2.1.0.tar.gz ; \ 31 | mkdir -p lib ; cd lib ; \ 32 | test -d elasticsearch-2.1.0 || curl $$URL | tar xz 33 | 34 | start-elastic: install-elastic 35 | nohup lib/elasticsearch-2.1.0/bin/elasticsearch > elastic.log 2>&1 & echo $$! > elastic.pid 36 | 37 | stop-elastic: 38 | kill `cat elastic.pid` 39 | 40 | 41 | 42 | test: test-python test-scala test-integration 43 | 44 | test-python: 45 | 46 | test-scala: 47 | 48 | 49 | 50 | test-integration: \ 51 | test-integration-setup \ 52 | test-integration-matrix \ 53 | test-integration-teardown 54 | 55 | test-integration-setup: \ 56 | start-elastic 57 | 58 | test-integration-teardown: \ 59 | stop-elastic 60 | 61 | test-integration-matrix: \ 62 | install-elastic-driver \ 63 | test-integration-spark-1.4.1 \ 64 | test-integration-spark-1.5.0 \ 65 | test-integration-spark-1.5.1 \ 66 | test-integration-spark-1.5.2 \ 67 | test-integration-spark-1.6.0 \ 68 | test-integration-spark-1.6.1 69 | 70 | test-travis: 71 | $(call test-integration-for-version,$$SPARK_VERSION,$$SPARK_PACKAGE_TYPE) 72 | 73 | test-integration-spark-1.4.1: 74 | $(call test-integration-for-version,1.4.1,hadoop2.6) 75 | 76 | test-integration-spark-1.5.0: 77 | $(call test-integration-for-version,1.5.0,hadoop2.6) 78 | 79 | test-integration-spark-1.5.1: 80 | $(call test-integration-for-version,1.5.1,hadoop2.6) 81 | 82 | test-integration-spark-1.5.2: 83 | $(call test-integration-for-version,1.5.2,hadoop2.6) 84 | 85 | test-integration-spark-1.6.0: 86 | $(call test-integration-for-version,1.6.0,hadoop2.6) 87 | 88 | test-integration-spark-1.6.1: 89 | $(call test-integration-for-version,1.6.1,hadoop2.6) 90 | 91 | define test-integration-for-version 92 | echo ====================================================================== 93 | echo testing integration with spark-$1 94 | 95 | mkdir -p lib && test -d lib/spark-$1-bin-$2 || \ 96 | (pushd lib && curl http://ftp.tudelft.nl/apache/spark/spark-$1/spark-$1-bin-$2.tgz | tar xz && popd) 97 | 98 | cp log4j.properties lib/spark-$1-bin-$2/conf/ 99 | 100 | source venv/bin/activate ; \ 101 | lib/spark-$1-bin-$2/bin/spark-submit \ 102 | --master local[*] \ 103 | --driver-memory 512m \ 104 | --jars target/scala-2.10/pyspark-elastic-assembly-$(VERSION).jar \ 105 | --py-files 
target/scala-2.10/pyspark-elastic-assembly-$(VERSION).jar \ 106 | python/pyspark_elastic/tests.py 107 | 108 | echo ====================================================================== 109 | endef 110 | 111 | 112 | 113 | dist: clean-pyc 114 | sbt assembly 115 | cd python ; \ 116 | find . -mindepth 2 -name '*.py' -print | \ 117 | zip ../target/scala-2.10/pyspark-elastic-assembly-$(VERSION).jar -@ 118 | 119 | 120 | all: clean dist 121 | 122 | 123 | publish: clean 124 | # use spark packages to create the distribution 125 | sbt spDist 126 | 127 | # push the python source files into the jar 128 | cd python ; \ 129 | find . -mindepth 2 -name '*.py' -print | \ 130 | zip ../target/scala-2.10/pyspark-elastic_2.10-$(VERSION).jar -@ 131 | 132 | # copy it to the right name, and update the jar in the zip 133 | cp target/scala-2.10/pyspark-elastic{_2.10,}-$(VERSION).jar 134 | cd target/scala-2.10 ;\ 135 | zip ../pyspark-elastic-$(VERSION).zip pyspark-elastic-$(VERSION).jar 136 | 137 | # send the package to spark-packages 138 | spark-package publish -c ".sp-creds.txt" -n "TargetHolding/pyspark-elastic" -v $(VERSION) -f . -z target/pyspark-elastic-$(VERSION).zip 139 | 140 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | PySpark Elastic 2 | ================= 3 | 4 | [![Build Status](https://travis-ci.org/TargetHolding/pyspark-elastic.svg)](https://travis-ci.org/TargetHolding/pyspark-elastic) 5 | [![Codacy Badge](https://api.codacy.com/project/badge/grade/2b7c6849703a4aedb69c071b20049702)](https://www.codacy.com/app/frensjan/pyspark-elastic) 6 | 7 | PySpark Elastic provides Python support for Apache Spark's Resilient Distributed Datasets from Elastic Search documents using [Elasticsearch Hadoop](https://www.elastic.co/products/hadoop) within PySpark, both in the interactive shell and in Python programs submitted with spark-submit. 8 | 9 | **Contents:** 10 | * [Compatibility](#compatibility) 11 | * [Using with PySpark](#using-with-pyspark) 12 | * [Using with PySpark shell](#using-with-pyspark-shell) 13 | * [Building](#building) 14 | * [API](#api) 15 | * [Examples](#examples) 16 | * [Problems / ideas?](#problems--ideas) 17 | * [Contributing](#contributing) 18 | 19 | 20 | 21 | Compatibility 22 | ------------- 23 | 24 | ### Spark 25 | PySpark Elastic is tested to be compatible with Spark 1.4, 1.5 and 1.6. Feedback on (in-)compatibility is much appreciated. 26 | 27 | ### Elastic Search 28 | PySpark Elastic is tested with Elastic Search 2.2. 29 | 30 | ### Python 31 | PySpark Elastic is tested with Python 2.7 and Python 3.4. 32 | 33 | 34 | Using with PySpark 35 | ------------------ 36 | 37 | ### With Spark Packages 38 | PySpark Elastic is published at [Spark Packages](http://spark-packages.org/package/TargetHolding/pyspark-elastic).
This allows easy usage with Spark through: 39 | ```bash 40 | spark-submit \ 41 | --packages TargetHolding/pyspark-elastic:<version> \ 42 | --conf spark.es.nodes=your,elastic,node,names 43 | ``` 44 | 45 | 46 | ### Without Spark Packages 47 | 48 | ```bash 49 | spark-submit \ 50 | --jars /path/to/pyspark_elastic-<version>.jar \ 51 | --driver-class-path /path/to/pyspark_elastic-<version>.jar \ 52 | --py-files target/pyspark_elastic_<version>-<python version>.egg \ 53 | --conf spark.es.nodes=your,elastic,node,names \ 54 | --master spark://spark-master:7077 \ 55 | yourscript.py 56 | ``` 57 | (note that `--driver-class-path` is also required due to [SPARK-5185](https://issues.apache.org/jira/browse/SPARK-5185)) 58 | 59 | 60 | 61 | Using with PySpark shell 62 | ------------------------ 63 | 64 | Replace `spark-submit` with `pyspark` to start the interactive shell, don't provide a script as argument, and then import PySpark Elastic. Note that when performing this import the `sc` variable in pyspark is augmented with the `esRDD(...)` and `esJsonRDD(...)` methods. 65 | 66 | ```python 67 | import pyspark_elastic 68 | ``` 69 | 70 | 71 | 72 | Building 73 | -------- 74 | ### For [Spark Packages](http://spark-packages.org/package/TargetHolding/pyspark-elastic) PySpark Elastic can be compiled using: 75 | ```bash 76 | sbt compile 77 | ``` 78 | The package can be published locally with: 79 | ```bash 80 | sbt spPublishLocal 81 | ``` 82 | The package can be published to Spark Packages with (requires authentication and authorization): 83 | ```bash 84 | sbt spPublish 85 | ``` 86 | 87 | ### For local testing / without Spark Packages 88 | A Java / JVM library as well as a Python library is required to use PySpark Elastic. They can be built with: 89 | 90 | ```bash 91 | make dist 92 | ``` 93 | 94 | This creates 1) a fat jar with the Elasticsearch Hadoop library and additional classes for bridging Spark and PySpark for Elastic Search data and 2) a Python source distribution at: 95 | 96 | * `target/scala-2.10/pyspark-elastic-assembly-<version>.jar` 97 | * `target/pyspark_elastic_<version>-<python version>.egg`. 98 | 99 | 100 | 101 | API 102 | --- 103 | 104 | The PySpark Elastic API aims to stay close to the Java / Scala APIs provided by Elastic Search. Reading its [documentation](https://www.elastic.co/guide/en/elasticsearch/hadoop/current/spark.html) is a good place to start. 105 | 106 | 107 | ### pyspark_elastic.EsSparkContext 108 | 109 | An `EsSparkContext` is very similar to a regular `SparkContext`. It is created in the same way, can be used to read files, parallelize local data, broadcast a variable, etc. See the [Spark Programming Guide](https://spark.apache.org/docs/1.2.0/programming-guide.html) for more details. *But* it exposes additional methods: 110 | 111 | * ``esRDD(resource_read, query, **kwargs)``: Returns an `EsRDD` for the resource and query with the JSON documents from Elastic parsed with `json.loads` (or from `cjson` or `ujson` if available). Arguments which can be provided: 112 | 113 | * `resource` is the index and document type separated by a forward slash (/) 114 | * `query` is the query string applied when searching Elastic Search for the data in the RDD 115 | * `**kwargs`: any configuration item listed in the [Elastic Search documentation]( https://www.elastic.co/guide/en/elasticsearch/hadoop/current/configuration.html), see also [the configuration section below](#configuration) 116 | 117 | * ``esObjRDD(resource_read, query, **kwargs)``: As `esRDD(...)`, but the RDD contains JSON documents from Elastic parsed with `json.loads` where each `dict` is parsed into a `pyspark_elastic.types.AttrDict` so that objects can be accessed by attribute as well as by key: e.g. `sc.esObjRDD(...).first().field`. 118 | 119 | * ``esJsonRDD(resource_read, query, **kwargs)``: As `esRDD(...)`, but the RDD contains JSON documents as strings. 120 | 121 | #### Configuration 122 | The configuration options from the [Elastic Search documentation]( https://www.elastic.co/guide/en/elasticsearch/hadoop/current/configuration.html) can be provided to the methods above _without_ the `es.` prefix _and with_ underscores instead of dots. The latter allows using normal keyword arguments instead of resorting to constructs such as `esRDD(..., **{'es.configuration.option': 'xyz'})`; simply use `esRDD(..., configuration_option='xyz')` instead. 123 |
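For example, reading an index with a URI query and a couple of configuration options passed as keyword arguments could look like the sketch below. It is modelled on the usage in `python/pyspark_elastic/tests.py`; the index, query and node address are made up, so adjust them to your environment:

```python
from pyspark.conf import SparkConf
from pyspark_elastic import EsSparkContext

# Hypothetical single-node setup; point spark.es.nodes at your own cluster.
conf = SparkConf() \
    .setAppName("PySpark Elastic example") \
    .set("spark.es.nodes", "localhost")
sc = EsSparkContext(conf=conf)

# Keyword arguments are translated to es.* options:
# read_metadata='true' becomes es.read.metadata, etc.
docs = sc.esRDD('test/tweets', '?q=title:spark', read_metadata='true')

# The RDD consists of (document id, document) pairs; as in the test suite,
# loads() maps the document values through json.loads.
for doc_id, doc in docs.loads().take(5):
    print(doc_id, doc)
```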
124 | ### pyspark.RDD 125 | 126 | PySpark Elastic supports saving arbitrary RDDs to Elastic using: 127 | 128 | * ``rdd.saveToEs(resource, **kwargs)``: Saves an RDD to the resource (a / separated index and document type) by dumping the RDD elements using ``json.dumps``. 129 | * ``rdd.saveJsonToEs(resource, **kwargs)``: Saves an RDD to the resource (a / separated index and document type) directly. The RDD must contain strings. 130 | 131 | 132 | 133 | ### pyspark_elastic.streaming 134 | 135 | Not yet implemented. 136 | 137 | 138 | Examples 139 | -------- 140 | 141 | Creating a SparkContext with Elastic Search support: 142 | 143 | ```python 144 | from pyspark.conf import SparkConf 145 | from pyspark_elastic import EsSparkContext 146 | conf = SparkConf() \ 147 | .setAppName("PySpark Elastic Test") \ 148 | .setMaster("spark://spark-master:7077") \ 149 | .set("spark.es.nodes", "elastic-1") 150 | 151 | sc = EsSparkContext(conf=conf) 152 | ``` 153 | 154 | Reading from an index as JSON strings: 155 | 156 | ```python 157 | rdd = sc.esJsonRDD('test/tweets') 158 | rdd... 159 | ``` 160 | 161 | Reading from an index as deserialized JSON (dicts, lists, etc.): 162 | 163 | ```python 164 | rdd = sc.esRDD('test/tweets') 165 | rdd... 166 | ``` 167 | 168 | Storing data in Elastic Search: 169 | 170 | ```python 171 | rdd = sc.parallelize([ 172 | { 'title': x, 'body': x } 173 | for x in ['a', 'b', 'c'] 174 | ]) 175 | 176 | rdd.saveToEs('test/docs') 177 | ``` 178 | 179 | 180 | 181 | Problems / ideas? 182 | ----------------- 183 | Feel free to use the issue tracker to propose new functionality and / or to report bugs. 184 | 185 | 186 | 187 | Contributing 188 | ------------ 189 | 190 | 1. Fork it 191 | 2. Create your feature branch (git checkout -b my-new-feature) 192 | 3. Commit your changes (git commit -am 'Add some feature') 193 | 4. Push to the branch (git push origin my-new-feature) 194 | 5.
Create new Pull Request 195 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | name := "pyspark-elastic" 2 | 3 | version := io.Source.fromFile("version.txt").mkString.trim 4 | 5 | organization := "TargetHolding" 6 | 7 | scalaVersion := "2.10.5" 8 | 9 | credentials += Credentials(Path.userHome / ".ivy2" / ".sbtcredentials") 10 | 11 | licenses += "Apache-2.0" -> url("http://opensource.org/licenses/Apache-2.0") 12 | 13 | libraryDependencies ++= Seq( 14 | "org.elasticsearch" %% "elasticsearch-spark" % "2.2.0-beta1" 15 | ) 16 | 17 | spName := "TargetHolding/pyspark-elastic" 18 | 19 | sparkVersion := "1.5.1" 20 | 21 | sparkComponents += "streaming" 22 | 23 | javacOptions ++= Seq("-source", "1.7", "-target", "1.7") 24 | 25 | assemblyOption in assembly := (assemblyOption in assembly).value.copy( 26 | includeScala = false 27 | ) 28 | 29 | mergeStrategy in assembly := { 30 | case PathList("org", "apache", "spark", "unused", "UnusedStubClass.class") => MergeStrategy.first 31 | case x => (mergeStrategy in assembly).value(x) 32 | } 33 | 34 | val ignore = Set( 35 | "commons-beanutils-1.7.0.jar", 36 | "commons-beanutils-core-1.8.0.jar", 37 | "commons-logging-1.1.1.jar", 38 | "hadoop-yarn-api-2.2.0.jar", 39 | "guava-14.0.1.jar", 40 | "kryo-2.21.jar" 41 | ) 42 | 43 | assemblyExcludedJars in assembly := { 44 | val cp = (fullClasspath in assembly).value 45 | cp filter { x => ignore.contains(x.data.getName) } 46 | } 47 | 48 | EclipseKeys.withSource := true 49 | -------------------------------------------------------------------------------- /log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=WARN, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 7 | 8 | # Settings to quiet third party logs that are too verbose 9 | log4j.logger.org.eclipse.jetty=WARN 10 | log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR 11 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 12 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 13 | log4j.logger.org.apache.spark.metrics=ERROR 14 | log4j.logger.org.apache.spark.util=ERROR 15 | log4j.logger.org.apache.hadoop.util=ERROR 16 | -------------------------------------------------------------------------------- /project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.0") 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += "Spark Package Main Repo" at "https://dl.bintray.com/spark-packages/maven" 2 | 3 | addSbtPlugin("org.spark-packages" % "sbt-spark-package" % "0.2.3") 4 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "4.0.0") 5 | -------------------------------------------------------------------------------- /python/.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info 2 | build 
-------------------------------------------------------------------------------- /python/pyspark_elastic/__init__.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | """ 14 | This module provides python support for Apache Spark's Resillient Distributed Datasets from Elastic Search indices 15 | using the Spark / Hadoop support for Java / Scala from Elastic Search. 16 | """ 17 | 18 | import inspect 19 | 20 | import pyspark.context 21 | import pyspark.rdd 22 | 23 | from pyspark_elastic.context import EsSparkContext, monkey_patch_sc 24 | from pyspark_elastic.rdd import saveToEs, saveJsonToEs 25 | 26 | 27 | __all__ = [ 28 | "EsSparkContext", 29 | ] 30 | 31 | # Monkey patch the default python RDD so that it can be stored to Elastic Search as documents 32 | pyspark.rdd.RDD.saveToEs = saveToEs 33 | pyspark.rdd.RDD.saveJsonToEs = saveJsonToEs 34 | 35 | # Monkey patch the sc variable in the caller if any 36 | frame = inspect.currentframe().f_back 37 | # Go back at most 10 frames 38 | for _ in range(10): 39 | if not frame: 40 | break 41 | elif "sc" in frame.f_globals: 42 | monkey_patch_sc(frame.f_globals["sc"]) 43 | break 44 | else: 45 | frame = frame.f_back 46 | -------------------------------------------------------------------------------- /python/pyspark_elastic/context.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | 14 | from functools import partial 15 | 16 | import pyspark.context 17 | from pyspark_elastic.rdd import EsRDD 18 | 19 | 20 | def monkey_patch_sc(sc): 21 | sc.__class__ = EsSparkContext 22 | sc.__dict__["esRDD"] = partial(EsSparkContext.esRDD, sc) 23 | sc.__dict__["esRDD"].__doc__ = EsSparkContext.esRDD.__doc__ 24 | 25 | 26 | class EsSparkContext(pyspark.context.SparkContext): 27 | 28 | def esRDD(self, resource_read=None, query='', **kwargs): 29 | return EsRDD(self, resource_read, query, **kwargs) 30 | -------------------------------------------------------------------------------- /python/pyspark_elastic/rdd.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 
3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | from pyspark.rdd import RDD 13 | from pyspark.serializers import NoOpSerializer 14 | from pyspark_elastic.types import as_java_object, AttrDict 15 | from pyspark_elastic.util import helper, make_es_config 16 | 17 | try: 18 | from itertools import izip 19 | except ImportError: 20 | izip = zip 21 | 22 | 23 | try: 24 | import cjson as json 25 | json.loads = json.decode 26 | json.dumps = json.encode 27 | except ImportError: 28 | try: 29 | import ujson as json 30 | except ImportError: 31 | import json 32 | 33 | 34 | 35 | class EsRDD(RDD): 36 | def __init__(self, ctx, resource_read=None, query=None, **kwargs): 37 | kwargs = make_es_config(kwargs, resource_read=resource_read, query=query) 38 | kwargs = as_java_object(ctx._gateway, kwargs) 39 | jrdd = helper(ctx).esJsonRDD(ctx._jsc, kwargs) 40 | rdd = RDD(jrdd, ctx, NoOpSerializer()) 41 | 42 | # read the rdd in batches of two (first key then value / doc) 43 | def pairwise(iterable): 44 | iterator = iter(iterable) 45 | return izip(iterator, iterator) 46 | kvRdd = rdd.mapPartitions(pairwise, True) 47 | 48 | super(EsRDD, self).__init__(kvRdd._jrdd, ctx) 49 | 50 | def esCount(self): 51 | return helper(self.ctx).esCount(self._jrdd) 52 | 53 | def loads(self): 54 | return self.mapValues(lambda v: json.loads(v.decode('utf-8'))) 55 | 56 | def loadsObj(self): 57 | return self.mapValues(lambda v: AttrDict.loads(v.decode('utf-8'))) 58 | 59 | def saveToEs(rdd, resource_write=None, **kwargs): 60 | saveJsonToEs(rdd.map(json.dumps), resource_write=resource_write, **kwargs) 61 | 62 | def saveJsonToEs(rdd, resource_write=None, **kwargs): 63 | kwargs = make_es_config(kwargs, resource_write=resource_write) 64 | kwargs = as_java_object(rdd.ctx._gateway, kwargs) 65 | helper(rdd.ctx).saveJsonToEs(rdd._jrdd, kwargs) 66 | -------------------------------------------------------------------------------- /python/pyspark_elastic/tests.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import json 3 | import unittest 4 | 5 | from elasticsearch.helpers import bulk 6 | from elasticsearch_dsl import Index 7 | from elasticsearch_dsl.connections import connections 8 | from elasticsearch_dsl.document import DocType 9 | from elasticsearch_dsl.field import String 10 | from pyspark.conf import SparkConf 11 | from pyspark.rdd import RDD 12 | from pyspark_elastic.context import EsSparkContext 13 | 14 | 15 | class PysparkElasticTestCase(unittest.TestCase): 16 | class TestDoc(DocType): 17 | title = String() 18 | 19 | @classmethod 20 | def setUpClass(cls): 21 | conf = SparkConf() 22 | conf.set('spark.ui.showConsoleProgress', 'false') 23 | cls.sc = EsSparkContext(conf=conf.setAppName("PySpark Elastic Test")) 24 | 25 | @classmethod 26 | def tearDownClass(cls): 27 | cls.sc.stop() 28 | 29 | 30 | def setUp(self): 31 | self.index = index = Index('pyspark_elastic') 32 | index.settings(number_of_shards=4) 33 | index.create(ignore=400) 34 | 35 | index.doc_type(self.TestDoc) 36 | 37 | self.resource = self.index._name + '/' + self.TestDoc._doc_type.name 38 | 39 | def tearDown(self): 40 | 
self.index.delete() 41 | 42 | 43 | def rdd(self, query='', doc_type=None, cache=True, as_json=True, **kwargs): 44 | doc_type = doc_type or self.TestDoc._doc_type.name 45 | rdd = self.sc.esRDD(self.index._name + '/' + doc_type, query, **kwargs) 46 | if as_json: 47 | rdd = rdd.loads() 48 | if cache: 49 | rdd = rdd.cache() 50 | return rdd 51 | 52 | 53 | 54 | class TestsWithData(PysparkElasticTestCase): 55 | def setUp(self): 56 | super(TestsWithData, self).setUp() 57 | 58 | self.docs = [ 59 | self.TestDoc(title='doc-' + str(i)) 60 | for i in range(1000) 61 | ] 62 | 63 | actions = [d.to_dict(include_meta=True) for d in self.docs] 64 | 65 | inserted, errors = bulk(connections.get_connection(), actions=actions, refresh=True) 66 | self.assertEqual(inserted, len(actions)) 67 | self.assertEqual(len(errors), 0) 68 | 69 | 70 | 71 | class ReadTests(TestsWithData): 72 | def test_first(self): 73 | doc = self.rdd().first() 74 | self.assertTrue(doc != None) 75 | self.assertEqual(len(doc), 2) 76 | 77 | k, v = doc 78 | self.assertIsInstance(k, basestring) 79 | self.assertIsInstance(v, dict) 80 | self.assertEqual(len(v), 1) 81 | self.assertTrue('title' in v) 82 | 83 | title = v['title'] 84 | self.assertIsInstance(title, basestring) 85 | 86 | def test_take(self): 87 | self.assertEquals(len(self.rdd().take(10)), 10) 88 | 89 | def test_count(self): 90 | self.assertEquals(self.rdd().count(), len(self.docs)) 91 | 92 | def test_read_metadata(self): 93 | read = self.rdd( 94 | read_metadata=True, 95 | read_metadata_field='_meta', 96 | read_metadata_version=True, 97 | ).collect() 98 | 99 | for _, doc in read: 100 | self.assertIn('_meta', doc) 101 | meta = doc['_meta'] 102 | self.assertIn('_score', meta) 103 | self.assertIn('_index', meta) 104 | self.assertIn('_type', meta) 105 | self.assertIn('_id', meta) 106 | self.assertIn('_version', meta) 107 | 108 | def test_default_resource(self): 109 | self.assertEqual(self.rdd(resource=self.resource).count(), len(self.docs)) 110 | 111 | # es.index.read.missing.as.empty 112 | # def test_read_missing_index(self): 113 | 114 | # es.field.read.empty.as.null 115 | # def test_empty_fields(self): 116 | 117 | 118 | # class QueryTests(PysparkElasticTestCase): 119 | # test querying with ?uri_query 120 | # def test_uri_query(self): 121 | 122 | # test querying with { dsl } 123 | # def test_dsl_query(self): 124 | 125 | # test querying with an external json file containing the query dsl 126 | # def test_ext_res_query(self): 127 | 128 | # es.field.read.validate.presence 129 | # def test_query_check(self): 130 | 131 | 132 | class WriteTests(PysparkElasticTestCase): 133 | def setUp(self): 134 | super(WriteTests, self).setUp() 135 | self.docs = self.sc.parallelize(xrange(100)).map(lambda i: dict(title='doc-' + str(i))) 136 | 137 | def assertWritten(self, docs=None): 138 | docs = docs or self.docs 139 | if isinstance(docs, RDD): 140 | docs = docs.collect() 141 | read = self.rdd().collect() 142 | self.assertEqual(set(str(d[1]['title']) for d in read), set(str(d['title']) for d in docs)) 143 | return read 144 | 145 | 146 | def test_save_dicts(self): 147 | self.docs.saveToEs(self.resource) 148 | self.assertWritten() 149 | 150 | def test_save_json(self): 151 | self.docs.map(json.dumps).saveJsonToEs(self.resource) 152 | self.assertWritten() 153 | 154 | def test_save_binary_json(self): 155 | self.docs.map(lambda d: json.dumps(d).encode()).saveJsonToEs(self.resource) 156 | self.assertWritten() 157 | 158 | def test_save_with_id(self): 159 | self.docs = self.sc.parallelize(xrange(100)).map( 160 | lambda 
i: dict( 161 | id=str(i), 162 | title='doc-' + str(i), 163 | ) 164 | ) 165 | 166 | self.docs.saveToEs( 167 | self.index._name + '/' + self.TestDoc._doc_type.name, 168 | mapping_id='id' 169 | ) 170 | 171 | self.assertWritten() 172 | 173 | written = self.docs.collect() 174 | read = self.rdd().collectAsMap() 175 | 176 | self.assertEqual(len(written), len(read)) 177 | for doc in written: 178 | self.assertEqual(str(doc['title']), read[doc['id']]['title']) 179 | 180 | # def test_create(self): 181 | # pass 182 | # 183 | # def test_update(self): 184 | # pass 185 | # 186 | # def test_upsert(self): 187 | # pass 188 | # 189 | # def test_save_with_parent(self): 190 | # pass 191 | # 192 | # def test_save_with_version(self): 193 | # pass 194 | # 195 | # def test_save_with_routing(self): 196 | # pass 197 | # 198 | # def test_save_with_ttl(self): 199 | # pass 200 | # 201 | # def test_save_with_timestamp(self): 202 | # pass 203 | # 204 | # def test_save_include_fields(self): 205 | # # es.mapping.include 206 | # pass 207 | # 208 | 209 | def test_save_exclude_fields(self): 210 | docs = [ 211 | dict(title='1', body='a'), 212 | dict(title='2', body='b'), 213 | dict(title='1', body='c'), 214 | ] 215 | 216 | self.sc.parallelize(docs).saveToEs(self.resource, mapping_exclude='body') 217 | read = self.rdd().collect() 218 | self.assertEqual(len(read), 3) 219 | for doc in read: 220 | self.assertNotIn('body', doc) 221 | 222 | # def test_save_with_script(self): 223 | # # es.update.script 224 | # # es.update.script.lang 225 | # # es.update.script.params 226 | # pass 227 | # 228 | # TODO 229 | # def test_autocreate_index(self): 230 | # index = Index('pyspark_elastic_non_existing') 231 | # index.delete(ignore=404) 232 | # 233 | # def save(): 234 | # self.docs.saveToEs(index._name + '/doc_type', index_auto_create='no') 235 | # self.assertRaises(Exception, save) 236 | 237 | def test_default_resource(self): 238 | self.docs.saveToEs(resource=self.resource) 239 | self.assertWritten() 240 | 241 | def test_dynamic_resource(self): 242 | Index('test-1').delete(ignore=404) 243 | Index('test-2').delete(ignore=404) 244 | 245 | docs1 = [ 246 | dict(idx='test-1', body='something'), 247 | dict(idx='test-1', body='else'), 248 | ] 249 | docs2 = [ 250 | dict(idx='test-2', body='abra'), 251 | dict(idx='test-2', body='ca'), 252 | dict(idx='test-2', body='dabra'), 253 | ] 254 | 255 | self.sc.parallelize(docs1 + docs2).saveToEs(resource_write='{idx}/docs') 256 | self.assertEqual(self.sc.esRDD('test-1/docs').count(), 2) 257 | self.assertEqual(self.sc.esRDD('test-2/docs').count(), 3) 258 | 259 | self.assertEqual( 260 | set(d['body'] for d in self.sc.esRDD('test-1/docs').loads().collectAsMap().values()), 261 | set(d['body'] for d in docs1) 262 | ) 263 | 264 | def test_dynamic_resource_timestamp(self): 265 | Index('test-2015-11').delete(ignore=404) 266 | Index('test-2015-12').delete(ignore=404) 267 | 268 | docs_nov = [ 269 | dict(timestamp=datetime.fromtimestamp(1448363875).isoformat(), body='Lorem'), 270 | dict(timestamp=datetime.fromtimestamp(1448363876).isoformat(), body='ipsum'), 271 | dict(timestamp=datetime.fromtimestamp(1448363877).isoformat(), body='dolor'), 272 | ] 273 | 274 | docs_dec = [ 275 | dict(timestamp=datetime.fromtimestamp(1449400621).isoformat(), body='fee'), 276 | dict(timestamp=datetime.fromtimestamp(1449400622).isoformat(), body='fi'), 277 | dict(timestamp=datetime.fromtimestamp(1449400623).isoformat(), body='fo'), 278 | dict(timestamp=datetime.fromtimestamp(1449400623).isoformat(), body='fum'), 279 | ] 280 | 281 | 
self.sc.parallelize(docs_nov + docs_dec).saveToEs(resource_write='test-{timestamp:YYYY-MM}/docs') 282 | self.assertEqual(self.sc.esRDD('test-2015-11/docs').count(), 3) 283 | self.assertEqual(self.sc.esRDD('test-2015-12/docs').count(), 4) 284 | 285 | self.assertEqual( 286 | set(d['body'] for d in self.sc.esRDD('test-2015-11/docs').loads().collectAsMap().values()), 287 | set(d['body'] for d in docs_nov) 288 | ) 289 | 290 | # def test_serialization_configuration(self): 291 | # # es.batch.size.bytes 292 | # # es.batch.size.entries 293 | # # es.batch.write.refresh 294 | # # es.batch.write.retry.count 295 | # # es.batch.write.retry.wait 296 | # pass 297 | 298 | 299 | # class ConfTests(PysparkElasticTestCase): 300 | # 301 | # def test_timeout(self): 302 | # # es.http.timeout 303 | # pass 304 | # 305 | # def test_retries(self): 306 | # # es.http.timeout 307 | # pass 308 | # 309 | # def test_scroll_keepalive(self): 310 | # # es.scroll.keepalive 311 | # pass 312 | # 313 | # def test_scroll_size(self): 314 | # # es.scroll.size 315 | # pass 316 | # 317 | # def test_task_timeout(self): 318 | # # es.action.heart.beat.lead 319 | # pass 320 | # 321 | # 322 | # class SecurityTests(PysparkElasticTestCase): 323 | # def test_authentication(self): 324 | # # es.net.http.auth.user 325 | # # es.net.http.auth.pass 326 | # pass 327 | 328 | 329 | 330 | if __name__ == '__main__': 331 | connections.create_connection() 332 | unittest.main() 333 | # suite = unittest.TestLoader().loadTestsFromTestCase(PushDownTests) 334 | # unittest.TextTestRunner().run(suite) 335 | -------------------------------------------------------------------------------- /python/pyspark_elastic/types.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | from collections import Set, Iterable, Mapping 14 | from datetime import datetime 15 | from json import loads 16 | from time import mktime 17 | 18 | 19 | def as_java_array(gateway, java_type, iterable): 20 | """Creates a Java array from a Python iterable, using the given p4yj gateway""" 21 | 22 | java_type = gateway.jvm.__getattr__(java_type) 23 | lst = list(iterable) 24 | arr = gateway.new_array(java_type, len(lst)) 25 | 26 | for i, e in enumerate(lst): 27 | jobj = as_java_object(gateway, e) 28 | arr[i] = jobj 29 | 30 | return arr 31 | 32 | 33 | def as_java_object(gateway, obj): 34 | """Converts a limited set of types to their corresponding types in java. Supported are 'primitives' (which aren't 35 | converted), datetime.datetime and the set-, dict- and iterable-like types. 
36 | """ 37 | 38 | t = type(obj) 39 | 40 | if issubclass(t, (bool, int, float, str)): 41 | return obj 42 | 43 | elif issubclass(t, datetime): 44 | timestamp = int(mktime(obj.timetuple()) * 1000) 45 | return gateway.jvm.java.util.Date(timestamp) 46 | 47 | elif issubclass(t, (dict, Mapping)): 48 | hash_map = gateway.jvm.java.util.HashMap() 49 | for (k, v) in obj.items(): hash_map[k] = v 50 | return hash_map 51 | 52 | elif issubclass(t, (set, Set)): 53 | hash_set = gateway.jvm.java.util.HashSet() 54 | for e in obj: hash_set.add(e) 55 | return hash_set 56 | 57 | elif issubclass(t, (list, Iterable)): 58 | array_list = gateway.jvm.java.util.ArrayList() 59 | for e in obj: array_list.append(e) 60 | return array_list 61 | 62 | else: 63 | return obj 64 | 65 | 66 | class AttrDict(dict): 67 | def __getattr__(self, k): 68 | try: 69 | return self[k] 70 | except KeyError: 71 | raise AttributeError('no such attribute %r' % k) 72 | 73 | def __setattr__(self, k, v): 74 | self[k] = v 75 | 76 | def __delattr__(self, k): 77 | try: 78 | del self[k] 79 | except KeyError: 80 | raise AttributeError('no such attribute %r' % k) 81 | 82 | @staticmethod 83 | def loads(s): 84 | return loads(s, object_hook=AttrDict) 85 | -------------------------------------------------------------------------------- /python/pyspark_elastic/util.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | _helper = None 14 | 15 | def helper(ctx): 16 | global _helper 17 | 18 | if not _helper: 19 | _helper = ctx._jvm.java.lang.Thread.currentThread().getContextClassLoader() \ 20 | .loadClass("pyspark_elastic.PythonHelper").newInstance() 21 | 22 | return _helper 23 | 24 | 25 | def make_es_config(d, **kwargs): 26 | cfg = {} 27 | add_es_config(cfg, d) 28 | add_es_config(cfg, kwargs) 29 | return cfg 30 | 31 | def add_es_config(cfg, d): 32 | for k, v in d.items(): 33 | if v is not None: 34 | cfg[make_es_param(k)] = str(v) 35 | 36 | def make_es_param(k): 37 | return 'es.' 
+ k.replace('_', '.') 38 | -------------------------------------------------------------------------------- /python/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | 5 | from setuptools import setup, find_packages 6 | 7 | 8 | basedir = os.path.dirname(os.path.abspath(__file__)) 9 | os.chdir(basedir) 10 | 11 | def f(*path): 12 | return open(os.path.join(basedir, *path)) 13 | 14 | setup( 15 | name='pyspark_elastic', 16 | maintainer='Frens Jan Rumph', 17 | maintainer_email='frens.jan.rumph@target-holding.nl', 18 | version='0.3.1', 19 | description='Utilities to asssist in working with Elastic Serach and PySpark.', 20 | long_description=f('../README.md').read(), 21 | url='https://github.com/TargetHolding/pyspark-elastic', 22 | license='Apache License 2.0', 23 | 24 | packages=find_packages(), 25 | include_package_data=True, 26 | 27 | classifiers=[ 28 | 'Development Status :: 2 - Pre-Alpha', 29 | 'Environment :: Other Environment', 30 | 'Framework :: Django', 31 | 'Intended Audience :: Developers', 32 | 'License :: OSI Approved :: Apache Software License', 33 | 'Operating System :: OS Independent', 34 | 'Programming Language :: Python', 35 | 'Programming Language :: Python :: 2', 36 | 'Programming Language :: Python :: 2.7', 37 | 'Topic :: Database', 38 | 'Topic :: Software Development :: Libraries', 39 | 'Topic :: Scientific/Engineering :: Information Analysis', 40 | 'Topic :: Utilities', 41 | ] 42 | ) 43 | -------------------------------------------------------------------------------- /sbin/local.sh: -------------------------------------------------------------------------------- 1 | DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/.. 2 | 3 | VERSION=`cat version.txt` 4 | 5 | PYSPARK_DRIVER_PYTHON=ipython \ 6 | $DIR/lib/spark-1.6.1-bin-hadoop2.6/bin/pyspark \ 7 | --conf spark.es.nodes=localhost \ 8 | --driver-memory 2g \ 9 | --master local[*] \ 10 | --jars $DIR/target/scala-2.10/pyspark-elastic-assembly-$VERSION.jar \ 11 | --py-files target/scala-2.10/pyspark-elastic-assembly-$VERSION.jar \ 12 | $@ 13 | -------------------------------------------------------------------------------- /sbin/released.sh: -------------------------------------------------------------------------------- 1 | DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/.. 
2 | 3 | VERSION=`cat version.txt` 4 | 5 | PYSPARK_DRIVER_PYTHON=ipython \ 6 | $DIR/lib/spark-1.6.1-bin-hadoop2.6/bin/pyspark \ 7 | --conf spark.es.nodes=localhost \ 8 | --driver-memory 2g \ 9 | --master local[*] \ 10 | --packages TargetHolding/pyspark-elastic:$VERSION 11 | $@ 12 | 13 | -------------------------------------------------------------------------------- /src/main/scala/pyspark_elastic/PythonHelper.scala: -------------------------------------------------------------------------------- 1 | package pyspark_elastic 2 | 3 | import java.util.{ Map => JMap } 4 | import scala.collection.JavaConversions.mapAsScalaMap 5 | import org.apache.spark.api.java.JavaRDD 6 | import org.apache.spark.api.java.JavaSparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.elasticsearch.spark.sparkContextFunctions 9 | import org.elasticsearch.spark.sparkStringJsonRDDFunctions 10 | import pyspark_util.Pickling 11 | import pyspark_util.Pickling.toPickleableRDD 12 | import pyspark_util.Pickling.toUnpickleableRDD 13 | import org.elasticsearch.spark.rdd.ScalaEsRDD 14 | import org.elasticsearch.spark.rdd.ScalaEsRDD 15 | 16 | class PythonHelper() { 17 | 18 | implicit val pickling = new Pickling() 19 | 20 | def esJsonRDD(sc: JavaSparkContext, cfg: JMap[String, String]) = { 21 | val rdd = sc.sc.esJsonRDD(config(cfg)) 22 | JavaRDD.fromRDD(rdd) 23 | } 24 | 25 | def saveJsonToEs(rdd: JavaRDD[Array[Byte]], cfg: JMap[String, String]) = { 26 | rdd.rdd.unpickle().asInstanceOf[RDD[String]].saveJsonToEs(config(cfg)) 27 | } 28 | 29 | private[this] def config(cfg: JMap[String, String]) = { 30 | if (cfg != null) { 31 | mapAsScalaMap(cfg) 32 | } else { 33 | Map[String, String]() 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/pyspark_util: -------------------------------------------------------------------------------- 1 | ../../../pyspark-util/src/main/scala/pyspark_util/ -------------------------------------------------------------------------------- /version.txt: -------------------------------------------------------------------------------- 1 | 0.4.2 2 | --------------------------------------------------------------------------------