├── .gitignore ├── .idea └── vcs.xml ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── README.md ├── __init__.py ├── __main__.py ├── doc ├── Makefile ├── _static │ └── .gitignore ├── conf.py ├── index.rst └── make.bat ├── javaobj.py ├── setup.py ├── test.2.bin ├── test.3.bin └── test.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | -------------------------------------------------------------------------------- /.idea/vcs.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: cpp 2 | dist: trusty 3 | sudo: required 4 | 5 | before_script: 6 | - pip3 install . 7 | - pip install . 8 | 9 | script: 10 | - python3 test.py 11 | - python test.py 12 | 13 | notifications: 14 | email: false -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | 204 | ======================================================================= 205 | Apache Spark Subcomponents: 206 | 207 | The Apache Spark project contains subcomponents with separate copyright 208 | notices and license terms. Your use of the source code for the these 209 | subcomponents is subject to the terms and conditions of the following 210 | licenses. 
211 | 212 | 213 | ======================================================================== 214 | For heapq (pyspark/heapq3.py): 215 | ======================================================================== 216 | 217 | See license/LICENSE-heapq.txt 218 | 219 | ======================================================================== 220 | For SnapTree: 221 | ======================================================================== 222 | 223 | See license/LICENSE-SnapTree.txt 224 | 225 | ======================================================================== 226 | For jbcrypt: 227 | ======================================================================== 228 | 229 | See license/LICENSE-jbcrypt.txt 230 | 231 | ======================================================================== 232 | BSD-style licenses 233 | ======================================================================== 234 | 235 | The following components are provided under a BSD-style license. See project link for details. 236 | The text of each license is also included at licenses/LICENSE-[project].txt. 
237 | 238 | (BSD 3 Clause) netlib core (com.github.fommil.netlib:core:1.1.2 - https://github.com/fommil/netlib-java/core) 239 | (BSD 3 Clause) JPMML-Model (org.jpmml:pmml-model:1.2.7 - https://github.com/jpmml/jpmml-model) 240 | (BSD License) AntLR Parser Generator (antlr:antlr:2.7.7 - http://www.antlr.org/) 241 | (BSD License) ANTLR 4.5.2-1 (org.antlr:antlr4:4.5.2-1 - http://wwww.antlr.org/) 242 | (BSD licence) ANTLR ST4 4.0.4 (org.antlr:ST4:4.0.4 - http://www.stringtemplate.org) 243 | (BSD licence) ANTLR StringTemplate (org.antlr:stringtemplate:3.2.1 - http://www.stringtemplate.org) 244 | (BSD License) Javolution (javolution:javolution:5.5.1 - http://javolution.org) 245 | (BSD) JLine (jline:jline:0.9.94 - http://jline.sourceforge.net) 246 | (BSD) ParaNamer Core (com.thoughtworks.paranamer:paranamer:2.3 - http://paranamer.codehaus.org/paranamer) 247 | (BSD) ParaNamer Core (com.thoughtworks.paranamer:paranamer:2.6 - http://paranamer.codehaus.org/paranamer) 248 | (BSD 3 Clause) Scala (http://www.scala-lang.org/download/#License) 249 | (Interpreter classes (all .scala files in repl/src/main/scala 250 | except for Main.Scala, SparkHelper.scala and ExecutorClassLoader.scala), 251 | and for SerializableMapWrapper in JavaUtils.scala) 252 | (BSD-like) Scala Actors library (org.scala-lang:scala-actors:2.11.7 - http://www.scala-lang.org/) 253 | (BSD-like) Scala Compiler (org.scala-lang:scala-compiler:2.11.7 - http://www.scala-lang.org/) 254 | (BSD-like) Scala Compiler (org.scala-lang:scala-reflect:2.11.7 - http://www.scala-lang.org/) 255 | (BSD-like) Scala Library (org.scala-lang:scala-library:2.11.7 - http://www.scala-lang.org/) 256 | (BSD-like) Scalap (org.scala-lang:scalap:2.11.7 - http://www.scala-lang.org/) 257 | (BSD-style) scalacheck (org.scalacheck:scalacheck_2.11:1.10.0 - http://www.scalacheck.org) 258 | (BSD-style) spire (org.spire-math:spire_2.11:0.7.1 - http://spire-math.org) 259 | (BSD-style) spire-macros (org.spire-math:spire-macros_2.11:0.7.1 - 
http://spire-math.org) 260 | (New BSD License) Kryo (com.esotericsoftware:kryo:3.0.3 - https://github.com/EsotericSoftware/kryo) 261 | (New BSD License) MinLog (com.esotericsoftware:minlog:1.3.0 - https://github.com/EsotericSoftware/minlog) 262 | (New BSD license) Protocol Buffer Java API (com.google.protobuf:protobuf-java:2.5.0 - http://code.google.com/p/protobuf) 263 | (New BSD license) Protocol Buffer Java API (org.spark-project.protobuf:protobuf-java:2.4.1-shaded - http://code.google.com/p/protobuf) 264 | (The BSD License) Fortran to Java ARPACK (net.sourceforge.f2j:arpack_combined_all:0.1 - http://f2j.sourceforge.net) 265 | (The BSD License) xmlenc Library (xmlenc:xmlenc:0.52 - http://xmlenc.sourceforge.net) 266 | (The New BSD License) Py4J (net.sf.py4j:py4j:0.10.4 - http://py4j.sourceforge.net/) 267 | (Two-clause BSD-style license) JUnit-Interface (com.novocode:junit-interface:0.10 - http://github.com/szeiger/junit-interface/) 268 | (BSD licence) sbt and sbt-launch-lib.bash 269 | (BSD 3 Clause) d3.min.js (https://github.com/mbostock/d3/blob/master/LICENSE) 270 | (BSD 3 Clause) DPark (https://github.com/douban/dpark/blob/master/LICENSE) 271 | (BSD 3 Clause) CloudPickle (https://github.com/cloudpipe/cloudpickle/blob/master/LICENSE) 272 | 273 | ======================================================================== 274 | MIT licenses 275 | ======================================================================== 276 | 277 | The following components are provided under the MIT License. See project link for details. 278 | The text of each license is also included at licenses/LICENSE-[project].txt. 
279 | 280 | (MIT License) JCL 1.1.1 implemented over SLF4J (org.slf4j:jcl-over-slf4j:1.7.5 - http://www.slf4j.org) 281 | (MIT License) JUL to SLF4J bridge (org.slf4j:jul-to-slf4j:1.7.5 - http://www.slf4j.org) 282 | (MIT License) SLF4J API Module (org.slf4j:slf4j-api:1.7.5 - http://www.slf4j.org) 283 | (MIT License) SLF4J LOG4J-12 Binding (org.slf4j:slf4j-log4j12:1.7.5 - http://www.slf4j.org) 284 | (MIT License) pyrolite (org.spark-project:pyrolite:2.0.1 - http://pythonhosted.org/Pyro4/) 285 | (MIT License) scopt (com.github.scopt:scopt_2.11:3.2.0 - https://github.com/scopt/scopt) 286 | (The MIT License) Mockito (org.mockito:mockito-core:1.9.5 - http://www.mockito.org) 287 | (MIT License) jquery (https://jquery.org/license/) 288 | (MIT License) AnchorJS (https://github.com/bryanbraun/anchorjs) 289 | (MIT License) graphlib-dot (https://github.com/cpettitt/graphlib-dot) 290 | (MIT License) dagre-d3 (https://github.com/cpettitt/dagre-d3) 291 | (MIT License) sorttable (https://github.com/stuartlangridge/sorttable) 292 | (MIT License) boto (https://github.com/boto/boto/blob/develop/LICENSE) 293 | (MIT License) datatables (http://datatables.net/license) 294 | (MIT License) mustache (https://github.com/mustache/mustache/blob/master/LICENSE) 295 | (MIT License) cookies (http://code.google.com/p/cookies/wiki/License) 296 | (MIT License) blockUI (http://jquery.malsup.com/block/) 297 | (MIT License) RowsGroup (http://datatables.net/license/mit) 298 | (MIT License) jsonFormatter (http://www.jqueryscript.net/other/jQuery-Plugin-For-Pretty-JSON-Formatting-jsonFormatter.html) 299 | (MIT License) modernizr (https://github.com/Modernizr/Modernizr/blob/master/LICENSE) 300 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | exclude ./test.py -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/src-d/sparkpickle.svg?branch=master)](https://travis-ci.org/src-d/sparkpickle) [![PyPI](https://img.shields.io/pypi/v/sparkpickle.svg)](https://pypi.python.org/pypi/sparkpickle) 2 | 3 | SparkPickle 4 | =========== 5 | 6 | Pure Python implementation of reading SequenceFile-s with pickles written by 7 | Spark's [saveAsPickleFile()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.saveAsPickleFile). 8 | This is needed if you store the results from Spark in the efficient binary pickle 9 | format and want to load them locally on your computer, without any Spark installation, 10 | given only the actual files. 11 | 12 | [Article about creating this project.](https://blog.sourced.tech/post/reading_pyspark_pickles_locally) 13 | 14 | Installation 15 | ------------ 16 | ``` 17 | pip install sparkpickle 18 | ``` 19 | Supports Python 2.7 and 3.x. 20 | 21 | Usage 22 | ----- 23 | View the contents of the file via command line: 24 | ``` 25 | python -m sparkpickle /path/to/file 26 | ``` 27 | 28 | Code: 29 | ```python 30 | import sparkpickle 31 | 32 | for obj in sparkpickle.load_gen(open("/path/to/file", "rb")): 33 | print(obj) 34 | ``` 35 | 36 | API 37 | --- 38 | There are 3 functions: `load()`, `loads()` and `load_gen()`. The first two 39 | are similar to those found in "pickle" package, whereas the last one is the 40 | generator which yields deserialized objects and thus provides the minimal 41 | memory footprint. 42 | 43 | License 44 | ------- 45 | Apache 2.0. 46 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Provides functions for reading `SequenceFile `_-s 3 | with Python pickles. Such files are usually created with 4 | :meth:`pyspark.rdd.RDD.saveAsPickleFile()`.
5 | No PySpark installation is required, no external dependencies. 6 | 7 | References: 8 | https://blog.sourced.tech/post/reading_pyspark_pickles_locally 9 | https://wiki.apache.org/hadoop/SequenceFile 10 | http://grepcode.com/file/repo1.maven.org/maven2/org.apache.hadoop/hadoop-common/2.7.1/org/apache/hadoop/io/SequenceFile.java#SequenceFile 11 | https://www.safaribooksonline.com/library/view/hadoop-the-definitive/9781449328917/ch04.html#id3960971 12 | https://docs.oracle.com/javase/7/docs/platform/serialization/spec/protocol.html#10258 13 | http://www.javaworld.com/article/2072752/the-java-serialization-algorithm-revealed.html 14 | 15 | :authors: Vadim Markovtsev 16 | :version: 1.0 17 | :status: Alpha 18 | :license: Apache License 2.0 19 | 20 | .. code-block:: none 21 | 22 | Copyright 2016 source{d} 23 | 24 | Licensed under the Apache License, Version 2.0 (the "License"); 25 | you may not use this file except in compliance with the License. 26 | You may obtain a copy of the License at 27 | 28 | http://www.apache.org/licenses/LICENSE-2.0 29 | 30 | Unless required by applicable law or agreed to in writing, software 31 | distributed under the License is distributed on an "AS IS" BASIS, 32 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 33 | See the License for the specific language governing permissions and 34 | limitations under the License. 
from io import BytesIO
import pickle
import struct

from .javaobj import load as javaobj_load

__all__ = ("load", "load_gen", "loads", "FormatError")


# Fixed SequenceFile preamble produced by saveAsPickleFile(): the magic
# "SEQ\x06", the length-prefixed key class name
# "org.apache.hadoop.io.NullWritable", the length-prefixed value class name
# "org.apache.hadoop.io.BytesWritable", and zeroed compression flags
# (no compression).
HEADER = b"\x53\x45\x51\x06\x21\x6F\x72\x67\x2E\x61\x70\x61\x63\x68\x65\x2E" \
         b"\x68\x61\x64\x6F\x6F\x70\x2E\x69\x6F\x2E\x4E\x75\x6C\x6C\x57\x72" \
         b"\x69\x74\x61\x62\x6C\x65\x22\x6F\x72\x67\x2E\x61\x70\x61\x63\x68" \
         b"\x65\x2E\x68\x61\x64\x6F\x6F\x70\x2E\x69\x6F\x2E\x42\x79\x74\x65" \
         b"\x73\x57\x72\x69\x74\x61\x62\x6C\x65\x00\x00\x00\x00\x00\x00"


class FormatError(Exception):
    """
    Represents any errors related to sparkpickle.
    """
    pass


def load_gen(file, progress_callback=None):
    """
    Loads all the objects from the specified Spark SequenceFile with pickles
    (generator version of load()).
    The file is expected to be created with saveAsPickleFile() in PySpark.
    All the imported Python classes must be present in the current environment.

    .. warning::
        The payload is unpickled with :func:`pickle.load`; only read files
        from trusted sources, since pickle can execute arbitrary code.

    :param file: `File object `_ \
                 which is open in binary mode ("rb") and must be able to \
                 read() and tell().
    :param progress_callback: Optional :func:`callable` to report the loading \
        progress. It must accept a single argument which is the current \
        file position.
    :return: The generator object. Every object is yield-ed while reading.
    :raises FormatError: something is wrong with the supplied binary file.

    Example::

        with open("/path/to/file", "rb") as f:
            for obj in sparkpickle.load_gen(f):
                print(obj)
    """
    header = file.read(len(HEADER))
    if header != HEADER:
        raise FormatError("Header validation failed.")
    mark = file.read(16)  # 16 random bytes used as the sync marker
    record_flag = None  # set when the record length was already consumed below
    while True:
        # Record layout: <record length> <key length == 0> <BytesWritable>.
        # The record length may already have been read by the sync probe at
        # the bottom of the loop; in that case record_flag is not None.
        if record_flag is None and not file.read(4):
            break  # clean EOF before the next record
        record_flag = None
        if file.read(4) != b"\x00\x00\x00\x00":
            # NullWritable keys serialize to nothing, so key length must be 0.
            raise FormatError("Record validation failed.")
        object_size = file.read(4)
        try:
            object_size = struct.unpack(">I", object_size)[0]
        except struct.error:
            # struct.unpack raises struct.error (not ValueError) on a short
            # or invalid buffer; catching ValueError here let it escape.
            raise FormatError("Failed to parse BytesWritable.")
        object_start_pos = file.tell()
        batches = []

        def callback(_, size):
            # Invoked by the javaobj parser for every raw byte array inside
            # the Java object stream; each array holds one pickled batch.
            pos = file.tell()
            batches.append(pickle.load(file))
            if file.tell() - pos != size:
                raise FormatError("Object stream parsing integrity error.")
            if progress_callback is not None:
                progress_callback(pos + size)

        javaobj_load(file, ignore_remaining_data=True, bytes_callback=callback)
        if file.tell() - object_start_pos != object_size:
            raise FormatError("Object stream parsing integrity error.")
        for batch in batches:
            for obj in batch:
                yield obj
        del batches[:]  # release batch memory before reading the next record
        probe = file.read(4)
        if probe == b"\xFF\xFF\xFF\xFF":
            # Sync escape: the next 16 bytes must repeat the sync marker
            # that followed the header.
            if file.read(16) != mark:
                raise FormatError("Object stream parsing integrity error.")
        elif not probe:
            break  # EOF right after a record
        else:
            # We actually consumed the next record's length field; remember
            # it so the top of the loop does not read another 4 bytes.
            record_flag = probe


def load(file, progress_callback=None):
    """
    Loads all the objects from the specified Spark `SequenceFile `_
    with pickles. The file is expected to be created with
    :meth:`pyspark.RDD.saveAsPickleFile()` in PySpark.
    All the imported Python classes must be present in the current environment.

    :param file: `File object `_ \
                 which is open in binary mode ("rb") and must be able to \
                 read() and tell().
    :param progress_callback: Optional :func:`callable` to report the loading \
        progress. It must accept a single argument which is the current \
        file position.
    :return: The list with the loaded objects. Internal batches are flattened.
    """
    return list(load_gen(file, progress_callback=progress_callback))


def loads(buffer, progress_callback=None):
    """
    Loads all the objects from the specified Spark `SequenceFile `_
    with pickles. The file is expected to be created with
    :meth:`pyspark.RDD.saveAsPickleFile()` in PySpark.
    All the imported Python classes must be present in the current environment.

    :param buffer: The contents of the file.
    :type buffer: bytes
    :param progress_callback: Optional :func:`callable` to report the loading \
        progress. It must accept a single argument which is the current \
        file position.
    :return: The list with the loaded objects. Internal batches are flattened.
    """
    return load(BytesIO(buffer), progress_callback=progress_callback)
3 | """ 4 | 5 | from __future__ import print_function 6 | import sys 7 | 8 | import sparkpickle 9 | 10 | 11 | def main(): 12 | with open(sys.argv[1], "rb") as fin: 13 | i = 0 14 | t = None 15 | for obj in sparkpickle.load_gen(fin): 16 | t = type(obj) 17 | print(obj) 18 | i += 1 19 | print("-" * 80) 20 | print("Overall: %d objects of type %s" % (i, t)) 21 | 22 | if __name__ == "__main__": 23 | sys.exit(main()) 24 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | # You can set these variables from the command line. 4 | SPHINXOPTS ?= 5 | SPHINXBUILD ?= python3 -msphinx 6 | SPHINXPROJ ?= wmd-relax 7 | SOURCEDIR ?= . 8 | BUILDDIR ?= _build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 20 | -------------------------------------------------------------------------------- /doc/_static/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/src-d/sparkpickle/648bf2e7bd9b79679d44a8d01dc796285e881114/doc/_static/.gitignore -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # sparkpickle documentation build configuration file, created by 4 | # sphinx-quickstart on Mon Jun 5 16:52:34 2017. 
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
# Both the package directory and its parent are added so autodoc can import
# the sparkpickle module regardless of how the repository is laid out.
sys.path.insert(0, os.path.abspath('..'))
sys.path.insert(0, os.path.abspath('../..'))


# -- General configuration ------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = ['sphinx.ext.autodoc',
              'sphinx.ext.mathjax',
              'sphinx.ext.viewcode',
              'sphinx.ext.intersphinx']

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'

# The master toctree document.
master_doc = 'index'

# General information about the project.
project = u'sparkpickle'
copyright = u'2017, Vadim Markovtsev'
author = u'Vadim Markovtsev'

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = u'master'
# The full version, including alpha/beta/rc tags.
release = u'master'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This patterns also effect to html_static_path and html_extra_path
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'

# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = False


# -- Options for HTML output ----------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'alabaster'

# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']


# -- Options for HTMLHelp output ------------------------------------------

# Output file base name for HTML help builder.
htmlhelp_basename = 'sparkpickledoc'


# -- Options for LaTeX output ---------------------------------------------

latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    #
    # 'papersize': 'letterpaper',

    # The font size ('10pt', '11pt' or '12pt').
    #
    # 'pointsize': '10pt',

    # Additional stuff for the LaTeX preamble.
    #
    # 'preamble': '',

    # Latex figure (float) alignment
    #
    # 'figure_align': 'htbp',
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
    (master_doc, 'sparkpickle.tex', u'sparkpickle Documentation',
     u'Vadim Markovtsev', 'manual'),
]


# -- Options for manual page output ---------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
    (master_doc, 'sparkpickle', u'sparkpickle Documentation',
     [author], 1)
]


# -- Options for Texinfo output -------------------------------------------

# Grouping the document tree into Texinfo files.
List of tuples 151 | # (source start file, target name, title, author, 152 | # dir menu entry, description, category) 153 | texinfo_documents = [ 154 | (master_doc, 'sparkpickle', u'sparkpickle Documentation', 155 | author, 'sparkpickle', 'One line description of project.', 156 | 'Miscellaneous'), 157 | ] 158 | 159 | autodoc_default_flags = ['members', 'undoc-members', 'show-inheritance'] 160 | 161 | intersphinx_mapping = { 162 | 'python': ('https://docs.python.org/3.6', None), 163 | 'spark': ('http://spark.apache.org/docs/latest/api/python/', None)} 164 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. wmd-relax documentation master file, created by 2 | sphinx-quickstart on Mon Jun 5 16:52:34 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | sparkpickle's documentation 7 | =========================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | .. automodule:: sparkpickle 14 | :members: 15 | 16 | 17 | Indices and tables 18 | ================== 19 | 20 | * :ref:`genindex` 21 | * :ref:`modindex` 22 | * :ref:`search` 23 | -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=python -msphinx 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=wmd-relax 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The Sphinx module was not found. 
Make sure you have Sphinx installed, 20 | echo.then set the SPHINXBUILD environment variable to point to the full 21 | echo.path of the 'sphinx-build' executable. Alternatively you may add the 22 | echo.Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /javaobj.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -- Content-Encoding: UTF-8 -- 3 | """ 4 | Provides functions for reading and writing (writing is WIP currently) Java 5 | objects serialized or will be deserialized by ObjectOutputStream. This form of 6 | object representation is a standard data interchange format in Java world. 7 | 8 | javaobj module exposes an API familiar to users of the standard library 9 | marshal, pickle and json modules. 10 | 11 | See: 12 | http://download.oracle.com/javase/6/docs/platform/serialization/spec/protocol.html 13 | 14 | :authors: Volodymyr Buell, Thomas Calmant 15 | :license: Apache License 2.0 16 | :version: 0.2.2 17 | :status: Alpha 18 | 19 | .. 20 | 21 | Copyright 2016 Thomas Calmant 22 | 23 | Licensed under the Apache License, Version 2.0 (the "License"); 24 | you may not use this file except in compliance with the License. 25 | You may obtain a copy of the License at 26 | 27 | http://www.apache.org/licenses/LICENSE-2.0 28 | 29 | Unless required by applicable law or agreed to in writing, software 30 | distributed under the License is distributed on an "AS IS" BASIS, 31 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
32 | See the License for the specific language governing permissions and 33 | limitations under the License. 34 | """ 35 | 36 | # Standard library 37 | import collections 38 | import logging 39 | import os 40 | import struct 41 | import sys 42 | 43 | try: 44 | # Python 2 45 | from StringIO import StringIO as BytesIO 46 | except ImportError: 47 | # Python 3+ 48 | from io import BytesIO 49 | 50 | # ------------------------------------------------------------------------------ 51 | 52 | # Module version 53 | __version_info__ = (0, 2, 2) 54 | __version__ = ".".join(str(x) for x in __version_info__) 55 | 56 | # Documentation strings format 57 | __docformat__ = "restructuredtext en" 58 | 59 | # ------------------------------------------------------------------------------ 60 | 61 | # Setup the logger 62 | _log = logging.getLogger(__name__) 63 | 64 | 65 | def log_debug(message, ident=0): 66 | """ 67 | Logs a message at debug level 68 | 69 | :param message: Message to log 70 | :param ident: Number of indentation spaces 71 | """ 72 | _log.debug(" " * (ident * 2) + str(message)) 73 | 74 | 75 | def log_error(message, ident=0): 76 | """ 77 | Logs a message at error level 78 | 79 | :param message: Message to log 80 | :param ident: Number of indentation spaces 81 | """ 82 | _log.error(" " * (ident * 2) + str(message)) 83 | 84 | # ------------------------------------------------------------------------------ 85 | 86 | if sys.version_info[0] >= 3: 87 | # Python 3 interpreter : bytes & str 88 | def to_bytes(data, encoding="UTF-8"): 89 | """ 90 | Converts the given string to an array of bytes. 91 | Returns the first parameter if it is already an array of bytes. 
92 | 93 | :param data: A unicode string 94 | :param encoding: The encoding of data 95 | :return: The corresponding array of bytes 96 | """ 97 | if type(data) is bytes: 98 | # Nothing to do 99 | return data 100 | return data.encode(encoding) 101 | 102 | def to_str(data, encoding="UTF-8"): 103 | """ 104 | Converts the given parameter to a string. 105 | Returns the first parameter if it is already an instance of ``str``. 106 | 107 | :param data: A string 108 | :param encoding: The encoding of data 109 | :return: The corresponding string 110 | """ 111 | if type(data) is str: 112 | # Nothing to do 113 | return data 114 | return str(data, encoding) 115 | 116 | def read_to_str(data): 117 | """ 118 | Concats all bytes into a string 119 | """ 120 | return ''.join(chr(char) for char in data) 121 | 122 | else: 123 | # Python 2 interpreter : str & unicode 124 | def to_str(data, encoding="UTF-8"): 125 | """ 126 | Converts the given parameter to a string. 127 | Returns the first parameter if it is already an instance of ``str``. 128 | 129 | :param data: A string 130 | :param encoding: The encoding of data 131 | :return: The corresponding string 132 | """ 133 | if type(data) is str: 134 | # Nothing to do 135 | return data 136 | return data.encode(encoding) 137 | 138 | # Same operation 139 | to_bytes = to_str 140 | 141 | def read_to_str(data): 142 | """ 143 | Nothing to do in Python 2 144 | """ 145 | return data 146 | 147 | # ------------------------------------------------------------------------------ 148 | 149 | 150 | def load(file_object, *transformers, **kwargs): 151 | """ 152 | Deserializes Java primitive data and objects serialized using 153 | ObjectOutputStream from a file-like object. 
154 | 155 | :param file_object: A file-like object 156 | :param transformers: Custom transformers to use 157 | :param ignore_remaining_data: If True, don't log an error when unused 158 | trailing bytes are remaining 159 | :return: The deserialized object 160 | """ 161 | # Read keyword argument 162 | ignore_remaining_data = kwargs.get('ignore_remaining_data', False) 163 | 164 | marshaller = JavaObjectUnmarshaller( 165 | file_object, kwargs.get('bytes_callback')) 166 | 167 | # Add custom transformers first 168 | for transformer in transformers: 169 | marshaller.add_transformer(transformer) 170 | marshaller.add_transformer(DefaultObjectTransformer()) 171 | 172 | # Read the file object 173 | return marshaller.readObject(ignore_remaining_data=ignore_remaining_data) 174 | 175 | 176 | def loads(string, *transformers, **kwargs): 177 | """ 178 | Deserializes Java objects and primitive data serialized using 179 | ObjectOutputStream from a string. 180 | 181 | :param string: A Java data string 182 | :param transformers: Custom transformers to use 183 | :param ignore_remaining_data: If True, don't log an error when unused 184 | trailing bytes are remaining 185 | :return: The deserialized object 186 | """ 187 | # Read keyword argument 188 | ignore_remaining_data = kwargs.get('ignore_remaining_data', False) 189 | 190 | # Reuse the load method (avoid code duplication) 191 | return load(BytesIO(string), *transformers, 192 | ignore_remaining_data=ignore_remaining_data) 193 | 194 | 195 | def dumps(obj, *transformers): 196 | """ 197 | Serializes Java primitive data and objects unmarshaled by load(s) before 198 | into string. 
199 | 200 | :param obj: A Python primitive object, or one loaded using load(s) 201 | :param transformers: Custom transformers to use 202 | :return: The serialized data as a string 203 | """ 204 | marshaller = JavaObjectMarshaller() 205 | # Add custom transformers 206 | for transformer in transformers: 207 | marshaller.add_transformer(transformer) 208 | 209 | return marshaller.dump(obj) 210 | 211 | # ------------------------------------------------------------------------------ 212 | 213 | 214 | class JavaClass(object): 215 | """ 216 | Represents a class in the Java world 217 | """ 218 | def __init__(self): 219 | """ 220 | Sets up members 221 | """ 222 | self.name = None 223 | self.serialVersionUID = None 224 | self.flags = None 225 | self.fields_names = [] 226 | self.fields_types = [] 227 | self.superclass = None 228 | 229 | def __str__(self): 230 | """ 231 | String representation of the Java class 232 | """ 233 | return self.__repr__() 234 | 235 | def __repr__(self): 236 | """ 237 | String representation of the Java class 238 | """ 239 | return "[{0:s}:0x{1:X}]".format(self.name, self.serialVersionUID) 240 | 241 | def __eq__(self, other): 242 | """ 243 | Equality test between two Java classes 244 | 245 | :param other: Other JavaClass to test 246 | :return: True if both classes share the same fields and name 247 | """ 248 | if not isinstance(other, type(self)): 249 | return False 250 | 251 | return (self.name == other.name and 252 | self.serialVersionUID == other.serialVersionUID and 253 | self.flags == other.flags and 254 | self.fields_names == other.fields_names and 255 | self.fields_types == other.fields_types and 256 | self.superclass == other.superclass) 257 | 258 | 259 | class JavaObject(object): 260 | """ 261 | Represents a deserialized non-primitive Java object 262 | """ 263 | def __init__(self): 264 | """ 265 | Sets up members 266 | """ 267 | self.classdesc = None 268 | self.annotations = [] 269 | 270 | def get_class(self): 271 | """ 272 | Returns the 
JavaClass that defines the type of this object 273 | """ 274 | return self.classdesc 275 | 276 | def __str__(self): 277 | """ 278 | String representation 279 | """ 280 | return self.__repr__() 281 | 282 | def __repr__(self): 283 | """ 284 | String representation 285 | """ 286 | name = "UNKNOWN" 287 | if self.classdesc: 288 | name = self.classdesc.name 289 | return "".format(name) 290 | 291 | def __eq__(self, other): 292 | """ 293 | Equality test between two Java classes 294 | 295 | :param other: Other JavaClass to test 296 | :return: True if both classes share the same fields and name 297 | """ 298 | if not isinstance(other, type(self)): 299 | return False 300 | 301 | res = (self.classdesc == other.classdesc and 302 | self.annotations == other.annotations) 303 | if not res: 304 | return False 305 | 306 | for name in self.classdesc.fields_names: 307 | if not (getattr(self, name) == getattr(other, name)): 308 | return False 309 | return True 310 | 311 | 312 | class JavaString(str): 313 | """ 314 | Represents a Java String 315 | """ 316 | def __hash__(self): 317 | return str.__hash__(self) 318 | 319 | def __eq__(self, other): 320 | if not isinstance(other, str): 321 | return False 322 | return str.__eq__(self, other) 323 | 324 | 325 | class JavaEnum(JavaObject): 326 | """ 327 | Represents a Java enumeration 328 | """ 329 | def __init__(self, constant=None): 330 | super(JavaEnum, self).__init__() 331 | self.constant = constant 332 | 333 | 334 | class JavaArray(list, JavaObject): 335 | """ 336 | Represents a Java Array 337 | """ 338 | def __init__(self, classdesc=None): 339 | list.__init__(self) 340 | JavaObject.__init__(self) 341 | self.classdesc = classdesc 342 | 343 | # ------------------------------------------------------------------------------ 344 | 345 | 346 | class JavaObjectConstants(object): 347 | """ 348 | Defines the constants of the Java serialization format 349 | """ 350 | STREAM_MAGIC = 0xaced 351 | STREAM_VERSION = 0x05 352 | 353 | TC_NULL = 0x70 354 | 
TC_REFERENCE = 0x71 355 | TC_CLASSDESC = 0x72 356 | TC_OBJECT = 0x73 357 | TC_STRING = 0x74 358 | TC_ARRAY = 0x75 359 | TC_CLASS = 0x76 360 | TC_BLOCKDATA = 0x77 361 | TC_ENDBLOCKDATA = 0x78 362 | TC_RESET = 0x79 363 | TC_BLOCKDATALONG = 0x7A 364 | TC_EXCEPTION = 0x7B 365 | TC_LONGSTRING = 0x7C 366 | TC_PROXYCLASSDESC = 0x7D 367 | TC_ENUM = 0x7E 368 | # Ignore TC_MAX: we don't use it and it messes with TC_ENUM 369 | # TC_MAX = 0x7E 370 | 371 | # classDescFlags 372 | SC_WRITE_METHOD = 0x01 # if SC_SERIALIZABLE 373 | SC_BLOCK_DATA = 0x08 # if SC_EXTERNALIZABLE 374 | SC_SERIALIZABLE = 0x02 375 | SC_EXTERNALIZABLE = 0x04 376 | SC_ENUM = 0x10 377 | 378 | # type definition chars (typecode) 379 | TYPE_BYTE = 'B' # 0x42 380 | TYPE_CHAR = 'C' # 0x43 381 | TYPE_DOUBLE = 'D' # 0x44 382 | TYPE_FLOAT = 'F' # 0x46 383 | TYPE_INTEGER = 'I' # 0x49 384 | TYPE_LONG = 'J' # 0x4A 385 | TYPE_SHORT = 'S' # 0x53 386 | TYPE_BOOLEAN = 'Z' # 0x5A 387 | TYPE_OBJECT = 'L' # 0x4C 388 | TYPE_ARRAY = '[' # 0x5B 389 | 390 | # list of supported typecodes listed above 391 | TYPECODES_LIST = [ 392 | # primitive types 393 | TYPE_BYTE, 394 | TYPE_CHAR, 395 | TYPE_DOUBLE, 396 | TYPE_FLOAT, 397 | TYPE_INTEGER, 398 | TYPE_LONG, 399 | TYPE_SHORT, 400 | TYPE_BOOLEAN, 401 | # object types 402 | TYPE_OBJECT, 403 | TYPE_ARRAY] 404 | 405 | BASE_REFERENCE_IDX = 0x7E0000 406 | 407 | 408 | class OpCodeDebug(object): 409 | # Type codes 410 | OP_CODE = dict((getattr(JavaObjectConstants, key), key) 411 | for key in dir(JavaObjectConstants) 412 | if key.startswith("TC_")) 413 | 414 | TYPE = dict((getattr(JavaObjectConstants, key), key) 415 | for key in dir(JavaObjectConstants) 416 | if key.startswith("TYPE_")) 417 | 418 | STREAM_CONSTANT = dict((getattr(JavaObjectConstants, key), key) 419 | for key in dir(JavaObjectConstants) 420 | if key.startswith("SC_")) 421 | 422 | @staticmethod 423 | def op_id(op_id): 424 | return OpCodeDebug.OP_CODE.get( 425 | op_id, "".format(op_id)) 426 | 427 | @staticmethod 428 | def 
type_code(type_id): 429 | return OpCodeDebug.TYPE.get( 430 | type_id, "".format(type_id)) 431 | 432 | @staticmethod 433 | def flags(flags): 434 | names = sorted( 435 | descr for key, descr in OpCodeDebug.STREAM_CONSTANT.items() 436 | if key & flags) 437 | return ', '.join(names) 438 | 439 | 440 | # ------------------------------------------------------------------------------ 441 | 442 | 443 | class JavaObjectUnmarshaller(JavaObjectConstants): 444 | """ 445 | Deserializes a Java serialization stream 446 | """ 447 | def __init__(self, stream, bytes_callback=None): 448 | """ 449 | Sets up members 450 | 451 | :param stream: An input stream (opened in binary/bytes mode) 452 | :raise IOError: Invalid input stream 453 | """ 454 | self.bytes_callback = bytes_callback 455 | 456 | # Check stream 457 | if stream is None: 458 | raise IOError("No input stream given") 459 | 460 | # Prepare the association Terminal Symbol -> Reading method 461 | self.opmap = { 462 | self.TC_NULL: self.do_null, 463 | self.TC_CLASSDESC: self.do_classdesc, 464 | self.TC_OBJECT: self.do_object, 465 | self.TC_STRING: self.do_string, 466 | self.TC_LONGSTRING: self.do_string_long, 467 | self.TC_ARRAY: self.do_array, 468 | self.TC_CLASS: self.do_class, 469 | self.TC_BLOCKDATA: self.do_blockdata, 470 | self.TC_BLOCKDATALONG: self.do_blockdata_long, 471 | self.TC_REFERENCE: self.do_reference, 472 | self.TC_ENUM: self.do_enum, 473 | # note that we are reusing do_null: 474 | self.TC_ENDBLOCKDATA: self.do_null, 475 | } 476 | 477 | # Set up members 478 | self.current_object = None 479 | self.reference_counter = 0 480 | self.references = [] 481 | self.object_transformers = [] 482 | self.object_stream = stream 483 | 484 | # Read the stream header (magic & version) 485 | self._readStreamHeader() 486 | 487 | def readObject(self, ignore_remaining_data=False): 488 | """ 489 | Reads an object from the input stream 490 | 491 | :param ignore_remaining_data: If True, don't log an error when 492 | unused trailing bytes 
are remaining 493 | :return: The unmarshalled object 494 | :raise Exception: Any exception that occurred during unmarshalling 495 | """ 496 | try: 497 | # TODO: add expects 498 | _, res = self._read_and_exec_opcode(ident=0) 499 | log_debug("Java Object unmarshalled successfully!") 500 | return res 501 | except Exception: 502 | self._oops_dump_state(ignore_remaining_data) 503 | raise 504 | 505 | def add_transformer(self, transformer): 506 | """ 507 | Appends an object transformer to the deserialization process 508 | 509 | :param transformer: An object with a transform(obj) method 510 | """ 511 | self.object_transformers.append(transformer) 512 | 513 | def _readStreamHeader(self): 514 | """ 515 | Reads the magic header of a Java serialization stream 516 | 517 | :raise IOError: Invalid magic header (not a Java stream) 518 | """ 519 | (magic, version) = self._readStruct(">HH") 520 | if magic != self.STREAM_MAGIC or version != self.STREAM_VERSION: 521 | raise IOError("The stream is not java serialized object. 
" 522 | "Invalid stream header: {0:04X}{1:04X}" 523 | .format(magic, version)) 524 | 525 | def _read_and_exec_opcode(self, ident=0, expect=None): 526 | """ 527 | Reads the next opcode, and executes its handler 528 | 529 | :param ident: Log identation level 530 | :param expect: A list of expected opcodes 531 | :return: A tuple: (opcode, result of the handler) 532 | :raise IOError: Read opcode is not one of the expected ones 533 | :raise RuntimeError: Unknown opcode 534 | """ 535 | position = self.object_stream.tell() 536 | (opid,) = self._readStruct(">B") 537 | log_debug("OpCode: 0x{0:X} -- {1} (at offset 0x{2:X})" 538 | .format(opid, OpCodeDebug.op_id(opid), position), ident) 539 | 540 | if expect and opid not in expect: 541 | raise IOError( 542 | "Unexpected opcode 0x{0:X} -- {1} (at offset 0x{2:X})" 543 | .format(opid, OpCodeDebug.op_id(opid), position)) 544 | 545 | try: 546 | handler = self.opmap[opid] 547 | except KeyError: 548 | raise RuntimeError( 549 | "Unknown OpCode in the stream: 0x{0:X} (at offset 0x{1:X})" 550 | .format(opid, position)) 551 | else: 552 | return opid, handler(ident=ident) 553 | 554 | def _readStruct(self, unpack): 555 | """ 556 | Reads from the input stream, using struct 557 | 558 | :param unpack: An unpack format string 559 | :return: The result of struct.unpack (tuple) 560 | :raise RuntimeError: End of stream reached during unpacking 561 | """ 562 | length = struct.calcsize(unpack) 563 | ba = self.object_stream.read(length) 564 | 565 | if len(ba) != length: 566 | raise RuntimeError("Stream has been ended unexpectedly while " 567 | "unmarshaling.") 568 | 569 | return struct.unpack(unpack, ba) 570 | 571 | def _readString(self, length_fmt="H"): 572 | """ 573 | Reads a serialized string 574 | 575 | :param length_fmt: Structure format of the string length (H or Q) 576 | :return: The deserialized string 577 | :raise RuntimeError: Unexpected end of stream 578 | """ 579 | (length,) = self._readStruct(">{0}".format(length_fmt)) 580 | ba = 
self.object_stream.read(length) 581 | return to_str(ba) 582 | 583 | def do_classdesc(self, parent=None, ident=0): 584 | """ 585 | Handles a TC_CLASSDESC opcode 586 | 587 | :param parent: 588 | :param ident: Log indentation level 589 | :return: A JavaClass object 590 | """ 591 | # TC_CLASSDESC className serialVersionUID newHandle classDescInfo 592 | # classDescInfo: 593 | # classDescFlags fields classAnnotation superClassDesc 594 | # classDescFlags: 595 | # (byte) // Defined in Terminal Symbols and Constants 596 | # fields: 597 | # (short) fieldDesc[count] 598 | 599 | # fieldDesc: 600 | # primitiveDesc 601 | # objectDesc 602 | # primitiveDesc: 603 | # prim_typecode fieldName 604 | # objectDesc: 605 | # obj_typecode fieldName className1 606 | clazz = JavaClass() 607 | log_debug("[classdesc]", ident) 608 | class_name = self._readString() 609 | clazz.name = class_name 610 | log_debug("Class name: %s" % class_name, ident) 611 | 612 | # serialVersionUID is a Java (signed) long => 8 bytes 613 | serialVersionUID, classDescFlags = self._readStruct(">qB") 614 | clazz.serialVersionUID = serialVersionUID 615 | clazz.flags = classDescFlags 616 | 617 | self._add_reference(clazz, ident) 618 | 619 | log_debug("Serial: 0x{0:X} / {0:d} - classDescFlags: 0x{1:X} {2}" 620 | .format(serialVersionUID, classDescFlags, 621 | OpCodeDebug.flags(classDescFlags)), ident) 622 | (length,) = self._readStruct(">H") 623 | log_debug("Fields num: 0x{0:X}".format(length), ident) 624 | 625 | clazz.fields_names = [] 626 | clazz.fields_types = [] 627 | for fieldId in range(length): 628 | (typecode,) = self._readStruct(">B") 629 | field_name = self._readString() 630 | field_type = self._convert_char_to_type(typecode) 631 | 632 | log_debug("> Reading field {0}".format(field_name), ident) 633 | 634 | if field_type == self.TYPE_ARRAY: 635 | _, field_type = self._read_and_exec_opcode( 636 | ident=ident + 1, 637 | expect=(self.TC_STRING, self.TC_REFERENCE)) 638 | 639 | if type(field_type) is not JavaString: 
640 | raise AssertionError("Field type must be a JavaString, " 641 | "not {0}".format(type(field_type))) 642 | 643 | elif field_type == self.TYPE_OBJECT: 644 | _, field_type = self._read_and_exec_opcode( 645 | ident=ident + 1, 646 | expect=(self.TC_STRING, self.TC_REFERENCE)) 647 | 648 | if type(field_type) is JavaClass: 649 | # FIXME: ugly trick 650 | field_type = JavaString(field_type.name) 651 | 652 | if type(field_type) is not JavaString: 653 | raise AssertionError("Field type must be a JavaString, " 654 | "not {0}".format(type(field_type))) 655 | 656 | log_debug("< FieldName: 0x{0:X} Name:{1} Type:{2} ID:{3}" 657 | .format(typecode, field_name, field_type, fieldId), 658 | ident) 659 | assert field_name is not None 660 | assert field_type is not None 661 | 662 | clazz.fields_names.append(field_name) 663 | clazz.fields_types.append(field_type) 664 | 665 | if parent: 666 | parent.__fields = clazz.fields_names 667 | parent.__types = clazz.fields_types 668 | 669 | # classAnnotation 670 | (opid,) = self._readStruct(">B") 671 | log_debug("OpCode: 0x{0:X} -- {1} (classAnnotation)" 672 | .format(opid, OpCodeDebug.op_id(opid)), ident) 673 | if opid != self.TC_ENDBLOCKDATA: 674 | raise NotImplementedError("classAnnotation isn't implemented yet") 675 | 676 | # superClassDesc 677 | log_debug("Reading Super Class of {0}".format(clazz.name), ident) 678 | _, superclassdesc = self._read_and_exec_opcode( 679 | ident=ident + 1, 680 | expect=(self.TC_CLASSDESC, self.TC_NULL, self.TC_REFERENCE)) 681 | log_debug("Super Class for {0}: {1}" 682 | .format(clazz.name, str(superclassdesc)), ident) 683 | clazz.superclass = superclassdesc 684 | return clazz 685 | 686 | def do_blockdata(self, parent=None, ident=0): 687 | """ 688 | Handles TC_BLOCKDATA opcode 689 | 690 | :param parent: 691 | :param ident: Log indentation level 692 | :return: A string containing the block data 693 | """ 694 | # TC_BLOCKDATA (unsigned byte) (byte)[size] 695 | log_debug("[blockdata]", ident) 696 | (length,) = 
self._readStruct(">B") 697 | ba = self.object_stream.read(length) 698 | 699 | # Ensure we have an str 700 | return read_to_str(ba) 701 | 702 | def do_blockdata_long(self, parent=None, ident=0): 703 | """ 704 | Handles TC_BLOCKDATALONG opcode 705 | 706 | :param parent: 707 | :param ident: Log indentation level 708 | :return: A string containing the block data 709 | """ 710 | # TC_BLOCKDATALONG (int) (byte)[size] 711 | log_debug("[blockdatalong]", ident) 712 | (length,) = self._readStruct(">I") 713 | ba = self.object_stream.read(length) 714 | 715 | # Ensure we have an str 716 | return read_to_str(ba) 717 | 718 | def do_class(self, parent=None, ident=0): 719 | """ 720 | Handles TC_CLASS opcode 721 | 722 | :param parent: 723 | :param ident: Log indentation level 724 | :return: A JavaClass object 725 | """ 726 | # TC_CLASS classDesc newHandle 727 | log_debug("[class]", ident) 728 | 729 | # TODO: what to do with "(ClassDesc)prevObject". 730 | # (see 3rd line for classDesc:) 731 | _, classdesc = self._read_and_exec_opcode( 732 | ident=ident + 1, 733 | expect=(self.TC_CLASSDESC, self.TC_PROXYCLASSDESC, 734 | self.TC_NULL, self.TC_REFERENCE)) 735 | log_debug("Classdesc: {0}".format(classdesc), ident) 736 | self._add_reference(classdesc, ident) 737 | return classdesc 738 | 739 | def do_object(self, parent=None, ident=0): 740 | """ 741 | Handles a TC_OBJECT opcode 742 | 743 | :param parent: 744 | :param ident: Log indentation level 745 | :return: A JavaClass object 746 | """ 747 | # TC_OBJECT classDesc newHandle classdata[] // data for each class 748 | java_object = JavaObject() 749 | log_debug("[object]", ident) 750 | log_debug("java_object.annotations just after instantiation: {0}" 751 | .format(java_object.annotations), ident) 752 | 753 | # TODO: what to do with "(ClassDesc)prevObject". 
754 | # (see 3rd line for classDesc:) 755 | opcode, classdesc = self._read_and_exec_opcode( 756 | ident=ident + 1, 757 | expect=(self.TC_CLASSDESC, self.TC_PROXYCLASSDESC, 758 | self.TC_NULL, self.TC_REFERENCE)) 759 | # self.TC_REFERENCE hasn't shown in spec, but actually is here 760 | 761 | # Create object 762 | for transformer in self.object_transformers: 763 | java_object = transformer.create(classdesc) 764 | if java_object: 765 | break 766 | 767 | # Store classdesc of this object 768 | java_object.classdesc = classdesc 769 | 770 | # Store the reference 771 | self._add_reference(java_object, ident) 772 | 773 | # classdata[] 774 | 775 | if classdesc.flags & self.SC_EXTERNALIZABLE \ 776 | and not classdesc.flags & self.SC_BLOCK_DATA: 777 | # TODO: 778 | raise NotImplementedError("externalContents isn't implemented yet") 779 | 780 | if classdesc.flags & self.SC_SERIALIZABLE: 781 | # TODO: look at ObjectInputStream.readSerialData() 782 | # FIXME: Handle the SC_WRITE_METHOD flag 783 | 784 | # create megalist 785 | tempclass = classdesc 786 | megalist = [] 787 | megatypes = [] 788 | log_debug("Constructing class...", ident) 789 | while tempclass: 790 | log_debug("Class: {0}".format(tempclass.name), ident + 1) 791 | class_fields_str = ' - '.join( 792 | ' '.join((field_type, field_name)) 793 | for field_type, field_name 794 | in zip(tempclass.fields_types, tempclass.fields_names)) 795 | if class_fields_str: 796 | log_debug(class_fields_str, ident + 2) 797 | 798 | fieldscopy = tempclass.fields_names[:] 799 | fieldscopy.extend(megalist) 800 | megalist = fieldscopy 801 | 802 | fieldscopy = tempclass.fields_types[:] 803 | fieldscopy.extend(megatypes) 804 | megatypes = fieldscopy 805 | 806 | tempclass = tempclass.superclass 807 | 808 | log_debug("Values count: {0}".format(len(megalist)), ident) 809 | log_debug("Prepared list of values: {0}".format(megalist), ident) 810 | log_debug("Prepared list of types: {0}".format(megatypes), ident) 811 | 812 | for field_name, field_type 
in zip(megalist, megatypes): 813 | log_debug("Reading field: {0} - {1}" 814 | .format(field_type, field_name)) 815 | res = self._read_value(field_type, ident, name=field_name) 816 | java_object.__setattr__(field_name, res) 817 | 818 | if classdesc.flags & self.SC_SERIALIZABLE \ 819 | and classdesc.flags & self.SC_WRITE_METHOD \ 820 | or classdesc.flags & self.SC_EXTERNALIZABLE \ 821 | and classdesc.flags & self.SC_BLOCK_DATA: 822 | # objectAnnotation 823 | log_debug("java_object.annotations before: {0}" 824 | .format(java_object.annotations), ident) 825 | 826 | while opcode != self.TC_ENDBLOCKDATA: 827 | opcode, obj = self._read_and_exec_opcode(ident=ident + 1) 828 | # , expect=[self.TC_ENDBLOCKDATA, self.TC_BLOCKDATA, 829 | # self.TC_OBJECT, self.TC_NULL, self.TC_REFERENCE]) 830 | if opcode != self.TC_ENDBLOCKDATA: 831 | java_object.annotations.append(obj) 832 | 833 | log_debug("objectAnnotation value: {0}".format(obj), ident) 834 | 835 | log_debug("java_object.annotations after: {0}" 836 | .format(java_object.annotations), ident) 837 | 838 | log_debug(">>> java_object: {0}".format(java_object), ident) 839 | return java_object 840 | 841 | def do_string(self, parent=None, ident=0): 842 | """ 843 | Handles a TC_STRING opcode 844 | 845 | :param parent: 846 | :param ident: Log indentation level 847 | :return: A string 848 | """ 849 | log_debug("[string]", ident) 850 | ba = JavaString(self._readString()) 851 | self._add_reference(ba, ident) 852 | return ba 853 | 854 | def do_string_long(self, parent=None, ident=0): 855 | """ 856 | Handles a TC_LONGSTRING opcode 857 | 858 | :param parent: 859 | :param ident: Log indentation level 860 | :return: A string 861 | """ 862 | log_debug("[long string]", ident) 863 | ba = JavaString(self._readString("Q")) 864 | self._add_reference(ba, ident) 865 | return ba 866 | 867 | def do_array(self, parent=None, ident=0): 868 | """ 869 | Handles a TC_ARRAY opcode 870 | 871 | :param parent: 872 | :param ident: Log indentation level 873 | 
:return: A list of deserialized objects 874 | """ 875 | # TC_ARRAY classDesc newHandle (int) values[size] 876 | log_debug("[array]", ident) 877 | _, classdesc = self._read_and_exec_opcode( 878 | ident=ident + 1, 879 | expect=(self.TC_CLASSDESC, self.TC_PROXYCLASSDESC, 880 | self.TC_NULL, self.TC_REFERENCE)) 881 | 882 | array = JavaArray(classdesc) 883 | 884 | self._add_reference(array, ident) 885 | 886 | (size,) = self._readStruct(">i") 887 | log_debug("size: {0}".format(size), ident) 888 | 889 | type_char = classdesc.name[0] 890 | assert type_char == self.TYPE_ARRAY 891 | type_char = classdesc.name[1] 892 | 893 | if type_char == self.TYPE_OBJECT or type_char == self.TYPE_ARRAY: 894 | for _ in range(size): 895 | _, res = self._read_and_exec_opcode(ident=ident + 1) 896 | log_debug("Object value: {0}".format(res), ident) 897 | array.append(res) 898 | elif type_char == self.TYPE_BYTE: 899 | if self.bytes_callback is not None: 900 | array = self.bytes_callback(self.object_stream, size) 901 | else: 902 | array = self.object_stream.read(size) 903 | else: 904 | for _ in range(size): 905 | res = self._read_value(type_char, ident) 906 | log_debug("Native value: {0}".format(res), ident) 907 | array.append(res) 908 | 909 | return array 910 | 911 | def do_reference(self, parent=None, ident=0): 912 | """ 913 | Handles a TC_REFERENCE opcode 914 | 915 | :param parent: 916 | :param ident: Log indentation level 917 | :return: The referenced object 918 | """ 919 | (handle,) = self._readStruct(">L") 920 | log_debug("## Reference handle: 0x{0:X}".format(handle), ident) 921 | ref = self.references[handle - self.BASE_REFERENCE_IDX] 922 | log_debug("###-> Type: {0} - Value: {1}".format(type(ref), ref), ident) 923 | return ref 924 | 925 | @staticmethod 926 | def do_null(parent=None, ident=0): 927 | """ 928 | Handles a TC_NULL opcode 929 | 930 | :param parent: 931 | :param ident: Log indentation level 932 | :return: Always None 933 | """ 934 | return None 935 | 936 | def do_enum(self, 
parent=None, ident=0): 937 | """ 938 | Handles a TC_ENUM opcode 939 | 940 | :param parent: 941 | :param ident: Log indentation level 942 | :return: A JavaEnum object 943 | """ 944 | # TC_ENUM classDesc newHandle enumConstantName 945 | enum = JavaEnum() 946 | _, classdesc = self._read_and_exec_opcode( 947 | ident=ident + 1, 948 | expect=(self.TC_CLASSDESC, self.TC_PROXYCLASSDESC, 949 | self.TC_NULL, self.TC_REFERENCE)) 950 | enum.classdesc = classdesc 951 | self._add_reference(enum, ident) 952 | _, enumConstantName = self._read_and_exec_opcode( 953 | ident=ident + 1, expect=(self.TC_STRING, self.TC_REFERENCE)) 954 | enum.constant = enumConstantName 955 | return enum 956 | 957 | @staticmethod 958 | def _create_hexdump(src, start_offset=0, length=16): 959 | """ 960 | Prepares an hexadecimal dump string 961 | 962 | :param src: A string containing binary data 963 | :param start_offset: The start offset of the source 964 | :param length: Length of a dump line 965 | :return: A dump string 966 | """ 967 | FILTER = ''.join((len(repr(chr(x))) == 3) and chr(x) or '.' 
968 | for x in range(256)) 969 | pattern = "{{0:04X}} {{1:<{0}}} {{2}}\n".format(length * 3) 970 | 971 | # Convert raw data to str (Python 3 compatibility) 972 | src = to_str(src, 'latin-1') 973 | 974 | result = [] 975 | for i in range(0, len(src), length): 976 | s = src[i:i + length] 977 | hexa = ' '.join("{0:02X}".format(ord(x)) for x in s) 978 | printable = s.translate(FILTER) 979 | result.append(pattern.format(i + start_offset, hexa, printable)) 980 | 981 | return ''.join(result) 982 | 983 | def _read_value(self, field_type, ident, name=""): 984 | """ 985 | Reads the next value, of the given type 986 | 987 | :param field_type: A serialization typecode 988 | :param ident: Log indentation 989 | :param name: Field name (for logs) 990 | :return: The read value 991 | :raise RuntimeError: Unknown field type 992 | """ 993 | if len(field_type) > 1: 994 | # We don't need details for arrays and objects 995 | field_type = field_type[0] 996 | 997 | if field_type == self.TYPE_BOOLEAN: 998 | (val,) = self._readStruct(">B") 999 | res = bool(val) 1000 | elif field_type == self.TYPE_BYTE: 1001 | (res,) = self._readStruct(">b") 1002 | elif field_type == self.TYPE_CHAR: 1003 | (res,) = self._readStruct(">c") 1004 | elif field_type == self.TYPE_SHORT: 1005 | (res,) = self._readStruct(">h") 1006 | elif field_type == self.TYPE_INTEGER: 1007 | (res,) = self._readStruct(">i") 1008 | elif field_type == self.TYPE_LONG: 1009 | (res,) = self._readStruct(">q") 1010 | elif field_type == self.TYPE_FLOAT: 1011 | (res,) = self._readStruct(">f") 1012 | elif field_type == self.TYPE_DOUBLE: 1013 | (res,) = self._readStruct(">d") 1014 | elif field_type == self.TYPE_OBJECT or field_type == self.TYPE_ARRAY: 1015 | _, res = self._read_and_exec_opcode(ident=ident + 1) 1016 | else: 1017 | raise RuntimeError("Unknown typecode: {0}".format(field_type)) 1018 | 1019 | log_debug("* {0} {1}: {2}".format(field_type, name, res), ident) 1020 | return res 1021 | 1022 | def _convert_char_to_type(self, type_char): 
1023 | """ 1024 | Ensures a read character is a typecode. 1025 | 1026 | :param type_char: Read typecode 1027 | :return: The typecode as a string (using chr) 1028 | :raise RuntimeError: Unknown typecode 1029 | """ 1030 | typecode = type_char 1031 | if type(type_char) is int: 1032 | typecode = chr(type_char) 1033 | 1034 | if typecode in self.TYPECODES_LIST: 1035 | return typecode 1036 | else: 1037 | raise RuntimeError("Typecode {0} ({1}) isn't supported." 1038 | .format(type_char, typecode)) 1039 | 1040 | def _add_reference(self, obj, ident=0): 1041 | """ 1042 | Adds a read reference to the marshaler storage 1043 | 1044 | :param obj: Reference to add 1045 | :param ident: Log indentation level 1046 | """ 1047 | log_debug("## New reference handle 0x{0:X}: {1} -> {2}" 1048 | .format(len(self.references) + self.BASE_REFERENCE_IDX, 1049 | type(obj).__name__, obj), ident) 1050 | self.references.append(obj) 1051 | 1052 | def _oops_dump_state(self, ignore_remaining_data=False): 1053 | """ 1054 | Log a deserialization error 1055 | 1056 | :param ignore_remaining_data: If True, don't log an error when 1057 | unused trailing bytes are remaining 1058 | """ 1059 | log_error("==Oops state dump" + "=" * (30 - 17)) 1060 | log_error("References: {0}".format(self.references)) 1061 | log_error("Stream seeking back at -16 byte (2nd line is an actual " 1062 | "position!):") 1063 | 1064 | # Do not use a keyword argument 1065 | self.object_stream.seek(-16, os.SEEK_CUR) 1066 | position = self.object_stream.tell() 1067 | the_rest = self.object_stream.read() 1068 | 1069 | if not ignore_remaining_data and len(the_rest): 1070 | log_error("Warning!!!!: Stream still has {0} bytes left." 
1071 | .format(len(the_rest))) 1072 | log_error(self._create_hexdump(the_rest, position)) 1073 | 1074 | log_error("=" * 30) 1075 | 1076 | # ------------------------------------------------------------------------------ 1077 | 1078 | 1079 | class JavaObjectMarshaller(JavaObjectConstants): 1080 | """ 1081 | Serializes objects into Java serialization format 1082 | """ 1083 | def __init__(self, stream=None): 1084 | """ 1085 | Sets up members 1086 | 1087 | :param stream: An output stream 1088 | """ 1089 | self.object_stream = stream 1090 | self.object_obj = None 1091 | self.object_transformers = [] 1092 | self.references = [] 1093 | 1094 | def add_transformer(self, transformer): 1095 | """ 1096 | Appends an object transformer to the serialization process 1097 | 1098 | :param transformer: An object with a transform(obj) method 1099 | """ 1100 | self.object_transformers.append(transformer) 1101 | 1102 | def dump(self, obj): 1103 | """ 1104 | Dumps the given object in the Java serialization format 1105 | """ 1106 | self.references = [] 1107 | self.object_obj = obj 1108 | self.object_stream = BytesIO() 1109 | self._writeStreamHeader() 1110 | self.writeObject(obj) 1111 | return self.object_stream.getvalue() 1112 | 1113 | def _writeStreamHeader(self): 1114 | """ 1115 | Writes the Java serialization magic header in the serialization stream 1116 | """ 1117 | self._writeStruct(">HH", 4, (self.STREAM_MAGIC, self.STREAM_VERSION)) 1118 | 1119 | def writeObject(self, obj): 1120 | """ 1121 | Appends an object to the serialization stream 1122 | 1123 | :param obj: A string or a deserialized Java object 1124 | :raise RuntimeError: Unsupported type 1125 | """ 1126 | log_debug("Writing object of type {0}".format(type(obj).__name__)) 1127 | if isinstance(obj, JavaArray): 1128 | # Deserialized Java array 1129 | self.write_array(obj) 1130 | elif isinstance(obj, JavaEnum): 1131 | # Deserialized Java Enum 1132 | self.write_enum(obj) 1133 | elif isinstance(obj, JavaObject): 1134 | # 
Deserialized Java object 1135 | self.write_object(obj) 1136 | elif isinstance(obj, JavaString): 1137 | # Deserialized String 1138 | self.write_string(obj) 1139 | elif isinstance(obj, JavaClass): 1140 | # Java class 1141 | self.write_class(obj) 1142 | elif obj is None: 1143 | # Null 1144 | self.write_null() 1145 | elif type(obj) is str: 1146 | # String value 1147 | self.write_blockdata(obj) 1148 | else: 1149 | # Unhandled type 1150 | raise RuntimeError("Object serialization of type {0} is not " 1151 | "supported.".format(type(obj))) 1152 | 1153 | def _writeStruct(self, unpack, length, args): 1154 | """ 1155 | Appends data to the serialization stream 1156 | 1157 | :param unpack: Struct format string 1158 | :param length: Unused 1159 | :param args: Struct arguments 1160 | """ 1161 | ba = struct.pack(unpack, *args) 1162 | self.object_stream.write(ba) 1163 | 1164 | def _writeString(self, obj, use_reference=True): 1165 | """ 1166 | Appends a string to the serialization stream 1167 | 1168 | :param obj: String to serialize 1169 | :param use_reference: If True, allow writing a reference 1170 | """ 1171 | # TODO: Convert to "modified UTF-8" 1172 | # http://docs.oracle.com/javase/7/docs/api/java/io/DataInput.html#modified-utf-8 1173 | string = to_bytes(obj, "utf-8") 1174 | 1175 | if use_reference and isinstance(obj, JavaString): 1176 | try: 1177 | idx = self.references.index(obj) 1178 | except ValueError: 1179 | # First appearance of the string 1180 | self.references.append(obj) 1181 | logging.debug( 1182 | "*** Adding ref 0x%X for string: %s", 1183 | len(self.references) - 1 + self.BASE_REFERENCE_IDX, obj) 1184 | 1185 | self._writeStruct(">H", 2, (len(string),)) 1186 | self.object_stream.write(string) 1187 | else: 1188 | # Write a reference to the previous type 1189 | logging.debug("*** Reusing ref 0x%X for string: %s", 1190 | idx + self.BASE_REFERENCE_IDX, obj) 1191 | self.write_reference(idx) 1192 | else: 1193 | self._writeStruct(">H", 2, (len(string),)) 1194 | 
self.object_stream.write(string) 1195 | 1196 | def write_string(self, obj, use_reference=True): 1197 | """ 1198 | Writes a Java string with the TC_STRING type marker 1199 | 1200 | :param obj: The string to print 1201 | :param use_reference: If True, allow writing a reference 1202 | """ 1203 | if use_reference and isinstance(obj, JavaString): 1204 | try: 1205 | idx = self.references.index(obj) 1206 | except ValueError: 1207 | # String is not referenced: let _writeString store it 1208 | self._writeStruct(">B", 1, (self.TC_STRING,)) 1209 | self._writeString(obj, use_reference) 1210 | else: 1211 | # Reuse the referenced string 1212 | logging.debug("*** Reusing ref 0x%X for String: %s", 1213 | idx + self.BASE_REFERENCE_IDX, obj) 1214 | self.write_reference(idx) 1215 | else: 1216 | # Don't use references 1217 | self._writeStruct(">B", 1, (self.TC_STRING,)) 1218 | self._writeString(obj, use_reference) 1219 | 1220 | def write_enum(self, obj): 1221 | """ 1222 | Writes an Enum value 1223 | 1224 | :param obj: A JavaEnum object 1225 | """ 1226 | # FIXME: the output doesn't have the same references as the real 1227 | # serializable form 1228 | self._writeStruct(">B", 1, (self.TC_ENUM,)) 1229 | 1230 | try: 1231 | idx = self.references.index(obj) 1232 | except ValueError: 1233 | # New reference 1234 | self.references.append(obj) 1235 | logging.debug( 1236 | "*** Adding ref 0x%X for enum: %s", 1237 | len(self.references) - 1 + self.BASE_REFERENCE_IDX, obj) 1238 | 1239 | self.write_classdesc(obj.get_class()) 1240 | else: 1241 | self.write_reference(idx) 1242 | 1243 | self.write_string(obj.constant) 1244 | 1245 | def write_blockdata(self, obj, parent=None): 1246 | """ 1247 | Appends a block of data to the serialization stream 1248 | 1249 | :param obj: String form of the data block 1250 | """ 1251 | if type(obj) is str: 1252 | # Latin-1: keep bytes as is 1253 | obj = to_bytes(obj, "latin-1") 1254 | 1255 | length = len(obj) 1256 | if length <= 256: 1257 | # Small block data 1258 | # 
TC_BLOCKDATA (unsigned byte) (byte)[size] 1259 | self._writeStruct(">B", 1, (self.TC_BLOCKDATA,)) 1260 | self._writeStruct(">B", 1, (length,)) 1261 | else: 1262 | # Large block data 1263 | # TC_BLOCKDATALONG (unsigned int) (byte)[size] 1264 | self._writeStruct(">B", 1, (self.TC_BLOCKDATALONG,)) 1265 | self._writeStruct(">I", 1, (length,)) 1266 | 1267 | self.object_stream.write(obj) 1268 | 1269 | def write_null(self): 1270 | """ 1271 | Writes a "null" value 1272 | """ 1273 | self._writeStruct(">B", 1, (self.TC_NULL,)) 1274 | 1275 | def write_object(self, obj, parent=None): 1276 | """ 1277 | Writes an object header to the serialization stream 1278 | 1279 | :param obj: Not yet used 1280 | :param parent: Not yet used 1281 | """ 1282 | # Transform object 1283 | for transformer in self.object_transformers: 1284 | tmp_object = transformer.transform(obj) 1285 | if tmp_object is not obj: 1286 | obj = tmp_object 1287 | break 1288 | 1289 | self._writeStruct(">B", 1, (self.TC_OBJECT,)) 1290 | cls = obj.get_class() 1291 | self.write_classdesc(cls) 1292 | 1293 | # Add reference 1294 | self.references.append([]) 1295 | logging.debug( 1296 | "*** Adding ref 0x%X for object %s", 1297 | len(self.references) - 1 + self.BASE_REFERENCE_IDX, obj) 1298 | 1299 | all_names = collections.deque() 1300 | all_types = collections.deque() 1301 | tmpcls = cls 1302 | while tmpcls: 1303 | all_names.extendleft(reversed(tmpcls.fields_names)) 1304 | all_types.extendleft(reversed(tmpcls.fields_types)) 1305 | tmpcls = tmpcls.superclass 1306 | del tmpcls 1307 | 1308 | logging.debug("<=> Field names: %s", all_names) 1309 | logging.debug("<=> Field types: %s", all_types) 1310 | 1311 | for field_name, field_type in zip(all_names, all_types): 1312 | try: 1313 | logging.debug("Writing field %s (%s): %s", 1314 | field_name, field_type, getattr(obj, field_name)) 1315 | self._write_value(field_type, getattr(obj, field_name)) 1316 | except AttributeError as ex: 1317 | log_error("No attribute {0} for object 
{1}\nDir: {2}" 1318 | .format(ex, repr(obj), dir(obj))) 1319 | raise 1320 | del all_names, all_types 1321 | 1322 | if cls.flags & self.SC_SERIALIZABLE \ 1323 | and cls.flags & self.SC_WRITE_METHOD \ 1324 | or cls.flags & self.SC_EXTERNALIZABLE \ 1325 | and cls.flags & self.SC_BLOCK_DATA: 1326 | for annotation in obj.annotations: 1327 | log_debug("Write annotation {0} for {1}" 1328 | .format(repr(annotation), repr(obj))) 1329 | if annotation is None: 1330 | self.write_null() 1331 | else: 1332 | self.writeObject(annotation) 1333 | self._writeStruct('>B', 1, (self.TC_ENDBLOCKDATA,)) 1334 | 1335 | def write_class(self, obj, parent=None): 1336 | """ 1337 | Writes a class to the stream 1338 | 1339 | :param obj: A JavaClass object 1340 | :param parent: 1341 | """ 1342 | self._writeStruct(">B", 1, (self.TC_CLASS,)) 1343 | self.write_classdesc(obj) 1344 | 1345 | def write_classdesc(self, obj, parent=None): 1346 | """ 1347 | Writes a class description 1348 | 1349 | :param obj: Class description to write 1350 | :param parent: 1351 | """ 1352 | if obj not in self.references: 1353 | # Add reference 1354 | self.references.append(obj) 1355 | logging.debug( 1356 | "*** Adding ref 0x%X for classdesc %s", 1357 | len(self.references) - 1 + self.BASE_REFERENCE_IDX, obj.name) 1358 | 1359 | self._writeStruct(">B", 1, (self.TC_CLASSDESC,)) 1360 | self._writeString(obj.name) 1361 | self._writeStruct(">qB", 1, (obj.serialVersionUID, obj.flags)) 1362 | self._writeStruct(">H", 1, (len(obj.fields_names),)) 1363 | 1364 | for field_name, field_type \ 1365 | in zip(obj.fields_names, obj.fields_types): 1366 | self._writeStruct( 1367 | ">B", 1, (self._convert_type_to_char(field_type),)) 1368 | self._writeString(field_name) 1369 | if field_type[0] in (self.TYPE_OBJECT, self.TYPE_ARRAY): 1370 | try: 1371 | idx = self.references.index(field_type) 1372 | except ValueError: 1373 | # First appearance of the type 1374 | self.references.append(field_type) 1375 | logging.debug( 1376 | "*** Adding ref 0x%X 
for field type %s", 1377 | len(self.references) - 1 + self.BASE_REFERENCE_IDX, 1378 | field_type) 1379 | 1380 | self.write_string(field_type, False) 1381 | else: 1382 | # Write a reference to the previous type 1383 | logging.debug("*** Reusing ref 0x%X for %s (%s)", 1384 | idx + self.BASE_REFERENCE_IDX, 1385 | field_type, field_name) 1386 | self.write_reference(idx) 1387 | 1388 | self._writeStruct(">B", 1, (self.TC_ENDBLOCKDATA,)) 1389 | if obj.superclass: 1390 | self.write_classdesc(obj.superclass) 1391 | else: 1392 | self.write_null() 1393 | else: 1394 | # Use reference 1395 | self.write_reference(self.references.index(obj)) 1396 | 1397 | def write_reference(self, ref_index): 1398 | """ 1399 | Writes a reference 1400 | :param ref_index: Local index (0-based) to the reference 1401 | """ 1402 | self._writeStruct( 1403 | ">BL", 1, (self.TC_REFERENCE, ref_index + self.BASE_REFERENCE_IDX)) 1404 | 1405 | def write_array(self, obj): 1406 | """ 1407 | Writes a JavaArray 1408 | 1409 | :param obj: A JavaArray object 1410 | """ 1411 | classdesc = obj.get_class() 1412 | self._writeStruct(">B", 1, (self.TC_ARRAY,)) 1413 | self.write_classdesc(classdesc) 1414 | self._writeStruct(">i", 1, (len(obj),)) 1415 | 1416 | # Add reference 1417 | self.references.append(obj) 1418 | logging.debug( 1419 | "*** Adding ref 0x%X for array []", 1420 | len(self.references) - 1 + self.BASE_REFERENCE_IDX) 1421 | 1422 | type_char = classdesc.name[0] 1423 | assert type_char == self.TYPE_ARRAY 1424 | type_char = classdesc.name[1] 1425 | 1426 | if type_char == self.TYPE_OBJECT: 1427 | for o in obj: 1428 | self._write_value(classdesc.name[1:], o) 1429 | elif type_char == self.TYPE_ARRAY: 1430 | for a in obj: 1431 | self.write_array(a) 1432 | else: 1433 | log_debug("Write array of type %s" % type_char) 1434 | for v in obj: 1435 | self._write_value(type_char, v) 1436 | 1437 | def _write_value(self, field_type, value): 1438 | """ 1439 | Writes an item of an array 1440 | 1441 | :param field_type: Value 
type 1442 | :param value: The value itself 1443 | """ 1444 | if len(field_type) > 1: 1445 | # We don't need details for arrays and objects 1446 | field_type = field_type[0] 1447 | 1448 | if field_type == self.TYPE_BOOLEAN: 1449 | self._writeStruct(">B", 1, (1 if value else 0,)) 1450 | elif field_type == self.TYPE_BYTE: 1451 | self._writeStruct(">b", 1, (value,)) 1452 | elif field_type == self.TYPE_SHORT: 1453 | self._writeStruct(">h", 1, (value,)) 1454 | elif field_type == self.TYPE_INTEGER: 1455 | self._writeStruct(">i", 1, (value,)) 1456 | elif field_type == self.TYPE_LONG: 1457 | self._writeStruct(">q", 1, (value,)) 1458 | elif field_type == self.TYPE_FLOAT: 1459 | self._writeStruct(">f", 1, (value,)) 1460 | elif field_type == self.TYPE_DOUBLE: 1461 | self._writeStruct(">d", 1, (value,)) 1462 | elif field_type == self.TYPE_OBJECT or field_type == self.TYPE_ARRAY: 1463 | if value is None: 1464 | self.write_null() 1465 | elif isinstance(value, JavaEnum): 1466 | self.write_enum(value) 1467 | elif isinstance(value, JavaArray): 1468 | self.write_array(value) 1469 | elif isinstance(value, JavaObject): 1470 | self.write_object(value) 1471 | elif isinstance(value, JavaString): 1472 | self.write_string(value) 1473 | elif isinstance(value, str): 1474 | self.write_blockdata(value) 1475 | else: 1476 | raise RuntimeError("Unknown typecode: {0}".format(field_type)) 1477 | else: 1478 | raise RuntimeError("Unknown typecode: {0}".format(field_type)) 1479 | 1480 | def _convert_type_to_char(self, type_char): 1481 | """ 1482 | Converts the given type code to an int 1483 | 1484 | :param type_char: A type code character 1485 | """ 1486 | typecode = type_char 1487 | if type(type_char) is int: 1488 | typecode = chr(type_char) 1489 | 1490 | if typecode in self.TYPECODES_LIST: 1491 | return ord(typecode) 1492 | elif len(typecode) > 1: 1493 | if typecode[0] == 'L': 1494 | return ord(self.TYPE_OBJECT) 1495 | elif typecode[0] == '[': 1496 | return ord(self.TYPE_ARRAY) 1497 | 1498 | raise 
RuntimeError("Typecode {0} ({1}) isn't supported." 1499 | .format(type_char, typecode)) 1500 | 1501 | # ------------------------------------------------------------------------------ 1502 | 1503 | 1504 | class DefaultObjectTransformer(object): 1505 | """ 1506 | Default transformer for the deserialized objects. 1507 | Converts JavaObject objects to Python types (maps, lists, ...) 1508 | """ 1509 | class JavaList(list, JavaObject): 1510 | def __init__(self, *args, **kwargs): 1511 | list.__init__(self, *args, **kwargs) 1512 | JavaObject.__init__(self) 1513 | 1514 | class JavaMap(dict, JavaObject): 1515 | def __init__(self, *args, **kwargs): 1516 | dict.__init__(self, *args, **kwargs) 1517 | JavaObject.__init__(self) 1518 | 1519 | def create(self, classdesc): 1520 | """ 1521 | Transforms a deserialized Java object into a Python object 1522 | 1523 | :param java_object: A JavaObject instance 1524 | :return: The Python form of the object, or the original JavaObject 1525 | """ 1526 | 1527 | if classdesc.name in ("java.util.ArrayList", "java.util.LinkedList"): 1528 | # @serialData The length of the array backing the 1529 | # ArrayList instance is emitted (int), 1530 | # followed by all of its elements 1531 | # (each an Object) in the proper order 1532 | log_debug("---") 1533 | log_debug(classdesc.name) 1534 | log_debug("---") 1535 | 1536 | java_object = self.JavaList() 1537 | 1538 | log_debug(">>> java_object: {0}".format(java_object)) 1539 | return java_object 1540 | 1541 | if classdesc.name == "java.util.HashMap": 1542 | log_debug("---") 1543 | log_debug("java.util.HashMap") 1544 | log_debug("---") 1545 | 1546 | java_object = self.JavaMap() 1547 | 1548 | log_debug(">>> java_object: {0}".format(java_object)) 1549 | return java_object 1550 | 1551 | # Return a JavaObject by default 1552 | return JavaObject() 1553 | -------------------------------------------------------------------------------- /setup.py: 
-------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | 4 | setup( 5 | name="sparkpickle", 6 | description="Provides functions for reading SequenceFile-s with Python " 7 | "pickles.", 8 | version="1.0.1", 9 | license="Apache 2.0", 10 | author="Vadim Markovtsev", 11 | author_email="vadim@sourced.tech", 12 | url="https://github.com/src-d/sparkpickle", 13 | download_url='https://github.com/src-d/sparkpickle', 14 | packages=["sparkpickle"], 15 | package_dir={"sparkpickle": "."}, 16 | exclude=["test.py"], 17 | keywords=["spark", "pyspark", "hadoop", "rdd", "pickle"], 18 | install_requires=[], 19 | package_data={"": ["LICENSE", "README.md"]}, 20 | classifiers=[ 21 | "Development Status :: 3 - Alpha", 22 | "Environment :: Console", 23 | "Intended Audience :: Developers", 24 | "License :: OSI Approved :: Apache Software License", 25 | "Operating System :: POSIX", 26 | "Programming Language :: Python :: 2.7", 27 | "Programming Language :: Python :: 3.2", 28 | "Programming Language :: Python :: 3.3", 29 | "Programming Language :: Python :: 3.4", 30 | "Topic :: Software Development :: Libraries" 31 | ] 32 | ) -------------------------------------------------------------------------------- /test.2.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/src-d/sparkpickle/648bf2e7bd9b79679d44a8d01dc796285e881114/test.2.bin -------------------------------------------------------------------------------- /test.3.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/src-d/sparkpickle/648bf2e7bd9b79679d44a8d01dc796285e881114/test.3.bin -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import unittest 4 | 5 | import sparkpickle 6 | 7 | 8 | 
class SparkPickleTests(unittest.TestCase): 9 | def test_load(self): 10 | with open(os.path.join(os.path.dirname(__file__), 11 | "test.%d.bin" % sys.version_info[0]), 12 | "rb") as fin: 13 | objs = sparkpickle.load(fin) 14 | self.assertEqual(objs, list(range(200))) 15 | 16 | if __name__ == "__main__": 17 | unittest.main() 18 | --------------------------------------------------------------------------------