├── .gitignore
├── .idea
└── vcs.xml
├── .travis.yml
├── LICENSE
├── MANIFEST.in
├── README.md
├── __init__.py
├── __main__.py
├── doc
├── Makefile
├── _static
│ └── .gitignore
├── conf.py
├── index.rst
└── make.bat
├── javaobj.py
├── setup.py
├── test.2.bin
├── test.3.bin
└── test.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 |
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 |
60 | # Scrapy stuff:
61 | .scrapy
62 |
63 | # Sphinx documentation
64 | docs/_build/
65 |
66 | # PyBuilder
67 | target/
68 |
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 |
72 | # pyenv
73 | .python-version
74 |
75 | # celery beat schedule file
76 | celerybeat-schedule
77 |
78 | # dotenv
79 | .env
80 |
81 | # virtualenv
82 | venv/
83 | ENV/
84 |
85 | # Spyder project settings
86 | .spyderproject
87 |
88 | # Rope project settings
89 | .ropeproject
90 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: cpp
2 | dist: trusty
3 | sudo: required
4 |
5 | before_script:
6 | - pip3 install .
7 | - pip install .
8 |
9 | script:
10 | - python3 test.py
11 | - python test.py
12 |
13 | notifications:
14 | email: false
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
203 |
204 | =======================================================================
205 | Apache Spark Subcomponents:
206 |
207 | The Apache Spark project contains subcomponents with separate copyright
208 | notices and license terms. Your use of the source code for the these
209 | subcomponents is subject to the terms and conditions of the following
210 | licenses.
211 |
212 |
213 | ========================================================================
214 | For heapq (pyspark/heapq3.py):
215 | ========================================================================
216 |
217 | See license/LICENSE-heapq.txt
218 |
219 | ========================================================================
220 | For SnapTree:
221 | ========================================================================
222 |
223 | See license/LICENSE-SnapTree.txt
224 |
225 | ========================================================================
226 | For jbcrypt:
227 | ========================================================================
228 |
229 | See license/LICENSE-jbcrypt.txt
230 |
231 | ========================================================================
232 | BSD-style licenses
233 | ========================================================================
234 |
235 | The following components are provided under a BSD-style license. See project link for details.
236 | The text of each license is also included at licenses/LICENSE-[project].txt.
237 |
238 | (BSD 3 Clause) netlib core (com.github.fommil.netlib:core:1.1.2 - https://github.com/fommil/netlib-java/core)
239 | (BSD 3 Clause) JPMML-Model (org.jpmml:pmml-model:1.2.7 - https://github.com/jpmml/jpmml-model)
240 | (BSD License) AntLR Parser Generator (antlr:antlr:2.7.7 - http://www.antlr.org/)
241 | (BSD License) ANTLR 4.5.2-1 (org.antlr:antlr4:4.5.2-1 - http://wwww.antlr.org/)
242 | (BSD licence) ANTLR ST4 4.0.4 (org.antlr:ST4:4.0.4 - http://www.stringtemplate.org)
243 | (BSD licence) ANTLR StringTemplate (org.antlr:stringtemplate:3.2.1 - http://www.stringtemplate.org)
244 | (BSD License) Javolution (javolution:javolution:5.5.1 - http://javolution.org)
245 | (BSD) JLine (jline:jline:0.9.94 - http://jline.sourceforge.net)
246 | (BSD) ParaNamer Core (com.thoughtworks.paranamer:paranamer:2.3 - http://paranamer.codehaus.org/paranamer)
247 | (BSD) ParaNamer Core (com.thoughtworks.paranamer:paranamer:2.6 - http://paranamer.codehaus.org/paranamer)
248 | (BSD 3 Clause) Scala (http://www.scala-lang.org/download/#License)
249 | (Interpreter classes (all .scala files in repl/src/main/scala
250 | except for Main.Scala, SparkHelper.scala and ExecutorClassLoader.scala),
251 | and for SerializableMapWrapper in JavaUtils.scala)
252 | (BSD-like) Scala Actors library (org.scala-lang:scala-actors:2.11.7 - http://www.scala-lang.org/)
253 | (BSD-like) Scala Compiler (org.scala-lang:scala-compiler:2.11.7 - http://www.scala-lang.org/)
254 | (BSD-like) Scala Compiler (org.scala-lang:scala-reflect:2.11.7 - http://www.scala-lang.org/)
255 | (BSD-like) Scala Library (org.scala-lang:scala-library:2.11.7 - http://www.scala-lang.org/)
256 | (BSD-like) Scalap (org.scala-lang:scalap:2.11.7 - http://www.scala-lang.org/)
257 | (BSD-style) scalacheck (org.scalacheck:scalacheck_2.11:1.10.0 - http://www.scalacheck.org)
258 | (BSD-style) spire (org.spire-math:spire_2.11:0.7.1 - http://spire-math.org)
259 | (BSD-style) spire-macros (org.spire-math:spire-macros_2.11:0.7.1 - http://spire-math.org)
260 | (New BSD License) Kryo (com.esotericsoftware:kryo:3.0.3 - https://github.com/EsotericSoftware/kryo)
261 | (New BSD License) MinLog (com.esotericsoftware:minlog:1.3.0 - https://github.com/EsotericSoftware/minlog)
262 | (New BSD license) Protocol Buffer Java API (com.google.protobuf:protobuf-java:2.5.0 - http://code.google.com/p/protobuf)
263 | (New BSD license) Protocol Buffer Java API (org.spark-project.protobuf:protobuf-java:2.4.1-shaded - http://code.google.com/p/protobuf)
264 | (The BSD License) Fortran to Java ARPACK (net.sourceforge.f2j:arpack_combined_all:0.1 - http://f2j.sourceforge.net)
265 | (The BSD License) xmlenc Library (xmlenc:xmlenc:0.52 - http://xmlenc.sourceforge.net)
266 | (The New BSD License) Py4J (net.sf.py4j:py4j:0.10.4 - http://py4j.sourceforge.net/)
267 | (Two-clause BSD-style license) JUnit-Interface (com.novocode:junit-interface:0.10 - http://github.com/szeiger/junit-interface/)
268 | (BSD licence) sbt and sbt-launch-lib.bash
269 | (BSD 3 Clause) d3.min.js (https://github.com/mbostock/d3/blob/master/LICENSE)
270 | (BSD 3 Clause) DPark (https://github.com/douban/dpark/blob/master/LICENSE)
271 | (BSD 3 Clause) CloudPickle (https://github.com/cloudpipe/cloudpickle/blob/master/LICENSE)
272 |
273 | ========================================================================
274 | MIT licenses
275 | ========================================================================
276 |
277 | The following components are provided under the MIT License. See project link for details.
278 | The text of each license is also included at licenses/LICENSE-[project].txt.
279 |
280 | (MIT License) JCL 1.1.1 implemented over SLF4J (org.slf4j:jcl-over-slf4j:1.7.5 - http://www.slf4j.org)
281 | (MIT License) JUL to SLF4J bridge (org.slf4j:jul-to-slf4j:1.7.5 - http://www.slf4j.org)
282 | (MIT License) SLF4J API Module (org.slf4j:slf4j-api:1.7.5 - http://www.slf4j.org)
283 | (MIT License) SLF4J LOG4J-12 Binding (org.slf4j:slf4j-log4j12:1.7.5 - http://www.slf4j.org)
284 | (MIT License) pyrolite (org.spark-project:pyrolite:2.0.1 - http://pythonhosted.org/Pyro4/)
285 | (MIT License) scopt (com.github.scopt:scopt_2.11:3.2.0 - https://github.com/scopt/scopt)
286 | (The MIT License) Mockito (org.mockito:mockito-core:1.9.5 - http://www.mockito.org)
287 | (MIT License) jquery (https://jquery.org/license/)
288 | (MIT License) AnchorJS (https://github.com/bryanbraun/anchorjs)
289 | (MIT License) graphlib-dot (https://github.com/cpettitt/graphlib-dot)
290 | (MIT License) dagre-d3 (https://github.com/cpettitt/dagre-d3)
291 | (MIT License) sorttable (https://github.com/stuartlangridge/sorttable)
292 | (MIT License) boto (https://github.com/boto/boto/blob/develop/LICENSE)
293 | (MIT License) datatables (http://datatables.net/license)
294 | (MIT License) mustache (https://github.com/mustache/mustache/blob/master/LICENSE)
295 | (MIT License) cookies (http://code.google.com/p/cookies/wiki/License)
296 | (MIT License) blockUI (http://jquery.malsup.com/block/)
297 | (MIT License) RowsGroup (http://datatables.net/license/mit)
298 | (MIT License) jsonFormatter (http://www.jqueryscript.net/other/jQuery-Plugin-For-Pretty-JSON-Formatting-jsonFormatter.html)
299 | (MIT License) modernizr (https://github.com/Modernizr/Modernizr/blob/master/LICENSE)
300 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | exclude test.py
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [](https://travis-ci.org/src-d/sparkpickle) [](https://pypi.python.org/pypi/sparkpickle)
2 |
3 | SparkPickle
4 | ===========
5 |
6 | Pure Python implementation for reading SequenceFiles containing pickles written by
7 | Spark's [saveAsPickleFile()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.saveAsPickleFile).
8 | This is needed if you store the results from Spark in the efficient binary pickle
9 | format and want to load them locally on your computer, without any Spark installation,
10 | given only the actual files.
11 |
12 | [Article about creating this project.](https://blog.sourced.tech/post/reading_pyspark_pickles_locally)
13 |
14 | Installation
15 | ------------
16 | ```
17 | pip install sparkpickle
18 | ```
19 | Supports Python 2.7 and 3.x.
20 |
21 | Usage
22 | -----
23 | View the contents of the file via command line:
24 | ```
25 | python -m sparkpickle /path/to/file
26 | ```
27 |
28 | Code:
29 | ```python
30 | import sparkpickle
31 |
32 | with open("/path/to/file", "rb") as f:
33 |     for obj in sparkpickle.load_gen(f):
34 |         print(obj)
34 | ```
35 |
36 | API
37 | ---
38 | There are 3 functions: `load()`, `loads()` and `load_gen()`. The first two
39 | are similar to those found in "pickle" package, whereas the last one is the
40 | generator which yields deserialized objects and thus provides the minimal
41 | memory footprint.
42 |
43 | License
44 | -------
45 | Apache 2.0.
46 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Provides functions for reading `SequenceFile <https://wiki.apache.org/hadoop/SequenceFile>`_-s
3 | with Python pickles. Such files are usually created with
4 | :meth:`pyspark.rdd.RDD.saveAsPickleFile()`.
5 | No PySpark installation is required, no external dependencies.
6 |
7 | References:
8 | https://blog.sourced.tech/post/reading_pyspark_pickles_locally
9 | https://wiki.apache.org/hadoop/SequenceFile
10 | http://grepcode.com/file/repo1.maven.org/maven2/org.apache.hadoop/hadoop-common/2.7.1/org/apache/hadoop/io/SequenceFile.java#SequenceFile
11 | https://www.safaribooksonline.com/library/view/hadoop-the-definitive/9781449328917/ch04.html#id3960971
12 | https://docs.oracle.com/javase/7/docs/platform/serialization/spec/protocol.html#10258
13 | http://www.javaworld.com/article/2072752/the-java-serialization-algorithm-revealed.html
14 |
15 | :authors: Vadim Markovtsev
16 | :version: 1.0
17 | :status: Alpha
18 | :license: Apache License 2.0
19 |
20 | .. code-block:: none
21 |
22 | Copyright 2016 source{d}
23 |
24 | Licensed under the Apache License, Version 2.0 (the "License");
25 | you may not use this file except in compliance with the License.
26 | You may obtain a copy of the License at
27 |
28 | http://www.apache.org/licenses/LICENSE-2.0
29 |
30 | Unless required by applicable law or agreed to in writing, software
31 | distributed under the License is distributed on an "AS IS" BASIS,
32 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
33 | See the License for the specific language governing permissions and
34 | limitations under the License.
35 | """
36 |
37 | from io import BytesIO
38 | import pickle
39 | import struct
40 |
41 | from .javaobj import load as javaobj_load
42 |
43 | __all__ = ("load", "load_gen", "loads", "FormatError")
44 |
45 |
# Expected SequenceFile preamble, compared byte-for-byte against the start of
# the input: the magic "SEQ" + version byte 0x06, then the length-prefixed
# key class name "org.apache.hadoop.io.NullWritable" (0x21 = 33 chars) and
# value class name "org.apache.hadoop.io.BytesWritable" (0x22 = 34 chars),
# then six trailing zero bytes (presumably the compression flags and the
# metadata count -- TODO confirm against the SequenceFile spec).
HEADER = b"\x53\x45\x51\x06\x21\x6F\x72\x67\x2E\x61\x70\x61\x63\x68\x65\x2E" \
         b"\x68\x61\x64\x6F\x6F\x70\x2E\x69\x6F\x2E\x4E\x75\x6C\x6C\x57\x72" \
         b"\x69\x74\x61\x62\x6C\x65\x22\x6F\x72\x67\x2E\x61\x70\x61\x63\x68" \
         b"\x65\x2E\x68\x61\x64\x6F\x6F\x70\x2E\x69\x6F\x2E\x42\x79\x74\x65" \
         b"\x73\x57\x72\x69\x74\x61\x62\x6C\x65\x00\x00\x00\x00\x00\x00"
51 |
52 |
class FormatError(Exception):
    """
    Raised when the supplied binary data does not conform to the expected
    Spark SequenceFile-with-pickles layout.
    """
59 |
def load_gen(file, progress_callback=None):
    """
    Loads all the objects from the specified Spark SequenceFile with pickles
    (generator version of load()).
    The file is expected to be created with saveAsPickleFile() in PySpark.
    All the imported Python classes must be present in the current environment.

    :param file: `File object <https://docs.python.org/3/glossary.html#term-file-object>`_ \
           which is open in binary mode ("rb") and must be able to \
           read(), seek() and tell().
    :param progress_callback: Optional :func:`callable` to report the loading \
           progress. It must accept a single argument which is the current \
           file position.
    :return: The generator object. Every object is yield-ed while reading.
    :raises FormatError: something is wrong with the supplied binary file.

    Example::

        with open("/path/to/file", "rb") as f:
            for obj in sparkpickle.load_gen(f):
                print(obj)
    """
    header = file.read(len(HEADER))
    if header != HEADER:
        raise FormatError("Header validation failed.")
    # 16 bytes following the header: the file's sync mark, repeated later in
    # the stream after a -1 record-length sentinel.
    mark = file.read(16)
    # When the post-record probe below turns out to be the next record's
    # length (not a sync sentinel), it is carried over here so the next
    # iteration does not consume another 4 bytes.
    record_flag = None
    while True:
        if record_flag is None and not file.read(4):
            break  # clean EOF before a new record
        record_flag = None
        # The key is a NullWritable, so the 4-byte key length must be zero.
        if file.read(4) != b"\x00\x00\x00\x00":
            raise FormatError("Record validation failed.")
        object_size = file.read(4)
        try:
            object_size = struct.unpack(">I", object_size)[0]
        except (struct.error, ValueError):
            # BUGFIX: struct.unpack raises struct.error (not a ValueError
            # subclass) on a short read, so the original "except ValueError"
            # could never convert a truncated file into a FormatError.
            raise FormatError("Failed to parse BytesWritable.")
        object_start_pos = file.tell()
        batches = []

        def callback(_, size):
            # Called by the Java object stream parser when it reaches the raw
            # byte payload; that payload is a Python pickle holding a batch
            # (iterable) of objects.
            pos = file.tell()
            batches.append(pickle.load(file))
            if file.tell() - pos != size:
                raise FormatError("Object stream parsing integrity error.")
            if progress_callback is not None:
                progress_callback(pos + size)

        javaobj_load(file, ignore_remaining_data=True, bytes_callback=callback)
        if file.tell() - object_start_pos != object_size:
            raise FormatError("Object stream parsing integrity error.")
        for batch in batches:
            for obj in batch:
                yield obj
        del batches[:]
        probe = file.read(4)
        if probe == b"\xFF\xFF\xFF\xFF":
            # -1 record length announces a sync mark; it must match the one
            # read right after the header.
            if file.read(16) != mark:
                raise FormatError("Object stream parsing integrity error.")
        elif not probe:
            break  # EOF immediately after a record
        else:
            # We actually consumed the next record's length; remember it.
            record_flag = probe
125 |
def load(file, progress_callback=None):
    """
    Deserializes every object from the given Spark `SequenceFile <https://wiki.apache.org/hadoop/SequenceFile>`_
    with pickles and returns them as one flat list. The file is expected to
    have been created with :meth:`pyspark.RDD.saveAsPickleFile()` in PySpark.
    All the imported Python classes must be present in the current
    environment.

    :param file: `File object <https://docs.python.org/3/glossary.html#term-file-object>`_ \
           opened in binary mode ("rb"); it must support read(), seek() \
           and tell().
    :param progress_callback: Optional :func:`callable` invoked with the \
           current file position while loading.
    :return: The list with the loaded objects. Internal batches are flattened.
    """
    generator = load_gen(file, progress_callback=progress_callback)
    return list(generator)
143 |
def loads(buffer, progress_callback=None):
    """
    Deserializes every object from the in-memory contents of a Spark
    `SequenceFile <https://wiki.apache.org/hadoop/SequenceFile>`_
    with pickles. The data is expected to have been produced by
    :meth:`pyspark.RDD.saveAsPickleFile()` in PySpark.
    All the imported Python classes must be present in the current
    environment.

    :param buffer: The raw bytes of the file.
    :type buffer: bytes
    :param progress_callback: Optional :func:`callable` invoked with the \
           current position in the buffer while loading.
    :return: The list with the loaded objects. Internal batches are flattened.
    """
    stream = BytesIO(buffer)
    return load(stream, progress_callback=progress_callback)
--------------------------------------------------------------------------------
/__main__.py:
--------------------------------------------------------------------------------
1 | """
2 | Simple command-line application which prints the contents of a Spark pickle RDD file.
3 | """
4 |
5 | from __future__ import print_function
6 | import sys
7 |
8 | import sparkpickle
9 |
10 |
def main():
    """Print every object stored in the pickle RDD file named by argv[1]."""
    with open(sys.argv[1], "rb") as fin:
        count = 0
        last_type = None
        for obj in sparkpickle.load_gen(fin):
            last_type = type(obj)
            print(obj)
            count += 1
        print("-" * 80)
        print("Overall: %d objects of type %s" % (count, last_type))


if __name__ == "__main__":
    sys.exit(main())
24 |
--------------------------------------------------------------------------------
/doc/Makefile:
--------------------------------------------------------------------------------
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS ?=
SPHINXBUILD ?= python3 -msphinx
# Fixed: was "wmd-relax", left over from an unrelated project; conf.py
# declares the project name as "sparkpickle".
SPHINXPROJ ?= sparkpickle
SOURCEDIR ?= .
BUILDDIR ?= _build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/doc/_static/.gitignore:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/src-d/sparkpickle/648bf2e7bd9b79679d44a8d01dc796285e881114/doc/_static/.gitignore
--------------------------------------------------------------------------------
/doc/conf.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
#
# sparkpickle documentation build configuration file, created by
# sphinx-quickstart on Mon Jun 5 16:52:34 2017.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
# Make the documented package importable by autodoc: the module sources live
# one and two levels above this doc/ directory.
sys.path.insert(0, os.path.abspath('..'))
sys.path.insert(0, os.path.abspath('../..'))


# -- General configuration ------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = ['sphinx.ext.autodoc',
              'sphinx.ext.mathjax',
              'sphinx.ext.viewcode',
              'sphinx.ext.intersphinx']

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'

# The master toctree document.
master_doc = 'index'

# General information about the project.
project = u'sparkpickle'
copyright = u'2017, Vadim Markovtsev'
author = u'Vadim Markovtsev'

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = u'master'
# The full version, including alpha/beta/rc tags.
release = u'master'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
# None means no translation (English content).
language = None

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This patterns also effect to html_static_path and html_extra_path
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'

# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = False
82 |
83 |
84 | # -- Options for HTML output ----------------------------------------------
85 |
86 | # The theme to use for HTML and HTML Help pages. See the documentation for
87 | # a list of builtin themes.
88 | #
89 | html_theme = 'alabaster'
90 |
91 | # Theme options are theme-specific and customize the look and feel of a theme
92 | # further. For a list of options available for each theme, see the
93 | # documentation.
94 | #
95 | # html_theme_options = {}
96 |
97 | # Add any paths that contain custom static files (such as style sheets) here,
98 | # relative to this directory. They are copied after the builtin static files,
99 | # so a file named "default.css" will overwrite the builtin "default.css".
100 | html_static_path = ['_static']
101 |
102 |
103 | # -- Options for HTMLHelp output ------------------------------------------
104 |
105 | # Output file base name for HTML help builder.
106 | htmlhelp_basename = 'sparkpickledoc'
107 |
108 |
109 | # -- Options for LaTeX output ---------------------------------------------
110 |
111 | latex_elements = {
112 | # The paper size ('letterpaper' or 'a4paper').
113 | #
114 | # 'papersize': 'letterpaper',
115 |
116 | # The font size ('10pt', '11pt' or '12pt').
117 | #
118 | # 'pointsize': '10pt',
119 |
120 | # Additional stuff for the LaTeX preamble.
121 | #
122 | # 'preamble': '',
123 |
124 | # Latex figure (float) alignment
125 | #
126 | # 'figure_align': 'htbp',
127 | }
128 |
129 | # Grouping the document tree into LaTeX files. List of tuples
130 | # (source start file, target name, title,
131 | # author, documentclass [howto, manual, or own class]).
132 | latex_documents = [
133 | (master_doc, 'sparkpickle.tex', u'sparkpickle Documentation',
134 | u'Vadim Markovtsev', 'manual'),
135 | ]
136 |
137 |
138 | # -- Options for manual page output ---------------------------------------
139 |
140 | # One entry per manual page. List of tuples
141 | # (source start file, name, description, authors, manual section).
142 | man_pages = [
143 | (master_doc, 'sparkpickle', u'sparkpickle Documentation',
144 | [author], 1)
145 | ]
146 |
147 |
148 | # -- Options for Texinfo output -------------------------------------------
149 |
150 | # Grouping the document tree into Texinfo files. List of tuples
151 | # (source start file, target name, title, author,
152 | # dir menu entry, description, category)
153 | texinfo_documents = [
154 | (master_doc, 'sparkpickle', u'sparkpickle Documentation',
155 | author, 'sparkpickle', 'One line description of project.',
156 | 'Miscellaneous'),
157 | ]
158 |
159 | autodoc_default_flags = ['members', 'undoc-members', 'show-inheritance']
160 |
161 | intersphinx_mapping = {
162 | 'python': ('https://docs.python.org/3.6', None),
163 | 'spark': ('http://spark.apache.org/docs/latest/api/python/', None)}
164 |
--------------------------------------------------------------------------------
/doc/index.rst:
--------------------------------------------------------------------------------
1 | .. sparkpickle documentation master file, created by
2 | sphinx-quickstart on Mon Jun 5 16:52:34 2017.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | sparkpickle's documentation
7 | ===========================
8 |
9 | .. toctree::
10 | :maxdepth: 2
11 | :caption: Contents:
12 |
13 | .. automodule:: sparkpickle
14 | :members:
15 |
16 |
17 | Indices and tables
18 | ==================
19 |
20 | * :ref:`genindex`
21 | * :ref:`modindex`
22 | * :ref:`search`
23 |
--------------------------------------------------------------------------------
/doc/make.bat:
--------------------------------------------------------------------------------
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=python -msphinx
)
set SOURCEDIR=.
set BUILDDIR=_build
REM FIX: project name was left as "wmd-relax" from the template this file
REM was copied from; keep it consistent with doc/conf.py (sparkpickle).
set SPHINXPROJ=sparkpickle

if "%1" == "" goto help

REM Probe that the Sphinx module is importable before delegating to it.
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The Sphinx module was not found. Make sure you have Sphinx installed,
	echo.then set the SPHINXBUILD environment variable to point to the full
	echo.path of the 'sphinx-build' executable. Alternatively you may add the
	echo.Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%

:end
popd
--------------------------------------------------------------------------------
/javaobj.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # -- Content-Encoding: UTF-8 --
3 | """
4 | Provides functions for reading and writing (writing is WIP currently) Java
5 | objects serialized or will be deserialized by ObjectOutputStream. This form of
6 | object representation is a standard data interchange format in Java world.
7 |
8 | javaobj module exposes an API familiar to users of the standard library
9 | marshal, pickle and json modules.
10 |
11 | See:
12 | http://download.oracle.com/javase/6/docs/platform/serialization/spec/protocol.html
13 |
14 | :authors: Volodymyr Buell, Thomas Calmant
15 | :license: Apache License 2.0
16 | :version: 0.2.2
17 | :status: Alpha
18 |
19 | ..
20 |
21 | Copyright 2016 Thomas Calmant
22 |
23 | Licensed under the Apache License, Version 2.0 (the "License");
24 | you may not use this file except in compliance with the License.
25 | You may obtain a copy of the License at
26 |
27 | http://www.apache.org/licenses/LICENSE-2.0
28 |
29 | Unless required by applicable law or agreed to in writing, software
30 | distributed under the License is distributed on an "AS IS" BASIS,
31 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
32 | See the License for the specific language governing permissions and
33 | limitations under the License.
34 | """
35 |
36 | # Standard library
37 | import collections
38 | import logging
39 | import os
40 | import struct
41 | import sys
42 |
43 | try:
44 | # Python 2
45 | from StringIO import StringIO as BytesIO
46 | except ImportError:
47 | # Python 3+
48 | from io import BytesIO
49 |
50 | # ------------------------------------------------------------------------------
51 |
52 | # Module version
53 | __version_info__ = (0, 2, 2)
54 | __version__ = ".".join(str(x) for x in __version_info__)
55 |
56 | # Documentation strings format
57 | __docformat__ = "restructuredtext en"
58 |
59 | # ------------------------------------------------------------------------------
60 |
61 | # Setup the logger
62 | _log = logging.getLogger(__name__)
63 |
64 |
def log_debug(message, ident=0):
    """
    Logs a message at debug level

    :param message: Message to log
    :param ident: Number of indentation spaces
    """
    # Two spaces per indentation level, then the message
    prefix = " " * (2 * ident)
    _log.debug("%s%s" % (prefix, message))
73 |
74 |
def log_error(message, ident=0):
    """
    Logs a message at error level

    :param message: Message to log
    :param ident: Number of indentation spaces
    """
    # Two spaces per indentation level, then the message
    prefix = " " * (2 * ident)
    _log.error("%s%s" % (prefix, message))
83 |
84 | # ------------------------------------------------------------------------------
85 |
if sys.version_info[0] >= 3:
    # Python 3 interpreter: distinct bytes & str types
    def to_bytes(data, encoding="UTF-8"):
        """
        Converts the given string to an array of bytes.
        Returns the first parameter if it is already an array of bytes.

        :param data: A unicode string
        :param encoding: The encoding of data
        :return: The corresponding array of bytes
        """
        return data if type(data) is bytes else data.encode(encoding)

    def to_str(data, encoding="UTF-8"):
        """
        Converts the given parameter to a string.
        Returns the first parameter if it is already an instance of ``str``.

        :param data: A string
        :param encoding: The encoding of data
        :return: The corresponding string
        """
        return data if type(data) is str else str(data, encoding)

    def read_to_str(data):
        """
        Concats all bytes into a string
        """
        # Each byte becomes the character with the same ordinal
        return ''.join(map(chr, data))

else:
    # Python 2 interpreter: str & unicode
    def to_str(data, encoding="UTF-8"):
        """
        Converts the given parameter to a string.
        Returns the first parameter if it is already an instance of ``str``.

        :param data: A string
        :param encoding: The encoding of data
        :return: The corresponding string
        """
        return data if type(data) is str else data.encode(encoding)

    # On Python 2, str is already a byte string: same operation
    to_bytes = to_str

    def read_to_str(data):
        """
        Nothing to do in Python 2
        """
        return data
146 |
147 | # ------------------------------------------------------------------------------
148 |
149 |
def load(file_object, *transformers, **kwargs):
    """
    Deserializes Java primitive data and objects serialized using
    ObjectOutputStream from a file-like object.

    :param file_object: A file-like object
    :param transformers: Custom transformers to use
    :param ignore_remaining_data: If True, don't log an error when unused
                                  trailing bytes are remaining
    :return: The deserialized object
    """
    ignore_remaining_data = kwargs.get('ignore_remaining_data', False)

    unmarshaller = JavaObjectUnmarshaller(
        file_object, kwargs.get('bytes_callback'))

    # Custom transformers are consulted first, then the default one
    for custom_transformer in transformers:
        unmarshaller.add_transformer(custom_transformer)
    unmarshaller.add_transformer(DefaultObjectTransformer())

    # Parse the stream content
    return unmarshaller.readObject(ignore_remaining_data=ignore_remaining_data)
174 |
175 |
def loads(string, *transformers, **kwargs):
    """
    Deserializes Java objects and primitive data serialized using
    ObjectOutputStream from a string.

    :param string: A Java data string (bytes)
    :param transformers: Custom transformers to use
    :param ignore_remaining_data: If True, don't log an error when unused
                                  trailing bytes are remaining
    :param bytes_callback: Forwarded to the unmarshaller (see load())
    :return: The deserialized object
    """
    # Reuse the load method (avoid code duplication).
    # FIX: forward *all* keyword arguments -- the previous version only
    # forwarded ignore_remaining_data, silently dropping bytes_callback.
    return load(BytesIO(string), *transformers, **kwargs)
193 |
194 |
def dumps(obj, *transformers):
    """
    Serializes Java primitive data and objects unmarshaled by load(s) before
    into string.

    :param obj: A Python primitive object, or one loaded using load(s)
    :param transformers: Custom transformers to use
    :return: The serialized data as a string
    """
    writer = JavaObjectMarshaller()
    # Register custom transformers before dumping
    for custom_transformer in transformers:
        writer.add_transformer(custom_transformer)
    return writer.dump(obj)
210 |
211 | # ------------------------------------------------------------------------------
212 |
213 |
class JavaClass(object):
    """
    Represents a class in the Java world
    """
    # Attributes compared by __eq__, in the original comparison order
    _COMPARED_FIELDS = ('name', 'serialVersionUID', 'flags',
                        'fields_names', 'fields_types', 'superclass')

    def __init__(self):
        """
        Sets up members
        """
        self.name = None
        self.serialVersionUID = None
        self.flags = None
        self.fields_names = []
        self.fields_types = []
        self.superclass = None

    def __repr__(self):
        """
        String representation of the Java class
        """
        return "[{0:s}:0x{1:X}]".format(self.name, self.serialVersionUID)

    # Both str() and repr() share the same representation
    __str__ = __repr__

    def __eq__(self, other):
        """
        Equality test between two Java classes

        :param other: Other JavaClass to test
        :return: True if both classes share the same fields and name
        """
        if not isinstance(other, type(self)):
            return False
        return all(getattr(self, attr) == getattr(other, attr)
                   for attr in self._COMPARED_FIELDS)
257 |
258 |
class JavaObject(object):
    """
    Represents a deserialized non-primitive Java object
    """
    def __init__(self):
        """
        Sets up members
        """
        # JavaClass describing this instance (filled in by the unmarshaller)
        self.classdesc = None
        # Objects read from the objectAnnotation section of the stream
        self.annotations = []

    def get_class(self):
        """
        Returns the JavaClass that defines the type of this object
        """
        return self.classdesc

    def __str__(self):
        """
        String representation
        """
        return self.__repr__()

    def __repr__(self):
        """
        String representation
        """
        name = "UNKNOWN"
        if self.classdesc:
            name = self.classdesc.name
        # FIX: the format string was empty ("".format(name)), so every
        # JavaObject printed as an empty string; restore a useful repr.
        return "<javaobj:{0}>".format(name)

    def __eq__(self, other):
        """
        Equality test between two Java objects

        :param other: Other JavaObject to test
        :return: True if both objects share the same class description,
                 annotations and field values
        """
        if not isinstance(other, type(self)):
            return False

        res = (self.classdesc == other.classdesc and
               self.annotations == other.annotations)
        if not res:
            return False

        # FIX: guard against a None classdesc -- previously comparing two
        # bare JavaObject instances raised AttributeError on None
        if self.classdesc is not None:
            for name in self.classdesc.fields_names:
                if not (getattr(self, name) == getattr(other, name)):
                    return False
        return True
310 |
311 |
class JavaString(str):
    """
    Represents a Java String (a plain str that only equals other strings)
    """
    # Defining __eq__ would otherwise clear the inherited hash
    __hash__ = str.__hash__

    def __eq__(self, other):
        # Only ever equal to another str instance
        return isinstance(other, str) and str.__eq__(self, other)
323 |
324 |
class JavaEnum(JavaObject):
    """
    Represents a Java enumeration (one enum constant)
    """
    def __init__(self, constant=None):
        """
        :param constant: Value of the enumeration constant
        """
        JavaObject.__init__(self)
        self.constant = constant
332 |
333 |
class JavaArray(list, JavaObject):
    """
    Represents a Java Array: a list that also carries its class description
    """
    def __init__(self, classdesc=None):
        """
        :param classdesc: Class description of the array content
        """
        # Initialize both bases explicitly (list first, as in the MRO)
        list.__init__(self)
        JavaObject.__init__(self)
        self.classdesc = classdesc
342 |
343 | # ------------------------------------------------------------------------------
344 |
345 |
class JavaObjectConstants(object):
    """
    Defines the constants of the Java serialization format
    """
    # Stream header: every serialized stream starts with these two shorts
    STREAM_MAGIC = 0xaced
    STREAM_VERSION = 0x05

    # Terminal symbols (opcodes) of the serialization grammar
    TC_NULL = 0x70
    TC_REFERENCE = 0x71
    TC_CLASSDESC = 0x72
    TC_OBJECT = 0x73
    TC_STRING = 0x74
    TC_ARRAY = 0x75
    TC_CLASS = 0x76
    TC_BLOCKDATA = 0x77
    TC_ENDBLOCKDATA = 0x78
    TC_RESET = 0x79
    TC_BLOCKDATALONG = 0x7A
    TC_EXCEPTION = 0x7B
    TC_LONGSTRING = 0x7C
    TC_PROXYCLASSDESC = 0x7D
    TC_ENUM = 0x7E
    # Ignore TC_MAX: we don't use it and it messes with TC_ENUM
    # TC_MAX = 0x7E

    # classDescFlags (bit flags, combined in a single byte)
    SC_WRITE_METHOD = 0x01  # if SC_SERIALIZABLE
    SC_BLOCK_DATA = 0x08  # if SC_EXTERNALIZABLE
    SC_SERIALIZABLE = 0x02
    SC_EXTERNALIZABLE = 0x04
    SC_ENUM = 0x10

    # type definition chars (typecode)
    TYPE_BYTE = 'B'  # 0x42
    TYPE_CHAR = 'C'  # 0x43
    TYPE_DOUBLE = 'D'  # 0x44
    TYPE_FLOAT = 'F'  # 0x46
    TYPE_INTEGER = 'I'  # 0x49
    TYPE_LONG = 'J'  # 0x4A
    TYPE_SHORT = 'S'  # 0x53
    TYPE_BOOLEAN = 'Z'  # 0x5A
    TYPE_OBJECT = 'L'  # 0x4C
    TYPE_ARRAY = '['  # 0x5B

    # list of supported typecodes listed above
    TYPECODES_LIST = [
        # primitive types
        TYPE_BYTE,
        TYPE_CHAR,
        TYPE_DOUBLE,
        TYPE_FLOAT,
        TYPE_INTEGER,
        TYPE_LONG,
        TYPE_SHORT,
        TYPE_BOOLEAN,
        # object types
        TYPE_OBJECT,
        TYPE_ARRAY]

    # First handle number assigned to a newly read object/class/string
    BASE_REFERENCE_IDX = 0x7E0000
406 |
407 |
class OpCodeDebug(object):
    """
    Helpers converting opcode, type-code and flag values to the names of the
    JavaObjectConstants attributes, for debug logs and error messages.
    """
    # Opcode value -> "TC_*" constant name
    OP_CODE = dict((getattr(JavaObjectConstants, key), key)
                   for key in dir(JavaObjectConstants)
                   if key.startswith("TC_"))

    # Type letter -> "TYPE_*" constant name
    TYPE = dict((getattr(JavaObjectConstants, key), key)
                for key in dir(JavaObjectConstants)
                if key.startswith("TYPE_"))

    # classDescFlags bit -> "SC_*" constant name
    STREAM_CONSTANT = dict((getattr(JavaObjectConstants, key), key)
                           for key in dir(JavaObjectConstants)
                           if key.startswith("SC_"))

    @staticmethod
    def op_id(op_id):
        """
        Returns the TC_* name of the given opcode, or a placeholder for
        unknown values.
        """
        # FIX: the fallback was "".format(op_id) -- an empty string with an
        # unused argument -- so unknown opcodes rendered as "" in messages
        return OpCodeDebug.OP_CODE.get(
            op_id, "<unknown 0x{0:X}>".format(op_id))

    @staticmethod
    def type_code(type_id):
        """
        Returns the TYPE_* name of the given type code, or a placeholder
        for unknown values.
        """
        # FIX: same empty-format-string issue; type_id is a one-character
        # string, hence !r instead of a hex format
        return OpCodeDebug.TYPE.get(
            type_id, "<unknown {0!r}>".format(type_id))

    @staticmethod
    def flags(flags):
        """
        Returns a comma-separated list of the SC_* flag names set in the
        given classDescFlags byte.
        """
        names = sorted(
            descr for key, descr in OpCodeDebug.STREAM_CONSTANT.items()
            if key & flags)
        return ', '.join(names)
438 |
439 |
440 | # ------------------------------------------------------------------------------
441 |
442 |
443 | class JavaObjectUnmarshaller(JavaObjectConstants):
444 | """
445 | Deserializes a Java serialization stream
446 | """
    def __init__(self, stream, bytes_callback=None):
        """
        Sets up members and validates the stream header.

        :param stream: An input stream (opened in binary/bytes mode)
        :param bytes_callback: Optional callback kept on the instance
               (NOTE(review): its semantics are defined by code outside
               this view -- confirm before relying on it)
        :raise IOError: Invalid input stream
        """
        self.bytes_callback = bytes_callback

        # Check stream
        if stream is None:
            raise IOError("No input stream given")

        # Prepare the association Terminal Symbol -> Reading method
        self.opmap = {
            self.TC_NULL: self.do_null,
            self.TC_CLASSDESC: self.do_classdesc,
            self.TC_OBJECT: self.do_object,
            self.TC_STRING: self.do_string,
            self.TC_LONGSTRING: self.do_string_long,
            self.TC_ARRAY: self.do_array,
            self.TC_CLASS: self.do_class,
            self.TC_BLOCKDATA: self.do_blockdata,
            self.TC_BLOCKDATALONG: self.do_blockdata_long,
            self.TC_REFERENCE: self.do_reference,
            self.TC_ENUM: self.do_enum,
            # note that we are reusing do_null:
            # TC_ENDBLOCKDATA carries no payload either
            self.TC_ENDBLOCKDATA: self.do_null,
        }

        # Set up members
        self.current_object = None
        self.reference_counter = 0
        self.references = []
        self.object_transformers = []
        self.object_stream = stream

        # Read the stream header (magic & version); raises IOError if the
        # stream is not a Java serialization stream
        self._readStreamHeader()
486 |
487 | def readObject(self, ignore_remaining_data=False):
488 | """
489 | Reads an object from the input stream
490 |
491 | :param ignore_remaining_data: If True, don't log an error when
492 | unused trailing bytes are remaining
493 | :return: The unmarshalled object
494 | :raise Exception: Any exception that occurred during unmarshalling
495 | """
496 | try:
497 | # TODO: add expects
498 | _, res = self._read_and_exec_opcode(ident=0)
499 | log_debug("Java Object unmarshalled successfully!")
500 | return res
501 | except Exception:
502 | self._oops_dump_state(ignore_remaining_data)
503 | raise
504 |
    def add_transformer(self, transformer):
        """
        Appends an object transformer to the deserialization process.
        Transformers are consulted in registration order by do_object().

        :param transformer: An object with a transform(obj) method
        """
        self.object_transformers.append(transformer)
512 |
513 | def _readStreamHeader(self):
514 | """
515 | Reads the magic header of a Java serialization stream
516 |
517 | :raise IOError: Invalid magic header (not a Java stream)
518 | """
519 | (magic, version) = self._readStruct(">HH")
520 | if magic != self.STREAM_MAGIC or version != self.STREAM_VERSION:
521 | raise IOError("The stream is not java serialized object. "
522 | "Invalid stream header: {0:04X}{1:04X}"
523 | .format(magic, version))
524 |
525 | def _read_and_exec_opcode(self, ident=0, expect=None):
526 | """
527 | Reads the next opcode, and executes its handler
528 |
529 | :param ident: Log identation level
530 | :param expect: A list of expected opcodes
531 | :return: A tuple: (opcode, result of the handler)
532 | :raise IOError: Read opcode is not one of the expected ones
533 | :raise RuntimeError: Unknown opcode
534 | """
535 | position = self.object_stream.tell()
536 | (opid,) = self._readStruct(">B")
537 | log_debug("OpCode: 0x{0:X} -- {1} (at offset 0x{2:X})"
538 | .format(opid, OpCodeDebug.op_id(opid), position), ident)
539 |
540 | if expect and opid not in expect:
541 | raise IOError(
542 | "Unexpected opcode 0x{0:X} -- {1} (at offset 0x{2:X})"
543 | .format(opid, OpCodeDebug.op_id(opid), position))
544 |
545 | try:
546 | handler = self.opmap[opid]
547 | except KeyError:
548 | raise RuntimeError(
549 | "Unknown OpCode in the stream: 0x{0:X} (at offset 0x{1:X})"
550 | .format(opid, position))
551 | else:
552 | return opid, handler(ident=ident)
553 |
554 | def _readStruct(self, unpack):
555 | """
556 | Reads from the input stream, using struct
557 |
558 | :param unpack: An unpack format string
559 | :return: The result of struct.unpack (tuple)
560 | :raise RuntimeError: End of stream reached during unpacking
561 | """
562 | length = struct.calcsize(unpack)
563 | ba = self.object_stream.read(length)
564 |
565 | if len(ba) != length:
566 | raise RuntimeError("Stream has been ended unexpectedly while "
567 | "unmarshaling.")
568 |
569 | return struct.unpack(unpack, ba)
570 |
571 | def _readString(self, length_fmt="H"):
572 | """
573 | Reads a serialized string
574 |
575 | :param length_fmt: Structure format of the string length (H or Q)
576 | :return: The deserialized string
577 | :raise RuntimeError: Unexpected end of stream
578 | """
579 | (length,) = self._readStruct(">{0}".format(length_fmt))
580 | ba = self.object_stream.read(length)
581 | return to_str(ba)
582 |
    def do_classdesc(self, parent=None, ident=0):
        """
        Handles a TC_CLASSDESC opcode: reads a complete class description
        (name, serialVersionUID, flags and field list) from the stream.

        :param parent: Optional object on which the parsed field names and
               types are also stored
        :param ident: Log indentation level
        :return: A JavaClass object
        """
        # Grammar (from the Java Object Serialization spec):
        # TC_CLASSDESC className serialVersionUID newHandle classDescInfo
        # classDescInfo:
        # classDescFlags fields classAnnotation superClassDesc
        # classDescFlags:
        # (byte) // Defined in Terminal Symbols and Constants
        # fields:
        # (short) fieldDesc[count]

        # fieldDesc:
        # primitiveDesc
        # objectDesc
        # primitiveDesc:
        # prim_typecode fieldName
        # objectDesc:
        # obj_typecode fieldName className1
        clazz = JavaClass()
        log_debug("[classdesc]", ident)
        class_name = self._readString()
        clazz.name = class_name
        log_debug("Class name: %s" % class_name, ident)

        # serialVersionUID is a Java (signed) long => 8 bytes
        serialVersionUID, classDescFlags = self._readStruct(">qB")
        clazz.serialVersionUID = serialVersionUID
        clazz.flags = classDescFlags

        # newHandle: register the class before its fields are parsed, so
        # self-references inside the description resolve correctly
        self._add_reference(clazz, ident)

        log_debug("Serial: 0x{0:X} / {0:d} - classDescFlags: 0x{1:X} {2}"
                  .format(serialVersionUID, classDescFlags,
                          OpCodeDebug.flags(classDescFlags)), ident)
        (length,) = self._readStruct(">H")
        log_debug("Fields num: 0x{0:X}".format(length), ident)

        clazz.fields_names = []
        clazz.fields_types = []
        for fieldId in range(length):
            # Each field: one type-code byte, then the field name
            (typecode,) = self._readStruct(">B")
            field_name = self._readString()
            field_type = self._convert_char_to_type(typecode)

            log_debug("> Reading field {0}".format(field_name), ident)

            if field_type == self.TYPE_ARRAY:
                # Array fields carry the element class name as an extra
                # string (or a back-reference to one)
                _, field_type = self._read_and_exec_opcode(
                    ident=ident + 1,
                    expect=(self.TC_STRING, self.TC_REFERENCE))

                if type(field_type) is not JavaString:
                    raise AssertionError("Field type must be a JavaString, "
                                         "not {0}".format(type(field_type)))

            elif field_type == self.TYPE_OBJECT:
                # Object fields also carry their class name
                _, field_type = self._read_and_exec_opcode(
                    ident=ident + 1,
                    expect=(self.TC_STRING, self.TC_REFERENCE))

                if type(field_type) is JavaClass:
                    # FIXME: ugly trick
                    field_type = JavaString(field_type.name)

                if type(field_type) is not JavaString:
                    raise AssertionError("Field type must be a JavaString, "
                                         "not {0}".format(type(field_type)))

            log_debug("< FieldName: 0x{0:X} Name:{1} Type:{2} ID:{3}"
                      .format(typecode, field_name, field_type, fieldId),
                      ident)
            assert field_name is not None
            assert field_type is not None

            clazz.fields_names.append(field_name)
            clazz.fields_types.append(field_type)

        if parent:
            # NOTE(review): these attribute names are mangled to
            # _JavaObjectUnmarshaller__fields / __types on the target --
            # confirm this is intentional before relying on them
            parent.__fields = clazz.fields_names
            parent.__types = clazz.fields_types

        # classAnnotation
        (opid,) = self._readStruct(">B")
        log_debug("OpCode: 0x{0:X} -- {1} (classAnnotation)"
                  .format(opid, OpCodeDebug.op_id(opid)), ident)
        if opid != self.TC_ENDBLOCKDATA:
            raise NotImplementedError("classAnnotation isn't implemented yet")

        # superClassDesc
        log_debug("Reading Super Class of {0}".format(clazz.name), ident)
        _, superclassdesc = self._read_and_exec_opcode(
            ident=ident + 1,
            expect=(self.TC_CLASSDESC, self.TC_NULL, self.TC_REFERENCE))
        log_debug("Super Class for {0}: {1}"
                  .format(clazz.name, str(superclassdesc)), ident)
        clazz.superclass = superclassdesc
        return clazz
685 |
686 | def do_blockdata(self, parent=None, ident=0):
687 | """
688 | Handles TC_BLOCKDATA opcode
689 |
690 | :param parent:
691 | :param ident: Log indentation level
692 | :return: A string containing the block data
693 | """
694 | # TC_BLOCKDATA (unsigned byte) (byte)[size]
695 | log_debug("[blockdata]", ident)
696 | (length,) = self._readStruct(">B")
697 | ba = self.object_stream.read(length)
698 |
699 | # Ensure we have an str
700 | return read_to_str(ba)
701 |
702 | def do_blockdata_long(self, parent=None, ident=0):
703 | """
704 | Handles TC_BLOCKDATALONG opcode
705 |
706 | :param parent:
707 | :param ident: Log indentation level
708 | :return: A string containing the block data
709 | """
710 | # TC_BLOCKDATALONG (int) (byte)[size]
711 | log_debug("[blockdatalong]", ident)
712 | (length,) = self._readStruct(">I")
713 | ba = self.object_stream.read(length)
714 |
715 | # Ensure we have an str
716 | return read_to_str(ba)
717 |
718 | def do_class(self, parent=None, ident=0):
719 | """
720 | Handles TC_CLASS opcode
721 |
722 | :param parent:
723 | :param ident: Log indentation level
724 | :return: A JavaClass object
725 | """
726 | # TC_CLASS classDesc newHandle
727 | log_debug("[class]", ident)
728 |
729 | # TODO: what to do with "(ClassDesc)prevObject".
730 | # (see 3rd line for classDesc:)
731 | _, classdesc = self._read_and_exec_opcode(
732 | ident=ident + 1,
733 | expect=(self.TC_CLASSDESC, self.TC_PROXYCLASSDESC,
734 | self.TC_NULL, self.TC_REFERENCE))
735 | log_debug("Classdesc: {0}".format(classdesc), ident)
736 | self._add_reference(classdesc, ident)
737 | return classdesc
738 |
    def do_object(self, parent=None, ident=0):
        """
        Handles a TC_OBJECT opcode: reads a full object (class description,
        field values and optional object annotations).

        :param parent:
        :param ident: Log indentation level
        :return: The deserialized object (a JavaObject, or whatever a
                 registered transformer's create() returned)
        """
        # TC_OBJECT classDesc newHandle classdata[] // data for each class
        java_object = JavaObject()
        log_debug("[object]", ident)
        log_debug("java_object.annotations just after instantiation: {0}"
                  .format(java_object.annotations), ident)

        # TODO: what to do with "(ClassDesc)prevObject".
        # (see 3rd line for classDesc:)
        opcode, classdesc = self._read_and_exec_opcode(
            ident=ident + 1,
            expect=(self.TC_CLASSDESC, self.TC_PROXYCLASSDESC,
                    self.TC_NULL, self.TC_REFERENCE))
        # self.TC_REFERENCE hasn't shown in spec, but actually is here

        # Create object: the first transformer returning a truthy value wins.
        # NOTE(review): if every transformer returned a falsy value,
        # java_object would end up bound to the last create() result --
        # presumably DefaultObjectTransformer always returns an object;
        # confirm.
        for transformer in self.object_transformers:
            java_object = transformer.create(classdesc)
            if java_object:
                break

        # Store classdesc of this object
        java_object.classdesc = classdesc

        # Store the reference (newHandle)
        self._add_reference(java_object, ident)

        # classdata[]

        if classdesc.flags & self.SC_EXTERNALIZABLE \
                and not classdesc.flags & self.SC_BLOCK_DATA:
            # TODO:
            raise NotImplementedError("externalContents isn't implemented yet")

        if classdesc.flags & self.SC_SERIALIZABLE:
            # TODO: look at ObjectInputStream.readSerialData()
            # FIXME: Handle the SC_WRITE_METHOD flag

            # create megalist: field names/types of the whole hierarchy,
            # superclass fields first (walking up, prepending each level)
            tempclass = classdesc
            megalist = []
            megatypes = []
            log_debug("Constructing class...", ident)
            while tempclass:
                log_debug("Class: {0}".format(tempclass.name), ident + 1)
                class_fields_str = ' - '.join(
                    ' '.join((field_type, field_name))
                    for field_type, field_name
                    in zip(tempclass.fields_types, tempclass.fields_names))
                if class_fields_str:
                    log_debug(class_fields_str, ident + 2)

                fieldscopy = tempclass.fields_names[:]
                fieldscopy.extend(megalist)
                megalist = fieldscopy

                fieldscopy = tempclass.fields_types[:]
                fieldscopy.extend(megatypes)
                megatypes = fieldscopy

                tempclass = tempclass.superclass

            log_debug("Values count: {0}".format(len(megalist)), ident)
            log_debug("Prepared list of values: {0}".format(megalist), ident)
            log_debug("Prepared list of types: {0}".format(megatypes), ident)

            # Read each field value in declaration order
            for field_name, field_type in zip(megalist, megatypes):
                log_debug("Reading field: {0} - {1}"
                          .format(field_type, field_name))
                res = self._read_value(field_type, ident, name=field_name)
                java_object.__setattr__(field_name, res)

        # objectAnnotation is present when the class wrote custom data
        # (writeObject / writeExternal with block data)
        if classdesc.flags & self.SC_SERIALIZABLE \
                and classdesc.flags & self.SC_WRITE_METHOD \
                or classdesc.flags & self.SC_EXTERNALIZABLE \
                and classdesc.flags & self.SC_BLOCK_DATA:
            # objectAnnotation
            log_debug("java_object.annotations before: {0}"
                      .format(java_object.annotations), ident)

            while opcode != self.TC_ENDBLOCKDATA:
                opcode, obj = self._read_and_exec_opcode(ident=ident + 1)
                # , expect=[self.TC_ENDBLOCKDATA, self.TC_BLOCKDATA,
                # self.TC_OBJECT, self.TC_NULL, self.TC_REFERENCE])
                if opcode != self.TC_ENDBLOCKDATA:
                    java_object.annotations.append(obj)

                log_debug("objectAnnotation value: {0}".format(obj), ident)

            log_debug("java_object.annotations after: {0}"
                      .format(java_object.annotations), ident)

        log_debug(">>> java_object: {0}".format(java_object), ident)
        return java_object
840 |
841 | def do_string(self, parent=None, ident=0):
842 | """
843 | Handles a TC_STRING opcode
844 |
845 | :param parent:
846 | :param ident: Log indentation level
847 | :return: A string
848 | """
849 | log_debug("[string]", ident)
850 | ba = JavaString(self._readString())
851 | self._add_reference(ba, ident)
852 | return ba
853 |
854 | def do_string_long(self, parent=None, ident=0):
855 | """
856 | Handles a TC_LONGSTRING opcode
857 |
858 | :param parent:
859 | :param ident: Log indentation level
860 | :return: A string
861 | """
862 | log_debug("[long string]", ident)
863 | ba = JavaString(self._readString("Q"))
864 | self._add_reference(ba, ident)
865 | return ba
866 |
    def do_array(self, parent=None, ident=0):
        """
        Handles a TC_ARRAY opcode

        :param parent:
        :param ident: Log indentation level
        :return: A list of deserialized objects
        """
        # TC_ARRAY classDesc newHandle (int) values[size]
        log_debug("[array]", ident)
        # Read the class description of the array (or a back-reference to it)
        _, classdesc = self._read_and_exec_opcode(
            ident=ident + 1,
            expect=(self.TC_CLASSDESC, self.TC_PROXYCLASSDESC,
                    self.TC_NULL, self.TC_REFERENCE))

        array = JavaArray(classdesc)

        # The array is a new handle: it must be registered before its
        # content is read (elements may reference it)
        self._add_reference(array, ident)

        # Number of elements: signed 32-bit big-endian integer
        (size,) = self._readStruct(">i")
        log_debug("size: {0}".format(size), ident)

        # The class name encodes the element type: first character is the
        # array marker (asserted below), second is the element typecode
        type_char = classdesc.name[0]
        assert type_char == self.TYPE_ARRAY
        type_char = classdesc.name[1]

        if type_char == self.TYPE_OBJECT or type_char == self.TYPE_ARRAY:
            # Object/array elements: each one is a full serialized entry
            for _ in range(size):
                _, res = self._read_and_exec_opcode(ident=ident + 1)
                log_debug("Object value: {0}".format(res), ident)
                array.append(res)
        elif type_char == self.TYPE_BYTE:
            # Byte arrays are read in a single shot; NOTE: the result then
            # replaces the JavaArray with raw bytes (or whatever the
            # user-provided bytes_callback returns)
            if self.bytes_callback is not None:
                array = self.bytes_callback(self.object_stream, size)
            else:
                array = self.object_stream.read(size)
        else:
            # Primitive elements: read each value by its typecode
            for _ in range(size):
                res = self._read_value(type_char, ident)
                log_debug("Native value: {0}".format(res), ident)
                array.append(res)

        return array
910 |
911 | def do_reference(self, parent=None, ident=0):
912 | """
913 | Handles a TC_REFERENCE opcode
914 |
915 | :param parent:
916 | :param ident: Log indentation level
917 | :return: The referenced object
918 | """
919 | (handle,) = self._readStruct(">L")
920 | log_debug("## Reference handle: 0x{0:X}".format(handle), ident)
921 | ref = self.references[handle - self.BASE_REFERENCE_IDX]
922 | log_debug("###-> Type: {0} - Value: {1}".format(type(ref), ref), ident)
923 | return ref
924 |
925 | @staticmethod
926 | def do_null(parent=None, ident=0):
927 | """
928 | Handles a TC_NULL opcode
929 |
930 | :param parent:
931 | :param ident: Log indentation level
932 | :return: Always None
933 | """
934 | return None
935 |
936 | def do_enum(self, parent=None, ident=0):
937 | """
938 | Handles a TC_ENUM opcode
939 |
940 | :param parent:
941 | :param ident: Log indentation level
942 | :return: A JavaEnum object
943 | """
944 | # TC_ENUM classDesc newHandle enumConstantName
945 | enum = JavaEnum()
946 | _, classdesc = self._read_and_exec_opcode(
947 | ident=ident + 1,
948 | expect=(self.TC_CLASSDESC, self.TC_PROXYCLASSDESC,
949 | self.TC_NULL, self.TC_REFERENCE))
950 | enum.classdesc = classdesc
951 | self._add_reference(enum, ident)
952 | _, enumConstantName = self._read_and_exec_opcode(
953 | ident=ident + 1, expect=(self.TC_STRING, self.TC_REFERENCE))
954 | enum.constant = enumConstantName
955 | return enum
956 |
    @staticmethod
    def _create_hexdump(src, start_offset=0, length=16):
        """
        Prepares an hexadecimal dump string

        :param src: A string containing binary data
        :param start_offset: The start offset of the source
        :param length: Length of a dump line
        :return: A dump string
        """
        # Translation table: keep printable characters (those whose repr()
        # is exactly 3 chars, i.e. 'x'), replace everything else by a dot.
        # str.translate() accepts this 256-char string as a lookup table
        # indexed by each character's ordinal.
        FILTER = ''.join((len(repr(chr(x))) == 3) and chr(x) or '.'
                         for x in range(256))
        # One line: "<offset:04X> <hex bytes, 3 columns per byte> <printable>"
        pattern = "{{0:04X}} {{1:<{0}}} {{2}}\n".format(length * 3)

        # Convert raw data to str (Python 3 compatibility)
        # latin-1 maps byte N to code point N, so ord(x) below is always <256
        src = to_str(src, 'latin-1')

        result = []
        for i in range(0, len(src), length):
            s = src[i:i + length]
            hexa = ' '.join("{0:02X}".format(ord(x)) for x in s)
            printable = s.translate(FILTER)
            result.append(pattern.format(i + start_offset, hexa, printable))

        return ''.join(result)
982 |
    def _read_value(self, field_type, ident, name=""):
        """
        Reads the next value, of the given type

        :param field_type: A serialization typecode
        :param ident: Log indentation
        :param name: Field name (for logs)
        :return: The read value
        :raise RuntimeError: Unknown field type
        """
        if len(field_type) > 1:
            # We don't need details for arrays and objects:
            # only the leading typecode character matters
            field_type = field_type[0]

        # All primitive values are big-endian on the wire
        if field_type == self.TYPE_BOOLEAN:
            (val,) = self._readStruct(">B")
            res = bool(val)
        elif field_type == self.TYPE_BYTE:
            (res,) = self._readStruct(">b")
        elif field_type == self.TYPE_CHAR:
            (res,) = self._readStruct(">c")
        elif field_type == self.TYPE_SHORT:
            (res,) = self._readStruct(">h")
        elif field_type == self.TYPE_INTEGER:
            (res,) = self._readStruct(">i")
        elif field_type == self.TYPE_LONG:
            (res,) = self._readStruct(">q")
        elif field_type == self.TYPE_FLOAT:
            (res,) = self._readStruct(">f")
        elif field_type == self.TYPE_DOUBLE:
            (res,) = self._readStruct(">d")
        elif field_type == self.TYPE_OBJECT or field_type == self.TYPE_ARRAY:
            # Nested object or array: recurse through the opcode handlers
            _, res = self._read_and_exec_opcode(ident=ident + 1)
        else:
            raise RuntimeError("Unknown typecode: {0}".format(field_type))

        log_debug("* {0} {1}: {2}".format(field_type, name, res), ident)
        return res
1021 |
1022 | def _convert_char_to_type(self, type_char):
1023 | """
1024 | Ensures a read character is a typecode.
1025 |
1026 | :param type_char: Read typecode
1027 | :return: The typecode as a string (using chr)
1028 | :raise RuntimeError: Unknown typecode
1029 | """
1030 | typecode = type_char
1031 | if type(type_char) is int:
1032 | typecode = chr(type_char)
1033 |
1034 | if typecode in self.TYPECODES_LIST:
1035 | return typecode
1036 | else:
1037 | raise RuntimeError("Typecode {0} ({1}) isn't supported."
1038 | .format(type_char, typecode))
1039 |
1040 | def _add_reference(self, obj, ident=0):
1041 | """
1042 | Adds a read reference to the marshaler storage
1043 |
1044 | :param obj: Reference to add
1045 | :param ident: Log indentation level
1046 | """
1047 | log_debug("## New reference handle 0x{0:X}: {1} -> {2}"
1048 | .format(len(self.references) + self.BASE_REFERENCE_IDX,
1049 | type(obj).__name__, obj), ident)
1050 | self.references.append(obj)
1051 |
    def _oops_dump_state(self, ignore_remaining_data=False):
        """
        Log a deserialization error

        :param ignore_remaining_data: If True, don't log an error when
                                      unused trailing bytes are remaining
        """
        log_error("==Oops state dump" + "=" * (30 - 17))
        log_error("References: {0}".format(self.references))
        log_error("Stream seeking back at -16 byte (2nd line is an actual "
                  "position!):")

        # Do not use a keyword argument
        # NOTE(review): presumably some file-like objects only accept the
        # "whence" argument positionally — confirm before changing
        self.object_stream.seek(-16, os.SEEK_CUR)
        position = self.object_stream.tell()
        # Read the remainder of the stream to show it in the dump
        the_rest = self.object_stream.read()

        if not ignore_remaining_data and len(the_rest):
            log_error("Warning!!!!: Stream still has {0} bytes left."
                      .format(len(the_rest)))
            log_error(self._create_hexdump(the_rest, position))

        log_error("=" * 30)
1075 |
1076 | # ------------------------------------------------------------------------------
1077 |
1078 |
class JavaObjectMarshaller(JavaObjectConstants):
    """
    Serializes objects into Java serialization format
    """
    def __init__(self, stream=None):
        """
        Sets up members

        :param stream: An output stream
        """
        self.object_stream = stream
        self.object_obj = None
        self.object_transformers = []
        self.references = []

    def add_transformer(self, transformer):
        """
        Appends an object transformer to the serialization process

        :param transformer: An object with a transform(obj) method
        """
        self.object_transformers.append(transformer)

    def dump(self, obj):
        """
        Dumps the given object in the Java serialization format

        :param obj: The object to serialize
        :return: The serialized content, as bytes
        """
        # References are stream-scoped: start from a clean table
        self.references = []
        self.object_obj = obj
        self.object_stream = BytesIO()
        self._writeStreamHeader()
        self.writeObject(obj)
        return self.object_stream.getvalue()

    def _writeStreamHeader(self):
        """
        Writes the Java serialization magic header in the serialization stream
        """
        self._writeStruct(">HH", 4, (self.STREAM_MAGIC, self.STREAM_VERSION))

    def writeObject(self, obj):
        """
        Appends an object to the serialization stream

        :param obj: A string, bytes or a deserialized Java object
        :raise RuntimeError: Unsupported type
        """
        log_debug("Writing object of type {0}".format(type(obj).__name__))
        if isinstance(obj, JavaArray):
            # Deserialized Java array
            self.write_array(obj)
        elif isinstance(obj, JavaEnum):
            # Deserialized Java Enum
            self.write_enum(obj)
        elif isinstance(obj, JavaObject):
            # Deserialized Java object
            self.write_object(obj)
        elif isinstance(obj, JavaString):
            # Deserialized String
            self.write_string(obj)
        elif isinstance(obj, JavaClass):
            # Java class
            self.write_class(obj)
        elif obj is None:
            # Null
            self.write_null()
        elif type(obj) is str:
            # String value
            self.write_blockdata(obj)
        elif type(obj) is bytes:
            # FIX: raw bytes used to be rejected on Python 3, although
            # write_blockdata handles them natively (on Python 2, bytes is
            # str and this branch is simply unreachable)
            self.write_blockdata(obj)
        else:
            # Unhandled type
            raise RuntimeError("Object serialization of type {0} is not "
                               "supported.".format(type(obj)))

    def _writeStruct(self, unpack, length, args):
        """
        Appends packed data to the serialization stream

        :param unpack: Struct format string
        :param length: Unused (kept for signature compatibility)
        :param args: Struct arguments
        """
        ba = struct.pack(unpack, *args)
        self.object_stream.write(ba)

    def _writeString(self, obj, use_reference=True):
        """
        Appends a string to the serialization stream
        (16-bit length prefix, then the UTF-8 bytes)

        :param obj: String to serialize
        :param use_reference: If True, allow writing a reference
        """
        # TODO: Convert to "modified UTF-8"
        # http://docs.oracle.com/javase/7/docs/api/java/io/DataInput.html#modified-utf-8
        string = to_bytes(obj, "utf-8")

        if use_reference and isinstance(obj, JavaString):
            try:
                idx = self.references.index(obj)
            except ValueError:
                # First appearance of the string
                self.references.append(obj)
                logging.debug(
                    "*** Adding ref 0x%X for string: %s",
                    len(self.references) - 1 + self.BASE_REFERENCE_IDX, obj)

                self._writeStruct(">H", 2, (len(string),))
                self.object_stream.write(string)
            else:
                # Write a reference to the previous type
                logging.debug("*** Reusing ref 0x%X for string: %s",
                              idx + self.BASE_REFERENCE_IDX, obj)
                self.write_reference(idx)
        else:
            # Plain strings are never shared
            self._writeStruct(">H", 2, (len(string),))
            self.object_stream.write(string)

    def write_string(self, obj, use_reference=True):
        """
        Writes a Java string with the TC_STRING type marker

        :param obj: The string to print
        :param use_reference: If True, allow writing a reference
        """
        if use_reference and isinstance(obj, JavaString):
            try:
                idx = self.references.index(obj)
            except ValueError:
                # String is not referenced: let _writeString store it
                self._writeStruct(">B", 1, (self.TC_STRING,))
                self._writeString(obj, use_reference)
            else:
                # Reuse the referenced string
                logging.debug("*** Reusing ref 0x%X for String: %s",
                              idx + self.BASE_REFERENCE_IDX, obj)
                self.write_reference(idx)
        else:
            # Don't use references
            self._writeStruct(">B", 1, (self.TC_STRING,))
            self._writeString(obj, use_reference)

    def write_enum(self, obj):
        """
        Writes an Enum value
        (TC_ENUM classDesc newHandle enumConstantName)

        :param obj: A JavaEnum object
        """
        # FIXME: the output doesn't have the same references as the real
        # serializable form
        self._writeStruct(">B", 1, (self.TC_ENUM,))

        try:
            idx = self.references.index(obj)
        except ValueError:
            # New reference
            self.references.append(obj)
            logging.debug(
                "*** Adding ref 0x%X for enum: %s",
                len(self.references) - 1 + self.BASE_REFERENCE_IDX, obj)

            self.write_classdesc(obj.get_class())
        else:
            self.write_reference(idx)

        self.write_string(obj.constant)

    def write_blockdata(self, obj, parent=None):
        """
        Appends a block of data to the serialization stream

        :param obj: String or bytes form of the data block
        :param parent:
        """
        if type(obj) is str:
            # Latin-1: keep bytes as is
            obj = to_bytes(obj, "latin-1")

        length = len(obj)
        if length <= 255:
            # Small block data
            # TC_BLOCKDATA (unsigned byte) (byte)[size]
            # FIX: the size field is a single unsigned byte (max 255); the
            # previous "length <= 256" test let a 256-byte block through,
            # which made struct.pack(">B", 256) raise struct.error
            self._writeStruct(">B", 1, (self.TC_BLOCKDATA,))
            self._writeStruct(">B", 1, (length,))
        else:
            # Large block data
            # TC_BLOCKDATALONG (unsigned int) (byte)[size]
            self._writeStruct(">B", 1, (self.TC_BLOCKDATALONG,))
            self._writeStruct(">I", 1, (length,))

        self.object_stream.write(obj)

    def write_null(self):
        """
        Writes a "null" value (a single TC_NULL marker)
        """
        self._writeStruct(">B", 1, (self.TC_NULL,))

    def write_object(self, obj, parent=None):
        """
        Writes an object header and its fields to the serialization stream

        :param obj: The JavaObject to serialize
        :param parent: Not yet used
        """
        # Transform object: the first transformer that returns a different
        # object wins
        for transformer in self.object_transformers:
            tmp_object = transformer.transform(obj)
            if tmp_object is not obj:
                obj = tmp_object
                break

        self._writeStruct(">B", 1, (self.TC_OBJECT,))
        cls = obj.get_class()
        self.write_classdesc(cls)

        # Add reference
        # NOTE(review): an empty list is appended as a placeholder for the
        # object's handle — presumably to keep handle numbering aligned with
        # the reader; confirm before changing
        self.references.append([])
        logging.debug(
            "*** Adding ref 0x%X for object %s",
            len(self.references) - 1 + self.BASE_REFERENCE_IDX, obj)

        # Collect the fields of the whole class hierarchy,
        # most-ancestral class first
        all_names = collections.deque()
        all_types = collections.deque()
        tmpcls = cls
        while tmpcls:
            all_names.extendleft(reversed(tmpcls.fields_names))
            all_types.extendleft(reversed(tmpcls.fields_types))
            tmpcls = tmpcls.superclass
        del tmpcls

        logging.debug("<=> Field names: %s", all_names)
        logging.debug("<=> Field types: %s", all_types)

        for field_name, field_type in zip(all_names, all_types):
            try:
                logging.debug("Writing field %s (%s): %s",
                              field_name, field_type, getattr(obj, field_name))
                self._write_value(field_type, getattr(obj, field_name))
            except AttributeError as ex:
                log_error("No attribute {0} for object {1}\nDir: {2}"
                          .format(ex, repr(obj), dir(obj)))
                raise
        del all_names, all_types

        # Mirror the reader: classes with a writeObject()/writeExternal()
        # also carry annotations terminated by TC_ENDBLOCKDATA
        if cls.flags & self.SC_SERIALIZABLE \
                and cls.flags & self.SC_WRITE_METHOD \
                or cls.flags & self.SC_EXTERNALIZABLE \
                and cls.flags & self.SC_BLOCK_DATA:
            for annotation in obj.annotations:
                log_debug("Write annotation {0} for {1}"
                          .format(repr(annotation), repr(obj)))
                if annotation is None:
                    self.write_null()
                else:
                    self.writeObject(annotation)
            self._writeStruct('>B', 1, (self.TC_ENDBLOCKDATA,))

    def write_class(self, obj, parent=None):
        """
        Writes a class to the stream (TC_CLASS classDesc)

        :param obj: A JavaClass object
        :param parent:
        """
        self._writeStruct(">B", 1, (self.TC_CLASS,))
        self.write_classdesc(obj)

    def write_classdesc(self, obj, parent=None):
        """
        Writes a class description, or a reference to an already-written one

        :param obj: Class description to write
        :param parent:
        """
        if obj not in self.references:
            # Add reference
            self.references.append(obj)
            logging.debug(
                "*** Adding ref 0x%X for classdesc %s",
                len(self.references) - 1 + self.BASE_REFERENCE_IDX, obj.name)

            # TC_CLASSDESC className serialVersionUID classDescFlags
            self._writeStruct(">B", 1, (self.TC_CLASSDESC,))
            self._writeString(obj.name)
            self._writeStruct(">qB", 1, (obj.serialVersionUID, obj.flags))
            self._writeStruct(">H", 1, (len(obj.fields_names),))

            for field_name, field_type \
                    in zip(obj.fields_names, obj.fields_types):
                self._writeStruct(
                    ">B", 1, (self._convert_type_to_char(field_type),))
                self._writeString(field_name)
                if field_type[0] in (self.TYPE_OBJECT, self.TYPE_ARRAY):
                    # Object/array fields also carry their type signature,
                    # which is itself shareable through references
                    try:
                        idx = self.references.index(field_type)
                    except ValueError:
                        # First appearance of the type
                        self.references.append(field_type)
                        logging.debug(
                            "*** Adding ref 0x%X for field type %s",
                            len(self.references) - 1 + self.BASE_REFERENCE_IDX,
                            field_type)

                        self.write_string(field_type, False)
                    else:
                        # Write a reference to the previous type
                        logging.debug("*** Reusing ref 0x%X for %s (%s)",
                                      idx + self.BASE_REFERENCE_IDX,
                                      field_type, field_name)
                        self.write_reference(idx)

            # End of class annotations, then the superclass description
            self._writeStruct(">B", 1, (self.TC_ENDBLOCKDATA,))
            if obj.superclass:
                self.write_classdesc(obj.superclass)
            else:
                self.write_null()
        else:
            # Use reference
            self.write_reference(self.references.index(obj))

    def write_reference(self, ref_index):
        """
        Writes a reference
        :param ref_index: Local index (0-based) to the reference
        """
        self._writeStruct(
            ">BL", 1, (self.TC_REFERENCE, ref_index + self.BASE_REFERENCE_IDX))

    def write_array(self, obj):
        """
        Writes a JavaArray
        (TC_ARRAY classDesc newHandle size values)

        :param obj: A JavaArray object
        """
        classdesc = obj.get_class()
        self._writeStruct(">B", 1, (self.TC_ARRAY,))
        self.write_classdesc(classdesc)
        self._writeStruct(">i", 1, (len(obj),))

        # Add reference
        self.references.append(obj)
        logging.debug(
            "*** Adding ref 0x%X for array []",
            len(self.references) - 1 + self.BASE_REFERENCE_IDX)

        # The class name encodes the element type (see do_array)
        type_char = classdesc.name[0]
        assert type_char == self.TYPE_ARRAY
        type_char = classdesc.name[1]

        if type_char == self.TYPE_OBJECT:
            for o in obj:
                self._write_value(classdesc.name[1:], o)
        elif type_char == self.TYPE_ARRAY:
            for a in obj:
                self.write_array(a)
        else:
            log_debug("Write array of type %s" % type_char)
            for v in obj:
                self._write_value(type_char, v)

    def _write_value(self, field_type, value):
        """
        Writes an item of an array or an object field

        :param field_type: Value type
        :param value: The value itself
        :raise RuntimeError: Unknown field type
        """
        if len(field_type) > 1:
            # We don't need details for arrays and objects
            field_type = field_type[0]

        if field_type == self.TYPE_BOOLEAN:
            self._writeStruct(">B", 1, (1 if value else 0,))
        elif field_type == self.TYPE_BYTE:
            self._writeStruct(">b", 1, (value,))
        elif field_type == self.TYPE_CHAR:
            # FIX: _read_value produces TYPE_CHAR values (">c"), but the
            # writer had no matching branch and raised on round-trips
            self._writeStruct(">c", 1, (value,))
        elif field_type == self.TYPE_SHORT:
            self._writeStruct(">h", 1, (value,))
        elif field_type == self.TYPE_INTEGER:
            self._writeStruct(">i", 1, (value,))
        elif field_type == self.TYPE_LONG:
            self._writeStruct(">q", 1, (value,))
        elif field_type == self.TYPE_FLOAT:
            self._writeStruct(">f", 1, (value,))
        elif field_type == self.TYPE_DOUBLE:
            self._writeStruct(">d", 1, (value,))
        elif field_type == self.TYPE_OBJECT or field_type == self.TYPE_ARRAY:
            if value is None:
                self.write_null()
            elif isinstance(value, JavaEnum):
                self.write_enum(value)
            elif isinstance(value, JavaArray):
                self.write_array(value)
            elif isinstance(value, JavaObject):
                self.write_object(value)
            elif isinstance(value, JavaString):
                self.write_string(value)
            elif isinstance(value, (str, bytes)):
                # bytes added for Python 3 symmetry with write_blockdata
                self.write_blockdata(value)
            else:
                raise RuntimeError("Unknown typecode: {0}".format(field_type))
        else:
            raise RuntimeError("Unknown typecode: {0}".format(field_type))

    def _convert_type_to_char(self, type_char):
        """
        Converts the given type code to an int

        :param type_char: A type code character
        :raise RuntimeError: Unknown typecode
        """
        typecode = type_char
        if type(type_char) is int:
            typecode = chr(type_char)

        if typecode in self.TYPECODES_LIST:
            return ord(typecode)
        elif len(typecode) > 1:
            # Full signatures: "Lsome.Class;" is an object, "[..." an array
            if typecode[0] == 'L':
                return ord(self.TYPE_OBJECT)
            elif typecode[0] == '[':
                return ord(self.TYPE_ARRAY)

        raise RuntimeError("Typecode {0} ({1}) isn't supported."
                           .format(type_char, typecode))
1500 |
1501 | # ------------------------------------------------------------------------------
1502 |
1503 |
class DefaultObjectTransformer(object):
    """
    Default transformer for the deserialized objects.
    Converts JavaObject objects to Python types (maps, lists, ...)
    """
    class JavaList(list, JavaObject):
        # A Python list that is also a JavaObject
        def __init__(self, *args, **kwargs):
            list.__init__(self, *args, **kwargs)
            JavaObject.__init__(self)

    class JavaMap(dict, JavaObject):
        # A Python dictionary that is also a JavaObject
        def __init__(self, *args, **kwargs):
            dict.__init__(self, *args, **kwargs)
            JavaObject.__init__(self)

    def create(self, classdesc):
        """
        Returns the Python object best matching the given class description

        :param classdesc: The description of the Java class being
                          deserialized
        :return: A JavaList for java.util.ArrayList/LinkedList, a JavaMap
                 for java.util.HashMap, or a plain JavaObject otherwise
        """
        # Known Java collection classes and their Python counterparts.
        # @serialData For lists, the length of the backing array is
        # emitted (int), followed by all of its elements (each an
        # Object) in the proper order
        python_types = {
            "java.util.ArrayList": self.JavaList,
            "java.util.LinkedList": self.JavaList,
            "java.util.HashMap": self.JavaMap,
        }

        try:
            factory = python_types[classdesc.name]
        except KeyError:
            # Return a JavaObject by default
            return JavaObject()

        log_debug("---")
        log_debug(classdesc.name)
        log_debug("---")

        java_object = factory()

        log_debug(">>> java_object: {0}".format(java_object))
        return java_object
1553 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup


# Packaging metadata for the sparkpickle distribution.
setup(
    name="sparkpickle",
    description="Provides functions for reading SequenceFile-s with Python "
                "pickles.",
    version="1.0.1",
    license="Apache 2.0",
    author="Vadim Markovtsev",
    author_email="vadim@sourced.tech",
    url="https://github.com/src-d/sparkpickle",
    download_url='https://github.com/src-d/sparkpickle',
    packages=["sparkpickle"],
    package_dir={"sparkpickle": "."},
    # FIX: the previous "exclude=['test.py']" keyword was removed — it is not
    # a valid setup() argument (it belongs to setuptools.find_packages()) and
    # only produced an "Unknown distribution option" warning.
    keywords=["spark", "pyspark", "hadoop", "rdd", "pickle"],
    install_requires=[],
    package_data={"": ["LICENSE", "README.md"]},
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Environment :: Console",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: Apache Software License",
        "Operating System :: POSIX",
        "Programming Language :: Python :: 2.7",
        "Programming Language :: Python :: 3.2",
        "Programming Language :: Python :: 3.3",
        "Programming Language :: Python :: 3.4",
        "Topic :: Software Development :: Libraries"
    ]
)
--------------------------------------------------------------------------------
/test.2.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/src-d/sparkpickle/648bf2e7bd9b79679d44a8d01dc796285e881114/test.2.bin
--------------------------------------------------------------------------------
/test.3.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/src-d/sparkpickle/648bf2e7bd9b79679d44a8d01dc796285e881114/test.3.bin
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import unittest
4 |
5 | import sparkpickle
6 |
7 |
class SparkPickleTests(unittest.TestCase):
    """Round-trip tests for the sparkpickle module."""
    def test_load(self):
        # The fixture depends on the interpreter major version
        # (test.2.bin for Python 2, test.3.bin for Python 3)
        with open(os.path.join(os.path.dirname(__file__),
                               "test.%d.bin" % sys.version_info[0]),
                  "rb") as fin:
            objs = sparkpickle.load(fin)
        self.assertEqual(objs, list(range(200)))
15 |
16 | if __name__ == "__main__":
17 | unittest.main()
18 |
--------------------------------------------------------------------------------