├── .gitignore ├── LICENSE ├── NOTICE ├── README.md ├── add_jar_to_build.sh ├── add_tgz_to_build.sh ├── examples ├── README.md ├── cache.py ├── callback.py ├── copy_data_to_hdfs.sh ├── joins.py ├── map_types.py ├── merge_streams.py ├── pagerank.py ├── pycascading_data │ ├── graph.txt │ ├── lhs.txt │ ├── repeats.txt │ ├── rhs.txt │ └── town.txt ├── python_fields.py ├── reduce.py ├── subassembly.py ├── total_sort.py ├── udf_contexts.py └── word_count.py ├── java ├── build.xml ├── dependencies.properties └── src │ └── com │ └── twitter │ └── pycascading │ ├── CascadingAggregatorWrapper.java │ ├── CascadingBaseOperationWrapper.java │ ├── CascadingBufferWrapper.java │ ├── CascadingFilterWrapper.java │ ├── CascadingFunctionWrapper.java │ ├── CascadingRecordProducerWrapper.java │ ├── Main.java │ ├── MetaScheme.java │ ├── PythonEnvironment.java │ ├── PythonObjectInputStream.java │ ├── PythonObjectOutputStream.java │ ├── SelectFields.java │ ├── SerializedPythonFunction.java │ ├── TemporaryHdfs.java │ ├── Util.java │ ├── bigintegerserialization │ ├── BigIntegerComparator.java │ ├── BigIntegerDeserializer.java │ ├── BigIntegerSerialization.java │ └── BigIntegerSerializer.java │ └── pythonserialization │ ├── PythonDeserializer.java │ ├── PythonSerialization.java │ └── PythonSerializer.java ├── local_run.sh ├── python └── pycascading │ ├── __init__.py │ ├── bootstrap.py │ ├── cogroup.py │ ├── decorators.py │ ├── each.py │ ├── every.py │ ├── helpers.py │ ├── init_module.py │ ├── native.py │ ├── operators.py │ ├── pipe.py │ ├── serializers.py │ └── tap.py └── remote_deploy.sh /.gitignore: -------------------------------------------------------------------------------- 1 | .gitignore 2 | build/* 3 | *.class 4 | *.jar 5 | *.pyc 6 | *~ 7 | examples/pycascading_data/out*/ 8 | examples/pycascading_data/maps/ 9 | examples/pycascading.cache/ 10 | .settings/ 11 | .classpath 12 | .project 13 | .pydevproject 14 | private/* 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 
30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | pycascading is a python wrapper for cascading. 2 | Copyright 2011 Twitter, Inc. 3 | 4 | This software has the follow third party dependencies: 5 | 6 | Jython 2.5.2 7 | http://www.jython.org/ 8 | Python Software Foundation License 2.0 9 | 10 | Cascading 1.2.4 11 | http://www.cascading.org/ 12 | GPL 2.0 13 | 14 | Hadoop 0.20.2 15 | http://hadoop.apache.org/ 16 | APL 2.0 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | PyCascading is no longer maintained 2 | =================================== 3 | 4 | PyCascading 5 | =========== 6 | 7 | PyCascading is a Python wrapper for Cascading. You can control the 8 | full data processing workflow from Python. 9 | 10 | * Pipelines are built with Python operators 11 | * User-defined functions are written in Python 12 | * Passing arbitrary contexts to user-defined functions 13 | * Caching of interim results in pipes for faster replay 14 | * Uses Jython 2.5.2, easy integration with Java and Python libraries 15 | 16 | 17 | Examples 18 | -------- 19 | 20 | There can't be a MapReduce tutorial without counting words. Here it is: 21 | 22 | def main(): 23 | ... 
24 | 25 | @udf_map(produces=['word']) 26 | def split_words(tuple): 27 | for word in tuple.get('line').split(): 28 | yield [word] 29 | 30 | input | split_words | group_by('word', native.count()) | output 31 | ... 32 | 33 | Above, the user-defined function that reshapes the stream is annotated with 34 | a PyCascading decorator, and the workflow is created by chaining operations 35 | into each other. 36 | 37 | More examples for the different use cases can be found in the examples folder. 38 | See also the docstrings in the sources for a complete documentation of the 39 | arguments. 40 | 41 | To try the examples, first build the Java sources as described below in the 42 | Building section. Then, change to the 'examples' folder, and issue either 43 | 44 | ../local_run.sh example.py 45 | 46 | for a simulated Hadoop local run, or 47 | 48 | ../remote_deploy.sh -m -s hadoop_server example.py 49 | 50 | to deploy automatically on a Hadoop server. hadoop_server is the SSH address 51 | of an account where the master jar and script will be scp'd to. Note that the 52 | '-m' option has to be used only once in the beginning. The '-m' option copies 53 | the master jar to the server, and any subsequent deploys will use this master 54 | jar, and only the actual Python script will be copied over the network. 55 | 56 | 57 | Usage 58 | ----- 59 | 60 | PyCascading may be used in one of two modes: in local Hadoop mode or with 61 | remote Hadoop deployment. Please note that you need to specify the locations 62 | of the dependencies in the java/dependencies.properties file. 63 | 64 | In *local mode*, the script is executed in Hadoop's local mode. All files 65 | reside on the local file system, and creating a bundled deployment jar is not 66 | necessary. 67 | 68 | To run in this mode, use the script *local_run.sh*, with the first parameter 69 | being the PyCascading script. Additional command line parameters may be used 70 | to pass on to the script. 71 | 72 | In *Hadoop mode*, we assume that Hadoop runs on a remote SSH server (or 73 | localhost). First, a master jar is built and copied to the server. This jar 74 | contains all the PyCascading classes and other dependencies (but not Hadoop) 75 | needed to run a job, and may get rather large if there are a few external jars 76 | included. For this reason it is copied to the Hadoop deployment server only 77 | once, and whenever a new PyCascading script is run by the user, only the 78 | Pythn script is copied to the remote server and bundled there for submission 79 | to Hadoop. The first few variables in the remote_deploy.sh script specify 80 | the Hadoop server and the folders where the deployment files should be placed. 81 | 82 | Use the remote_deploy.sh script to deploy a PyCascading script to the remote 83 | Hadoop server. 84 | 85 | 86 | Building 87 | -------- 88 | 89 | Requirements for building: 90 | 91 | * Cascading 1.2.* or 2.0.0 (http://www.concurrentinc.com/downloads/) 92 | * Jython 2.5.2+ (http://www.jython.org/downloads.html) 93 | * Hadoop 0.20.2+, the version preferably matching the Hadoop runtime 94 | (http://www.apache.org/dyn/closer.cgi/hadoop/common/) 95 | * A Java compiler 96 | * Ant (http://ant.apache.org/) 97 | 98 | Requirements for running: 99 | 100 | * Hadoop installed and set up on the target server (http://hadoop.apache.org/) 101 | * SSH access to the remote server 102 | * If testing scripts locally, a reasonable JVM callable by "java" 103 | 104 | PyCascading consists of Java and Python sources. 
Python sources need no 105 | compiling, but the Java part needs to be built with Ant. For this, change to 106 | the 'java' folder, and invoke ant. This should build the sources and create 107 | a master jar for job submission. 108 | 109 | The locations of the Jython, Cascading, and Hadoop folders on the file system 110 | are specified in the java/dependencies.properties file. You need to correctly 111 | specify these before compiling the source. 112 | 113 | Also, check the remote_deploy.sh script and the locations defined in the 114 | beginning of that file on where to put the jar files on the Hadoop server. 115 | 116 | 117 | Bugs 118 | ---- 119 | 120 | Have a bug or feature request? Please create an issue here on GitHub! 121 | 122 | https://github.com/twitter/pycascading/issues 123 | 124 | 125 | Mailing list 126 | ------------ 127 | 128 | Currently we are using the cascading-user mailing list for discussions. Any 129 | questions, please ask there. 130 | 131 | http://groups.google.com/group/cascading-user 132 | 133 | 134 | Authors 135 | ------- 136 | 137 | **Gabor Szabo** 138 | 139 | + http://twitter.com/gaborjszabo 140 | 141 | License 142 | --------------------- 143 | 144 | Copyright 2011 Twitter, Inc. 145 | 146 | Licensed under the Apache License, Version 2.0 147 | 148 | http://www.apache.org/licenses/LICENSE-2.0 149 | -------------------------------------------------------------------------------- /add_jar_to_build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Copyright 2011 Twitter, Inc. 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # 19 | # Extracts a jar file and adds its contents to the PyCascading jar build. 20 | # 21 | # We need to extract the jar's contents as we expect that it may contain 22 | # further jars, which would not be picked up if we didn't extract the 23 | # whole jar. 24 | # 25 | 26 | usage() 27 | { 28 | cat << EOF 29 | Usage: $0 [ ...] 30 | 31 | Adds the jar files to the main PyCascading jar. This is useful if we have our 32 | own or third party libraries that the PyCascading scripts use, and want to 33 | distribute these to the Hadoop server together with the PyCascading master jar. 34 | 35 | The jar files can contain Java classes, further jars, and Python libraries. 36 | The Java classes should be in folders corresponding to their namespaces, as 37 | usual for jar files. The other Java library jars must be in a \'lib\' folder in 38 | the jar, and the Python imports must be in a \'python\' folder. 39 | 40 | The MANIFEST file, if present, will be discarded. 41 | 42 | Obviously, this script must be run after every new build of PyCascading for all 43 | the jars that should be added to the PyCascading build. 
44 | 45 | EOF 46 | } 47 | 48 | if [ $# -eq 0 ]; then 49 | usage 50 | exit 51 | fi 52 | 53 | home_dir=$(pwd) 54 | pycascading_dir=$(dirname "$0") 55 | 56 | for j in "$@"; do 57 | temp=$(mktemp -d -t PyCascading-tmp-XXXXXX) 58 | cat "$j" | (cd "$temp"; jar x) 59 | rm -rf "$temp/META-INF/MANIFEST.MF" 2>/dev/null 60 | jar -uf "$pycascading_dir/build/pycascading.jar" -C "$temp" . 61 | rm -rf "$temp" 62 | done 63 | -------------------------------------------------------------------------------- /add_tgz_to_build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Copyright 2011 Twitter, Inc. 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # 19 | # Merges tgz archives into the PyCascading tgz build. 20 | # 21 | # The contents of each archive are appended to the master pycascading.tgz, 22 | # so that the Python libraries they contain end up on the search path when 23 | # the job is deployed to the Hadoop server. 24 | # 25 | 26 | usage() 27 | { 28 | cat << EOF 29 | Usage: $0 tgz_file1 [tgz_file2 ...] 30 | 31 | Adds the tgz files to the main PyCascading tgz. This is useful if we have our 32 | own or third party Python libraries that the PyCascading scripts use, and want to 33 | distribute these to the Hadoop server together with the PyCascading master tgz. 34 | 35 | The tgz files can contain Python libraries that will be added to the search path. 36 | 37 | Obviously, this script must be run after every new build of PyCascading for all 38 | the tgzs that should be added to the PyCascading build. 39 | 40 | EOF 41 | } 42 | 43 | if [ $# -eq 0 ]; then 44 | usage 45 | exit 46 | fi 47 | 48 | home_dir=$(pwd) 49 | pycascading_dir=$(dirname "$0") 50 | 51 | temp=$(mktemp -d -t PyCascading-tmp-XXXXXX) 52 | gzip -d <"$pycascading_dir/build/pycascading.tgz" >"$temp/pycascading.tar" 53 | for j in "$@"; do 54 | gzip -d <"$j" >"$temp/archive.tar" 55 | tar -A -f "$temp/pycascading.tar" "$temp/archive.tar" 56 | done 57 | gzip -c <"$temp/pycascading.tar" >"$pycascading_dir/build/pycascading.tgz" 58 | rm -rf "$temp" 59 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | PyCascading examples 2 | ==================== 3 | 4 | This folder showcases a number of features offered by Cascading and 5 | PyCascading.
They use input files in the 'pycascading\_data' folder, so 6 | before running the examples, make sure that: 7 | 8 | * in local mode, you cd first to the examples/ directory (or wherever 9 | pycascading\_data/ is found), and use local\_run.sh to run the example, e.g. ../local\_run.sh word\_count.py 10 | * in Hadoop mode, you copy the data folder to HDFS first by running 11 | copy\_data\_to\_hdfs.sh, or 12 | 13 | hadoop fs -put pycascading\_data pycascading\_data 14 | 15 | and then invoke remote\_deploy.sh 16 | -------------------------------------------------------------------------------- /examples/cache.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Example showing how to use caches. 17 | 18 | A cache saves the result of an operation to a temporary folder, and running 19 | the same script again will take the data from the cached files, instead of 20 | executing the original pipe again. Try to run this job several times with 21 | different separators: after the first run, the checkpointed state will be 22 | used for subsequent runs. 23 | 24 | This is useful if we want to repeatedly run the script with modifications 25 | to parts that do not change the cached results. 26 | 27 | For this script, the first run will have two MR jobs, but any subsequent runs 28 | will only have one, as the checkpointed 'line_begins' result is read back from the cache instead of being recomputed. 29 | """ 30 | 31 | import sys 32 | from pycascading.helpers import * 33 | 34 | 35 | @udf_map 36 | def find_lines_with_beginning(tuple, first_char): 37 | try: 38 | if tuple.get(1)[0] == first_char: 39 | return [tuple.get(1)] 40 | except: 41 | pass 42 | 43 | 44 | @udf_buffer 45 | def concat_all(group, tuples, separator): 46 | out = '' 47 | for tuple in tuples: 48 | try: 49 | out = out + tuple.get(0) + separator 50 | except: 51 | pass 52 | return [out] 53 | 54 | 55 | def main(): 56 | if len(sys.argv) < 2: 57 | print 'A character must be given as a command line argument for the ' \ 58 | 'separator character.'
59 | return 60 | 61 | flow = Flow() 62 | input = flow.source(Hfs(TextLine(), 'pycascading_data/town.txt')) 63 | output = flow.tsv_sink('pycascading_data/out') 64 | 65 | # Select the lines beginning with 'A', and save this intermediate result 66 | # in the cache so that we can call the script several times with 67 | # different separator characters 68 | p = input | map_replace(find_lines_with_beginning('A'), 'line') 69 | # Checkpoint the results from 'p' into a cache folder named 'line_begins' 70 | # The caches are in the user's HDFS folder, under pycascading.cache/ 71 | p = flow.cache('line_begins') | p 72 | # Everything goes to one reducer 73 | p | group_by(Fields.VALUES, concat_all(sys.argv[1]), 'result') | output 74 | 75 | flow.run(num_reducers=1) 76 | -------------------------------------------------------------------------------- /examples/callback.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """ 17 | Contrived example showing that you can pass functions as args to a UDF. 18 | Also shows how to use keyword args (just the way it's expected). 19 | 20 | Thanks to ebernhardson. 21 | """ 22 | 23 | from pycascading.helpers import * 24 | 25 | 26 | def word_count_callback(value): 27 | return len(value.split()) 28 | 29 | 30 | @udf_map 31 | def word_count(tuple, inc, second_inc, callback=None): 32 | return [inc + second_inc + callback(tuple.get(1)), tuple.get(1)] 33 | 34 | 35 | def main(): 36 | flow = Flow() 37 | input = flow.source(Hfs(TextLine(), 'pycascading_data/town.txt')) 38 | output = flow.tsv_sink('pycascading_data/out') 39 | 40 | p = input | map_replace( 41 | word_count(100, second_inc=200, callback=word_count_callback), 42 | ['word_count', 'line']) | output 43 | 44 | flow.run(num_reducers=1) 45 | -------------------------------------------------------------------------------- /examples/copy_data_to_hdfs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Run this on the Hadoop server to copy the data files needed 4 | # to run the PyCascading examples to HDFS 5 | hadoop fs -put pycascading_data pycascading_data 6 | -------------------------------------------------------------------------------- /examples/joins.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Example showing the joining and splitting of tuple streams.""" 17 | 18 | 19 | from pycascading.helpers import * 20 | 21 | 22 | @udf_map(produces=['ucase_lhs2', 'rhs2']) 23 | def upper_case(tuple): 24 | """Return the upper case of the 'lhs2' column, and the 'rhs2' column""" 25 | return [tuple.get('lhs2').upper(), tuple.get('rhs2')] 26 | 27 | 28 | def main(): 29 | flow = Flow() 30 | lhs = flow.source(Hfs(TextDelimited(Fields(['col1', 'col2']), ' ', 31 | [Integer, String]), 32 | 'pycascading_data/lhs.txt')) 33 | rhs = flow.source(Hfs(TextDelimited(Fields(['col1', 'col2']), ' ', 34 | [Integer, String]), 35 | 'pycascading_data/rhs.txt')) 36 | output1 = flow.tsv_sink('pycascading_data/out1') 37 | output2 = flow.tsv_sink('pycascading_data/out2') 38 | 39 | # Join on the first columns ('col1' for both) of lhs and rhs inputs 40 | # We need to use declared_fields since the field names 41 | # of the two pipes overlap 42 | p = (lhs & rhs) | inner_join(['col1', 'col1'], 43 | declared_fields=['lhs1', 'lhs2', 'rhs1', 'rhs2']) 44 | 45 | # Save the 2nd and 4th columns of p to output1 46 | p | retain('lhs2', 'rhs2') | output1 47 | 48 | # Join on the upper-cased first column of p and the 2nd column of rhs, 49 | # and save the output to output2 50 | ((p | upper_case) & (rhs | retain('col2'))) | \ 51 | inner_join(['ucase_lhs2', 'col2']) | output2 52 | 53 | flow.run(num_reducers=2) 54 | -------------------------------------------------------------------------------- /examples/map_types.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Example illustrating the different types of map operations. 17 | 18 | In the output folders check the .pycascading_types and .pycascading_header 19 | files to see what the names of the fields were when the pipes were sunk.
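As a rough illustration (field names follow the comments in main() below): with the
TextLine() source producing the fields ('offset', 'line') and a UDF declared to
produce 'word',

    input | map_replace(decorated_udf)   # output fields: ['word']
    input | map_add(decorated_udf)       # output fields: ['offset', 'line', 'word']

map_replace swaps the consumed input fields for the UDF's output, while map_add
appends the produced field(s) to the input tuple.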
20 | """ 21 | 22 | 23 | from pycascading.helpers import * 24 | 25 | 26 | def main(): 27 | flow = Flow() 28 | input = flow.source(Hfs(TextLine(), 'pycascading_data/town.txt')) 29 | 30 | out_folder = 'pycascading_data/maps/' 31 | 32 | @udf(produces='word') 33 | def decorated_udf(tuple): 34 | for word in tuple.get('line').split(): 35 | yield [word] 36 | 37 | def undecorated_udf(tuple): 38 | for word in tuple.get('line').split(): 39 | yield [word] 40 | 41 | # This will create an output with one field called 'word', as the UDF 42 | # was declared with a 'produces' 43 | # In this case the swap swaps out the whole input tuple with the output 44 | input | map_replace(decorated_udf) | \ 45 | flow.tsv_sink(out_folder + 'decorated_udf') 46 | 47 | # This will create an output with one unnamed field, but otherwise the 48 | # same as the previous one 49 | input | map_replace(undecorated_udf) | \ 50 | flow.tsv_sink(out_folder + 'undecorated_udf') 51 | 52 | # This will only replace the first ('line') field with the output of 53 | # the map, but 'offset' will be retained 54 | # Note that once we add an unnamed field, all field names will be lost 55 | input | map_replace(1, undecorated_udf) | \ 56 | flow.tsv_sink(out_folder + 'undecorated_udf_with_input_args') 57 | 58 | # This will create one field only, 'word', just like the first example 59 | input | map_replace(undecorated_udf, 'word') | \ 60 | flow.tsv_sink(out_folder + 'undecorated_udf_with_output_fields') 61 | 62 | # This one will add the new column, 'word', to all lines 63 | input | map_add(decorated_udf) | \ 64 | flow.tsv_sink(out_folder + 'decorated_udf_all') 65 | 66 | # This produces the same output as the previous example 67 | input | map_add(1, undecorated_udf, 'word') | \ 68 | flow.tsv_sink(out_folder + 'undecorated_udf_all') 69 | 70 | flow.run(num_reducers=1) 71 | -------------------------------------------------------------------------------- /examples/merge_streams.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Merge two streams together. 17 | 18 | We are using Cascading GroupBy with multiple input streams to join them into 19 | one. The streams have to have the same field names and types. 20 | 21 | If the column names are different, Cascading won't even build the flow, 22 | however if the column types differ, the flow is run but most likely will fail 23 | due to different types not being comparable when grouping. 
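As a rough, untested sketch of the workaround, assuming hypothetically that the
second source had used the field names 'id' and 'name': rename them to match the
first stream before merging, e.g.

    (stream1 & (stream2 | rename(['id', 'name'], ['col1', 'col2']))) | group_by() | output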
24 | """ 25 | 26 | from pycascading.helpers import * 27 | 28 | 29 | def main(): 30 | flow = Flow() 31 | stream1 = flow.source(Hfs(TextDelimited(Fields(['col1', 'col2']), ' ', 32 | [Integer, String]), 33 | 'pycascading_data/lhs.txt')) 34 | stream2 = flow.source(Hfs(TextDelimited(Fields(['col1', 'col2']), ' ', 35 | [Integer, String]), 36 | 'pycascading_data/rhs.txt')) 37 | output = flow.tsv_sink('pycascading_data/out') 38 | 39 | (stream1 & stream2) | group_by() | output 40 | 41 | flow.run(num_reducers=1) 42 | -------------------------------------------------------------------------------- /examples/pagerank.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Calculates PageRank for a given graph. 17 | 18 | We assume that there are no dangling pages with no outgoing links. 19 | """ 20 | 21 | import os 22 | from pycascading.helpers import * 23 | 24 | 25 | def test(graph_file, d, iterations): 26 | """This is the Python implementation of PageRank.""" 27 | in_links = {} 28 | out_degree = {} 29 | pagerank = {} 30 | file = open(graph_file) 31 | for line in file: 32 | (source, dest) = line.rstrip().split() 33 | try: 34 | in_links[dest].add(source) 35 | except KeyError: 36 | in_links[dest] = set(source) 37 | try: 38 | out_degree[source] += 1 39 | except KeyError: 40 | out_degree[source] = 1 41 | pagerank[source] = 1.0 42 | pagerank[dest] = 1.0 43 | file.close() 44 | old_pr = pagerank 45 | new_pr = {} 46 | for iteration in xrange(0, iterations): 47 | for node in old_pr: 48 | new_pr[node] = (1 - d) 49 | try: 50 | new_pr[node] += \ 51 | d * sum([old_pr[n] / out_degree[n] for n in in_links[node]]) 52 | except KeyError: 53 | pass 54 | tmp = old_pr 55 | old_pr = new_pr 56 | new_pr = tmp 57 | return old_pr 58 | 59 | 60 | def main(): 61 | """The PyCascading job.""" 62 | # The damping factor 63 | d = 0.85 64 | # The number of iterations 65 | iterations = 5 66 | 67 | # The directed, unweighted graph in a space-separated file, in 68 | # format 69 | graph_file = 'pycascading_data/graph.txt' 70 | 71 | graph_source = Hfs(TextDelimited(Fields(['from', 'to']), ' ', 72 | [String, String]), graph_file) 73 | 74 | out_links_file = 'pycascading_data/out/pagerank/out_links' 75 | pr_values_1 = 'pycascading_data/out/pagerank/iter1' 76 | pr_values_2 = 'pycascading_data/out/pagerank/iter2' 77 | 78 | # Some setup here: we'll need the ougoing degree of nodes, and we will 79 | # initialize the pageranks of nodes to 1.0 80 | flow = Flow() 81 | graph = flow.source(graph_source) 82 | 83 | # Count the number of outgoing links for every node that is a source, 84 | # and store it in a field called 'out_degree' 85 | graph | group_by('from') | native.count('out_degree') | \ 86 | flow.binary_sink(out_links_file) 87 | 88 | # Initialize the pageranks of all nodes to 1.0 89 | # This file has fields 'node' and 'pagerank', and is stored to pr_values_1 90 | @udf 91 | def 
constant(tuple, c): 92 | """Just a field with a constant value c.""" 93 | yield [c] 94 | @udf 95 | def both_nodes(tuple): 96 | """For each link returns both endpoints.""" 97 | yield [tuple.get(0)] 98 | yield [tuple.get(1)] 99 | graph | map_replace(both_nodes, 'node') | \ 100 | native.unique(Fields.ALL) | map_add(constant(1.0), 'pagerank') | \ 101 | flow.binary_sink(pr_values_1) 102 | 103 | flow.run(num_reducers=1) 104 | 105 | pr_input = pr_values_1 106 | pr_output = pr_values_2 107 | for iteration in xrange(0, iterations): 108 | flow = Flow() 109 | 110 | graph = flow.source(graph_source) 111 | pageranks = flow.meta_source(pr_input) 112 | out_links = flow.meta_source(out_links_file) 113 | 114 | # Decorate the graph's source nodes with their pageranks and the 115 | # number of their outgoing links 116 | # We could have joined graph & out_links outside of the loop, but 117 | # in order to demonstrate joins with multiple streams, we do it here 118 | p = (graph & pageranks & (out_links | rename('from', 'from_out'))) | \ 119 | inner_join(['from', 'node', 'from_out']) | \ 120 | rename(['pagerank', 'out_degree'], ['from_pagerank', 'from_out_degree']) | \ 121 | retain('from', 'from_pagerank', 'from_out_degree', 'to') 122 | 123 | # Distribute the sources' pageranks to their out-neighbors equally 124 | @udf 125 | def incremental_pagerank(tuple, d): 126 | yield [d * tuple.get('from_pagerank') / tuple.get('from_out_degree')] 127 | p = p | map_replace(['from', 'from_pagerank', 'from_out_degree'], 128 | incremental_pagerank(d), 'incr_pagerank') | \ 129 | rename('to', 'node') | retain('node', 'incr_pagerank') 130 | 131 | # Add the constant jump probability to all the pageranks that come 132 | # from the in-links 133 | p = (p & (pageranks | map_replace('pagerank', constant(1.0 - d), 'incr_pagerank'))) | group_by() 134 | p = p | group_by('node', 'incr_pagerank', native.sum('pagerank')) 135 | 136 | if iteration == iterations - 1: 137 | # Only store the final result in a TSV file 138 | p | flow.tsv_sink(pr_output) 139 | else: 140 | # Store intermediate results in a binary format for faster IO 141 | p | flow.binary_sink(pr_output) 142 | 143 | # Swap the input and output folders for the next iteration 144 | tmp = pr_input 145 | pr_input = pr_output 146 | pr_output = tmp 147 | 148 | flow.run(num_reducers=1) 149 | 150 | print 'Results from PyCascading:', pr_input 151 | os.system('cat %s/.pycascading_header %s/part*' % (pr_input, pr_input)) 152 | 153 | print 'The test values:' 154 | test_pr = test(graph_file, d, iterations) 155 | print 'node\tpagerank' 156 | for n in sorted(test_pr.iterkeys()): 157 | print '%s\t%g' % (n, test_pr[n]) 158 | -------------------------------------------------------------------------------- /examples/pycascading_data/graph.txt: -------------------------------------------------------------------------------- 1 | 1 2 2 | 1 3 3 | 1 4 4 | 2 4 5 | 3 4 6 | 4 2 7 | -------------------------------------------------------------------------------- /examples/pycascading_data/lhs.txt: -------------------------------------------------------------------------------- 1 | 1 a 2 | 1 b 3 | 1 c 4 | 2 b 5 | 2 c 6 | 2 d 7 | 3 c 8 | 4 b 9 | 4 c 10 | 4 d 11 | 5 a 12 | 5 b 13 | 5 e 14 | -------------------------------------------------------------------------------- /examples/pycascading_data/repeats.txt: -------------------------------------------------------------------------------- 1 | a 1 2 | b 2 3 | c 3 4 | b 2 5 | a 1 6 | a 1 7 | c 3 8 | b 2 9 | a 1 10 | 
-------------------------------------------------------------------------------- /examples/pycascading_data/rhs.txt: -------------------------------------------------------------------------------- 1 | 1 A 2 | 1 B 3 | 1 C 4 | 2 B 5 | 2 C 6 | 2 D 7 | 3 C 8 | 4 B 9 | 4 C 10 | 4 D 11 | 5 A 12 | 5 B 13 | 5 E 14 | -------------------------------------------------------------------------------- /examples/pycascading_data/town.txt: -------------------------------------------------------------------------------- 1 | There's many a strong farmer 2 | Whose heart would break in two, 3 | If he could see the townland 4 | That we are riding to; 5 | Boughs have their fruit and blossom 6 | At all times of the year; 7 | Rivers are running over 8 | With red beer and brown beer. 9 | An old man plays the bagpipes 10 | In a golden and silver wood; 11 | Queens, their eyes blue like the ice, 12 | Are dancing in a crowd. 13 | 14 | The little fox he murmured, 15 | 'O what of the world's bane?' 16 | The sun was laughing sweetly, 17 | The moon plucked at my rein; 18 | But the little red fox murmured, 19 | 'O do not pluck at his rein, 20 | He is riding to the townland 21 | That is the world's bane.' 22 | 23 | When their hearts are so high 24 | That they would come to blows, 25 | They unhook their heavy swords 26 | From golden and silver boughs; 27 | But all that are killed in battle 28 | Awaken to life again. 29 | It is lucky that their story 30 | Is not known among men, 31 | For O, the strong farmers 32 | That would let the spade lie, 33 | Their hearts would be like a cup 34 | That somebody had drunk dry. 35 | 36 | The little fox he murmured, 37 | 'O what of the world's bane?' 38 | The sun was laughing sweetly, 39 | The moon plucked at my rein; 40 | But the little red fox murmured, 41 | 'O do not pluck at his rein, 42 | He is riding to the townland 43 | That is the world's bane.' 44 | 45 | Michael will unhook his trumpet 46 | From a bough overhead, 47 | And blow a little noise 48 | When the supper has been spread. 49 | Gabriel will come from the water 50 | With a fish-tail, and talk 51 | Of wonders that have happened 52 | On wet roads where men walk. 53 | And lift up an old horn 54 | Of hammered silver, and drink 55 | Till he has fallen asleep 56 | Upon the starry brink. 57 | 58 | The little fox he murmured, 59 | 'O what of the world's bane?' 60 | The sun was laughing sweetly, 61 | The moon plucked at my rein; 62 | But the little red fox murmured. 63 | 'O do not pluck at his rein, 64 | He is riding to the townland 65 | That is the world's bane.' 66 | -------------------------------------------------------------------------------- /examples/python_fields.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Example demonstrating the use of arbitrary Python (or Java) data in tuples. 17 | 18 | The fields have to implement Serializable. 
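The nested list and dictionary returned by add_python_data() below are one
example of such values (see the note on Java serialization further down).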
19 | 20 | Currently these fields cannot be joined on, since we do not want to 21 | deserialize them for each comparison. We are also doing a join here to test 22 | the serializers. 23 | 24 | Note that the serialization is currently done using the standard Java 25 | serialization framework, and thus is slow and produces large blobs. There are 26 | plans to use more efficient serializers in the future. 27 | """ 28 | 29 | 30 | from pycascading.helpers import * 31 | 32 | 33 | @udf_map(produces=['col1', 'col2', 'info']) 34 | def add_python_data(tuple): 35 | """This function returns a Python data structure as well.""" 36 | return [ tuple.get(0), tuple.get(1), [ 'first', { 'key' : 'value' } ]] 37 | 38 | 39 | def main(): 40 | flow = Flow() 41 | lhs = flow.source(Hfs(TextDelimited(Fields(['col1', 'col2']), ' ', 42 | [Integer, String]), 43 | 'pycascading_data/lhs.txt')) 44 | rhs = flow.source(Hfs(TextDelimited(Fields(['col1', 'col2']), ' ', 45 | [Integer, String]), 46 | 'pycascading_data/rhs.txt')) 47 | 48 | ((lhs | add_python_data()) & rhs) | inner_join(['col1', 'col1'], 49 | declared_fields=['lhs1', 'lhs2', 'info', 'rhs1', 'rhs2']) | \ 50 | flow.tsv_sink('pycascading_data/out') 51 | 52 | flow.run(num_reducers=2) 53 | -------------------------------------------------------------------------------- /examples/reduce.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Example showing how to use filters and buffers. 17 | 18 | A buffer UDF is similar to the built-in Python reduce function. It takes a 19 | group of tuples that have been previously grouped by group_by, and yields an 20 | arbitrary number of new tuples for the group (it is most useful though to do 21 | some aggregation on the group). The tuples are fetched using an iterator. 
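As a minimal sketch (the 'key' and 'value' field names here are only
illustrative), a buffer that sums a numeric column for each group could look
like this:

    @udf_buffer(produces=['total'])
    def sum_values(group, tuples):
        total = 0
        for tuple in tuples:
            total += tuple.get('value')
        yield [total]

    p | group_by('key', sum_values()) | output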
22 | """ 23 | 24 | from pycascading.helpers import * 25 | 26 | 27 | @udf_filter 28 | def starts_with_letter(tuple, letter): 29 | try: 30 | return tuple.get(1)[0].upper() == letter 31 | except: 32 | return False 33 | 34 | 35 | @udf_map 36 | def word_count(tuple): 37 | return [len(tuple.get(1).split()), tuple.get(1)] 38 | 39 | 40 | def main(): 41 | flow = Flow() 42 | input = flow.source(Hfs(TextLine(), 'pycascading_data/town.txt')) 43 | output = flow.tsv_sink('pycascading_data/out') 44 | 45 | p = input | filter_by(starts_with_letter('A')) | \ 46 | map_replace(word_count(), ['word_count', 'line']) 47 | 48 | @udf_buffer(produces=['word_count', 'count', 'first_chars']) 49 | def count(group, tuples): 50 | """Counts the number of tuples in the group, and also emits a string 51 | that is the first character of the 'line' column repeated this many 52 | times.""" 53 | c = 0 54 | first_char = '' 55 | for tuple in tuples: 56 | c += 1 57 | first_char += tuple.get('line')[0] 58 | yield [group.get(0), c, first_char] 59 | 60 | p | group_by('word_count', count()) | output 61 | 62 | flow.run(num_reducers=2) 63 | -------------------------------------------------------------------------------- /examples/subassembly.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Example demonstrating the use of predefined subassemblies. 17 | 18 | Useful aggregators, subassemblies, pipes available in Cascading are imported 19 | into PyCascading by native.py 20 | """ 21 | 22 | from pycascading.helpers import * 23 | 24 | 25 | def main(): 26 | flow = Flow() 27 | repeats = flow.source(Hfs(TextDelimited(Fields(['col1', 'col2']), ' ', 28 | [String, Integer]), 29 | 'pycascading_data/repeats.txt')) 30 | output = flow.tsv_sink('pycascading_data/out') 31 | 32 | # This selects the distinct records considering all fields 33 | repeats | native.unique(Fields.ALL) | output 34 | 35 | flow.run() 36 | -------------------------------------------------------------------------------- /examples/total_sort.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # 15 | 16 | """Simple word count example with reverse sorting of the words by frequency.""" 17 | 18 | from pycascading.helpers import * 19 | 20 | 21 | def main(): 22 | flow = Flow() 23 | input = flow.source(Hfs(TextLine(), 'pycascading_data/town.txt')) 24 | output = flow.tsv_sink('pycascading_data/out') 25 | 26 | @udf_map 27 | def split_words(tuple): 28 | for word in tuple.get(1).split(): 29 | yield [word] 30 | 31 | input | \ 32 | map_replace(split_words, 'word') | \ 33 | group_by('word') | \ 34 | native.count() | \ 35 | group_by(Fields.VALUES, sort_fields=['count'], reverse_order=True) | \ 36 | output 37 | 38 | flow.run(num_reducers=5) 39 | -------------------------------------------------------------------------------- /examples/udf_contexts.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Example showing how to pass in parameters to UDFs. 17 | 18 | The context is serialized and shipped to where the UDFs are executed. A use 19 | case for example is to perform replicated joins on constant data. 20 | """ 21 | 22 | from pycascading.helpers import * 23 | 24 | 25 | def main(): 26 | flow = Flow() 27 | input = flow.source(Hfs(TextLine(), 'pycascading_data/town.txt')) 28 | output = flow.tsv_sink('pycascading_data/out') 29 | 30 | @udf_filter 31 | def starts_with_letters(tuple, field, letters): 32 | """Only let tuples through whose second field starts with a given letter. 33 | 34 | The set of acceptable initial letters is passed in the letters parameter, 35 | and is defined at the time when we build the flow. 36 | """ 37 | try: 38 | return tuple.get(field)[0].upper() in letters 39 | except: 40 | return False 41 | 42 | # Retain only lines that start with an 'A' or 'T' 43 | input | retain('line') | starts_with_letters(0, set(['A', 'T'])) | output 44 | 45 | flow.run(num_reducers=2) 46 | -------------------------------------------------------------------------------- /examples/word_count.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Simple word count example.""" 17 | 18 | from pycascading.helpers import * 19 | 20 | 21 | @udf_map(produces=['word']) 22 | def split_words(tuple): 23 | """The function to split the line and return several new tuples. 
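For example, the input line "An old man plays the bagpipes" from town.txt yields
six one-field tuples: ['An'], ['old'], ['man'], ['plays'], ['the'], ['bagpipes'].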
24 | 25 | The tuple to operate on is passed in as the first parameter. We are 26 | yielding the results in a for loop back. Each word becomes the only field 27 | in a new tuple stream, and the string to be split is the 2nd field of the 28 | input tuple. 29 | """ 30 | for word in tuple.get(1).split(): 31 | yield [word] 32 | 33 | 34 | def main(): 35 | flow = Flow() 36 | # The TextLine() scheme produces tuples where the first field is the 37 | # offset of the line in the file, and the second is the line as a string. 38 | input = flow.source(Hfs(TextLine(), 'pycascading_data/town.txt')) 39 | output = flow.tsv_sink('pycascading_data/out') 40 | 41 | input | split_words | group_by('word', native.count()) | output 42 | 43 | flow.run(num_reducers=2) 44 | -------------------------------------------------------------------------------- /java/build.xml: -------------------------------------------------------------------------------- 1 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 65 | 66 | 67 | 68 | 69 | 70 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /java/dependencies.properties: -------------------------------------------------------------------------------- 1 | # The folder where Cascading was downloaded to 2 | # http://www.concurrentinc.com/downloads/ 3 | cascading=/opt/cascading-1.2.5-hadoop-0.19.2+ 4 | 5 | # At least Jython version 2.5.2 required 6 | # Download from http://www.jython.org/downloads.html 7 | jython=/opt/jython 8 | 9 | # Hadoop's folder 10 | # Download from http://www.apache.org/dyn/closer.cgi/hadoop/common/ 11 | hadoop=/opt/hadoop-0.20.203.0 12 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/CascadingAggregatorWrapper.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2011 Twitter, Inc. 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | package com.twitter.pycascading; 16 | 17 | import java.io.Serializable; 18 | 19 | import cascading.flow.FlowProcess; 20 | import cascading.operation.Aggregator; 21 | import cascading.operation.AggregatorCall; 22 | import cascading.tuple.Fields; 23 | import cascading.tuple.TupleEntry; 24 | import cascading.tuple.TupleEntryCollector; 25 | 26 | /** 27 | * Wrapper for a Cascading Aggregator that calls a Python function. 
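 * Note that the start/aggregate/complete implementations below are stubs that
 * only print debug output and do not call into the Python function yet.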
28 | * TODO: we don't really need this, as Buffers are just as good as Aggregators 29 | * 30 | * @author Gabor Szabo 31 | */ 32 | @SuppressWarnings("rawtypes") 33 | public class CascadingAggregatorWrapper extends CascadingRecordProducerWrapper implements 34 | Aggregator, Serializable { 35 | private static final long serialVersionUID = -5110929817978998473L; 36 | 37 | public CascadingAggregatorWrapper() { 38 | super(); 39 | } 40 | 41 | public CascadingAggregatorWrapper(Fields fieldDeclaration) { 42 | super(fieldDeclaration); 43 | } 44 | 45 | public CascadingAggregatorWrapper(int numArgs) { 46 | super(numArgs); 47 | } 48 | 49 | public CascadingAggregatorWrapper(int numArgs, Fields fieldDeclaration) { 50 | super(numArgs, fieldDeclaration); 51 | } 52 | 53 | @Override 54 | public void start(FlowProcess flowProcess, AggregatorCall aggregatorCall) { 55 | // TODO Auto-generated method stub 56 | System.out.println("Aggregator start called"); 57 | } 58 | 59 | @Override 60 | public void aggregate(FlowProcess flowProcess, AggregatorCall aggregatorCall) { 61 | TupleEntry group = aggregatorCall.getGroup(); 62 | TupleEntryCollector outputCollector = aggregatorCall.getOutputCollector(); 63 | 64 | System.out.println("Aggregator called with group: " + group); 65 | } 66 | 67 | @Override 68 | public void complete(FlowProcess flowProcess, AggregatorCall aggregatorCall) { 69 | // TODO Auto-generated method stub 70 | System.out.println("Aggregator complete called"); 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/CascadingBufferWrapper.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2011 Twitter, Inc. 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | package com.twitter.pycascading; 16 | 17 | import java.io.IOException; 18 | import java.io.ObjectInputStream; 19 | import java.io.Serializable; 20 | import java.util.Iterator; 21 | 22 | import org.python.core.Py; 23 | 24 | import cascading.flow.FlowProcess; 25 | import cascading.operation.Buffer; 26 | import cascading.operation.BufferCall; 27 | import cascading.tuple.Fields; 28 | import cascading.tuple.TupleEntry; 29 | import cascading.tuple.TupleEntryCollector; 30 | 31 | /** 32 | * Wrapper for a Cascading Buffer that calls a Python function. 
33 | * 34 | * @author Gabor Szabo 35 | */ 36 | @SuppressWarnings("rawtypes") 37 | public class CascadingBufferWrapper extends CascadingRecordProducerWrapper implements Buffer, 38 | Serializable { 39 | private static final long serialVersionUID = -3512295576396796360L; 40 | 41 | public CascadingBufferWrapper() { 42 | super(); 43 | } 44 | 45 | public CascadingBufferWrapper(Fields fieldDeclaration) { 46 | super(fieldDeclaration); 47 | } 48 | 49 | public CascadingBufferWrapper(int numArgs) { 50 | super(numArgs); 51 | } 52 | 53 | public CascadingBufferWrapper(int numArgs, Fields fieldDeclaration) { 54 | super(numArgs, fieldDeclaration); 55 | } 56 | 57 | private void readObject(ObjectInputStream stream) throws IOException, ClassNotFoundException { 58 | setupArgs(); 59 | } 60 | 61 | public int getNumParameters() { 62 | return super.getNumParameters() + 1; 63 | } 64 | 65 | @Override 66 | public void operate(FlowProcess flowProcess, BufferCall bufferCall) { 67 | // TODO: if the Python buffer expects Python dicts or lists, then we need to 68 | // convert the Iterator 69 | @SuppressWarnings("unchecked") 70 | Iterator arguments = bufferCall.getArgumentsIterator(); 71 | 72 | // This gets called even when there are no tuples in the group after 73 | // a GroupBy (see the Buffer javadoc). So we need to check if there are any 74 | // valid tuples returned in the group. 75 | if (arguments.hasNext()) { 76 | TupleEntry group = bufferCall.getGroup(); 77 | TupleEntryCollector outputCollector = bufferCall.getOutputCollector(); 78 | 79 | callArgs[0] = Py.java2py(group); 80 | callArgs[1] = Py.java2py(arguments); 81 | if (outputMethod == OutputMethod.COLLECTS) { 82 | callArgs[2] = Py.java2py(outputCollector); 83 | callFunction(); 84 | } else { 85 | Object ret = callFunction(); 86 | collectOutput(outputCollector, ret); 87 | } 88 | } 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/CascadingFilterWrapper.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2011 Twitter, Inc. 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | package com.twitter.pycascading; 16 | 17 | import java.io.ObjectInputStream; 18 | import java.io.Serializable; 19 | 20 | import org.python.core.Py; 21 | import org.python.core.PyObject; 22 | 23 | import cascading.flow.FlowProcess; 24 | import cascading.operation.Filter; 25 | import cascading.operation.FilterCall; 26 | import cascading.tuple.Fields; 27 | 28 | /** 29 | * Wrapper for a Cascading Filter that calls a Python function. 
30 | * 31 | * @author Gabor Szabo 32 | */ 33 | @SuppressWarnings("rawtypes") 34 | public class CascadingFilterWrapper extends CascadingBaseOperationWrapper implements Filter, 35 | Serializable { 36 | private static final long serialVersionUID = -8825679328970045134L; 37 | 38 | public CascadingFilterWrapper() { 39 | super(); 40 | } 41 | 42 | public CascadingFilterWrapper(Fields fieldDeclaration) { 43 | // If we set it to anything other than Fields.ALL, Cascading complains 44 | super(Fields.ALL); 45 | } 46 | 47 | public CascadingFilterWrapper(int numArgs) { 48 | super(numArgs); 49 | } 50 | 51 | public CascadingFilterWrapper(int numArgs, Fields fieldDeclaration) { 52 | super(numArgs, fieldDeclaration); 53 | } 54 | 55 | public int getNumParameters() { 56 | return 1; 57 | } 58 | 59 | private void readObject(ObjectInputStream stream) { 60 | setupArgs(); 61 | } 62 | 63 | @Override 64 | public boolean isRemove(FlowProcess flowProcess, FilterCall filterCall) { 65 | Object tuple = convertInput(filterCall.getArguments()); 66 | callArgs[0] = Py.java2py(tuple); 67 | PyObject ret = callFunction(); 68 | return !Py.py2boolean(ret); 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/CascadingFunctionWrapper.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2011 Twitter, Inc. 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | package com.twitter.pycascading; 16 | 17 | import java.io.ObjectInputStream; 18 | import java.io.Serializable; 19 | 20 | import org.python.core.Py; 21 | 22 | import cascading.flow.FlowProcess; 23 | import cascading.operation.Function; 24 | import cascading.operation.FunctionCall; 25 | import cascading.operation.OperationCall; 26 | import cascading.tuple.Fields; 27 | import cascading.tuple.TupleEntryCollector; 28 | 29 | /** 30 | * Wrapper for a Cascading Function that calls a Python function. 31 | * 32 | * @author Gabor Szabo 33 | */ 34 | @SuppressWarnings("rawtypes") 35 | public class CascadingFunctionWrapper extends CascadingRecordProducerWrapper implements Function, 36 | Serializable { 37 | private static final long serialVersionUID = -3512295576396796360L; 38 | 39 | public CascadingFunctionWrapper() { 40 | super(); 41 | } 42 | 43 | public CascadingFunctionWrapper(Fields fieldDeclaration) { 44 | super(fieldDeclaration); 45 | } 46 | 47 | public CascadingFunctionWrapper(int numArgs) { 48 | super(numArgs); 49 | } 50 | 51 | public CascadingFunctionWrapper(int numArgs, Fields fieldDeclaration) { 52 | super(numArgs, fieldDeclaration); 53 | } 54 | 55 | /** 56 | * We need to call setupArgs() from here, otherwise CascadingFunctionWrapper 57 | * is not initialized yet if we call it from CascadingBaseOperationWrapper. 
58 | */ 59 | private void readObject(ObjectInputStream stream) { 60 | setupArgs(); 61 | } 62 | 63 | @Override 64 | public void prepare(FlowProcess flowProcess, OperationCall operationCall) { 65 | super.prepare(flowProcess, operationCall); 66 | } 67 | 68 | @Override 69 | public void operate(FlowProcess flowProcess, FunctionCall functionCall) { 70 | Object inputTuple = convertInput(functionCall.getArguments()); 71 | TupleEntryCollector outputCollector = functionCall.getOutputCollector(); 72 | 73 | callArgs[0] = Py.java2py(inputTuple); 74 | if (outputMethod == OutputMethod.COLLECTS) { 75 | // The Python function collects the output tuples itself into the output 76 | // collector 77 | callArgs[1] = Py.java2py(outputCollector); 78 | callFunction(); 79 | } else { 80 | // The Python function yields or returns records 81 | Object ret = callFunction(); 82 | collectOutput(outputCollector, ret); 83 | } 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/CascadingRecordProducerWrapper.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2011 Twitter, Inc. 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | package com.twitter.pycascading; 16 | 17 | import java.io.Serializable; 18 | 19 | import org.python.core.PyGenerator; 20 | import org.python.core.PyNone; 21 | import org.python.core.PyObject; 22 | import org.python.core.PySequenceList; 23 | 24 | import cascading.tuple.Fields; 25 | import cascading.tuple.Tuple; 26 | import cascading.tuple.TupleEntry; 27 | import cascading.tuple.TupleEntryCollector; 28 | 29 | /** 30 | * This class is the parent class for Cascading Functions and Buffers. It 31 | * essetially converts records coming from the Python function to tuples. 32 | * 33 | * @author Gabor Szabo 34 | */ 35 | public class CascadingRecordProducerWrapper extends CascadingBaseOperationWrapper implements 36 | Serializable { 37 | private static final long serialVersionUID = -1198203231681047370L; 38 | 39 | // This is how the Python function returns the output tuples. It can add them 40 | // to the output collector right away, provide a generator to yield one or 41 | // more records, or return one record only. YIELDS_OR_RETURNS means that 42 | // PyCascading should determine automatically if it's a generator or a normal 43 | // function. 44 | public enum OutputMethod { 45 | COLLECTS, YIELDS, RETURNS, YIELDS_OR_RETURNS 46 | } 47 | 48 | // This is what the Python function returns: a Python list or a Cascading 49 | // tuple, or PyCascading can also figure it out automatically from the first 50 | // record returned. 51 | // 52 | // AUTO means that the type of the very first object returned from the 53 | // Python @map determines what type we are going to use. 
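  // Editorial note (illustrative sketch, not part of the original source): to see how
  // OutputMethod and OutputType work together, consider the split_words @udf_map from
  // examples/word_count.py earlier in this repository:
  //
  //     @udf_map(produces=['word'])
  //     def split_words(tuple):
  //         for word in tuple.get(1).split():
  //             yield [word]
  //
  // Calling it returns a PyGenerator, so under YIELDS_OR_RETURNS the wrapper settles on
  // YIELDS in collectOutput(); with OutputType.AUTO the first yielded record (a Python
  // list) selects PYTHON_LIST, and each list is then converted to a Cascading Tuple in
  // castPythonObject() below.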
54 | public enum OutputType { 55 | AUTO, PYTHON_LIST, TUPLE, TUPLEENTRY 56 | } 57 | 58 | protected OutputMethod outputMethod; 59 | protected OutputType outputType; 60 | 61 | public CascadingRecordProducerWrapper() { 62 | super(); 63 | } 64 | 65 | public CascadingRecordProducerWrapper(Fields fieldDeclaration) { 66 | super(fieldDeclaration); 67 | } 68 | 69 | public CascadingRecordProducerWrapper(int numArgs) { 70 | super(numArgs); 71 | } 72 | 73 | public CascadingRecordProducerWrapper(int numArgs, Fields fieldDeclaration) { 74 | super(numArgs, fieldDeclaration); 75 | } 76 | 77 | public int getNumParameters() { 78 | return (outputMethod == OutputMethod.COLLECTS ? 2 : 1); 79 | } 80 | 81 | /** 82 | * Cast the returned or yielded array to a Tuple, and add it to the output 83 | * collector. 84 | * 85 | * @param ret 86 | * the object (list) returned from the Python function 87 | * @param outputCollector 88 | * the output collector in which we place the Tuple 89 | * @param simpleCastIfTuple 90 | * if we can simply cast ret to a Tuple, or have to call Jython's 91 | * casting 92 | */ 93 | private void castPythonObject(Object ret, TupleEntryCollector outputCollector, 94 | boolean simpleCastIfTuple) { 95 | if (outputType == OutputType.AUTO) { 96 | // We need to determine the type of the record now 97 | if (PySequenceList.class.isInstance(ret)) 98 | outputType = OutputType.PYTHON_LIST; 99 | else if (Tuple.class.isInstance(ret)) 100 | outputType = OutputType.TUPLE; 101 | else if (TupleEntry.class.isInstance(ret)) 102 | outputType = OutputType.TUPLEENTRY; 103 | else 104 | throw new RuntimeException( 105 | "Python function must return a list, Tuple, or TupleEnty. We got: " 106 | + ret.getClass()); 107 | } 108 | if (outputType == OutputType.PYTHON_LIST) 109 | // Convert the returned Python list to a tuple 110 | // We can return both a Python (immutable) tuple and a list, so we 111 | // need to use their common superclass, PySequenceList. 112 | try { 113 | outputCollector.add(new Tuple(((PySequenceList) ret).toArray())); 114 | } catch (ClassCastException e) { 115 | throw new RuntimeException( 116 | "Python function or generator must return a Python list, we got " + ret.getClass() 117 | + " instead"); 118 | } 119 | else if (outputType == OutputType.TUPLE) { 120 | try { 121 | // For some reason yield doesn't wrap the object in a Jython 122 | // container, but return does 123 | if (simpleCastIfTuple) 124 | outputCollector.add((Tuple) ret); 125 | else 126 | outputCollector.add((Tuple) ((PyObject) ret).__tojava__(Tuple.class)); 127 | } catch (ClassCastException e) { 128 | throw new RuntimeException( 129 | "Python function or generator must return a Cascading Tuple, we got " 130 | + ret.getClass() + " instead"); 131 | } 132 | } else { 133 | try { 134 | outputCollector.add((TupleEntry) ((PyObject) ret).__tojava__(TupleEntry.class)); 135 | } catch (ClassCastException e) { 136 | throw new RuntimeException( 137 | "Python function or generator must return a Cascading TupleEntry, we got " 138 | + ret.getClass() + " instead"); 139 | } 140 | } 141 | } 142 | 143 | protected void collectOutput(TupleEntryCollector outputCollector, Object ret) { 144 | if (ret == null) 145 | return; 146 | if (outputMethod == OutputMethod.YIELDS_OR_RETURNS) { 147 | // Determine automatically whether the function yields or returns 148 | outputMethod = (PyGenerator.class.isInstance(ret) ? 
OutputMethod.YIELDS 149 | : OutputMethod.RETURNS); 150 | } 151 | if (outputMethod == OutputMethod.RETURNS) { 152 | // We're simply returning records 153 | // We can return None to produce no output 154 | if (PyNone.class.isInstance(ret)) 155 | return; 156 | castPythonObject(ret, outputCollector, false); 157 | } else { 158 | // We have a Python generator that yields records 159 | for (Object record : (PyGenerator) ret) { 160 | if (record != null) { 161 | castPythonObject(record, outputCollector, true); 162 | } 163 | } 164 | } 165 | } 166 | 167 | public void setOutputMethod(OutputMethod outputMethod) { 168 | this.outputMethod = outputMethod; 169 | } 170 | 171 | public void setOutputType(OutputType outputType) { 172 | this.outputType = outputType; 173 | } 174 | } 175 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/Main.java: -------------------------------------------------------------------------------- 1 | package com.twitter.pycascading; 2 | 3 | import java.util.Properties; 4 | 5 | import org.python.util.PythonInterpreter; 6 | 7 | public class Main { 8 | 9 | private static PythonInterpreter interpreter = null; 10 | 11 | /** 12 | * This is the main method that gets passed to Hadoop, or executed in local 13 | * mode. 14 | * 15 | * @param args 16 | * the command line arguments 17 | * @throws Exception 18 | */ 19 | public static void main(String[] args) throws Exception { 20 | Properties sysProps = System.getProperties(); 21 | Properties props = new Properties(); 22 | props.put("python.cachedir", sysProps.get("user.home") + "/.jython-cache"); 23 | props.put("python.cachedir.skip", "0"); 24 | PythonInterpreter.initialize(System.getProperties(), props, args); 25 | getInterpreter().execfile(args[0]); 26 | } 27 | 28 | /** 29 | * Create and return the Python interpreter (singleton per JVM). 30 | * 31 | * @return the Python interpreter 32 | */ 33 | public static PythonInterpreter getInterpreter() { 34 | if (interpreter == null) { 35 | interpreter = new PythonInterpreter(); 36 | } 37 | return interpreter; 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/MetaScheme.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2011 Twitter, Inc. 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 
14 | */ 15 | package com.twitter.pycascading; 16 | 17 | import java.io.IOException; 18 | import java.io.ObjectInputStream; 19 | import java.io.ObjectOutputStream; 20 | 21 | import org.apache.hadoop.conf.Configuration; 22 | import org.apache.hadoop.fs.FSDataInputStream; 23 | import org.apache.hadoop.fs.FSDataOutputStream; 24 | import org.apache.hadoop.fs.FileSystem; 25 | import org.apache.hadoop.fs.Path; 26 | import org.apache.hadoop.mapred.JobConf; 27 | import org.apache.hadoop.mapred.OutputCollector; 28 | 29 | import cascading.scheme.Scheme; 30 | import cascading.tap.Tap; 31 | import cascading.tuple.Fields; 32 | import cascading.tuple.Tuple; 33 | import cascading.tuple.TupleEntry; 34 | 35 | /** 36 | * A Cascading Scheme that stores header information for an output dataset. It 37 | * records all formatting information so that later on the tuple field names and 38 | * types can be reloaded without having to specify them explicitly. 39 | * 40 | * It also stores the original scheme object so that at load time we don't have 41 | * to worry about that either. 42 | * 43 | * @author Gabor Szabo 44 | */ 45 | public class MetaScheme extends Scheme { 46 | private static final long serialVersionUID = 8194175541999063797L; 47 | 48 | private static final String schemeFileName = ".pycascading_scheme"; 49 | private static final String headerFileName = ".pycascading_header"; 50 | private static final String typeFileName = ".pycascading_types"; 51 | 52 | private Scheme scheme; 53 | private String outputPath; 54 | private boolean firstLine = true; 55 | private boolean typeFileToWrite = true; 56 | 57 | /** 58 | * Call this to get the original Cascading scheme that the data was written 59 | * in. 60 | * 61 | * @param inputPath 62 | * The path to where the scheme information was stored (normally the 63 | * same as the path to the data) 64 | * @return The Cascading scheme that was used when the data was written. 65 | * @throws IOException 66 | */ 67 | public static Scheme getSourceScheme(String inputPath) throws IOException { 68 | Path path = new Path(inputPath + "/" + schemeFileName); 69 | FileSystem fs = path.getFileSystem(new Configuration()); 70 | try { 71 | FSDataInputStream file = fs.open(path); 72 | ObjectInputStream ois = new ObjectInputStream(file); 73 | Scheme scheme = (Scheme) ois.readObject(); 74 | Fields fields = (Fields) ois.readObject(); 75 | scheme.setSourceFields(fields); 76 | ois.close(); 77 | file.close(); 78 | return scheme; 79 | } catch (ClassNotFoundException e) { 80 | throw new IOException("Could not read PyCascading file header: " + inputPath + "/" 81 | + schemeFileName); 82 | } 83 | } 84 | 85 | /** 86 | * Returns the scheme that will store field information and the scheme in 87 | * outputPath. Additionally, a file called .pycascading_header will be 88 | * generated, which stores the names of the fields in a TAB-delimited format. 
89 | * 90 | * @param scheme 91 | * The Cascading scheme to be used to store the data 92 | * @param outputPath 93 | * Path were the metainformation about the scheme and field names 94 | * should be stored 95 | * @return A scheme that can be used to sink the data into 96 | * @throws IOException 97 | */ 98 | public static Scheme getSinkScheme(Scheme scheme, String outputPath) throws IOException { 99 | return new MetaScheme(scheme, outputPath); 100 | } 101 | 102 | protected MetaScheme(Scheme scheme, String outputPath) throws IOException { 103 | this.scheme = scheme; 104 | this.outputPath = outputPath; 105 | } 106 | 107 | @Override 108 | public void sourceInit(Tap tap, JobConf conf) throws IOException { 109 | // We're returning the original storage scheme, so this should not be called 110 | // ever. 111 | } 112 | 113 | @Override 114 | public Tuple source(Object key, Object value) { 115 | // This should never be called. 116 | return null; 117 | } 118 | 119 | @Override 120 | public void sinkInit(Tap tap, JobConf conf) throws IOException { 121 | scheme.sinkInit(tap, conf); 122 | } 123 | 124 | @Override 125 | public void sink(TupleEntry tupleEntry, OutputCollector outputCollector) throws IOException { 126 | // TODO: do it so such that we don't need to specify /user/gabor if the path 127 | // doesn't start with / 128 | if (firstLine) { 129 | Path path = new Path(outputPath + "/" + headerFileName); 130 | FileSystem fs = path.getFileSystem(new Configuration()); 131 | try { 132 | // We're trying to create the file by just one of the mappers/reducers, 133 | // the one that can do it first 134 | if (fs.createNewFile(path)) { 135 | FSDataOutputStream stream = fs.create(path, true); 136 | boolean firstField = true; 137 | for (Comparable field : tupleEntry.getFields()) { 138 | if (firstField) 139 | firstField = false; 140 | else 141 | stream.writeBytes("\t"); 142 | stream.writeBytes(field.toString()); 143 | } 144 | stream.writeBytes("\n"); 145 | stream.close(); 146 | } 147 | } catch (IOException e) { 148 | } 149 | 150 | path = new Path(outputPath + "/" + schemeFileName); 151 | fs = path.getFileSystem(new Configuration()); 152 | try { 153 | if (fs.createNewFile(path)) { 154 | FSDataOutputStream stream = fs.create(path, true); 155 | ObjectOutputStream ostream = new ObjectOutputStream(stream); 156 | ostream.writeObject(scheme); 157 | ostream.writeObject(tupleEntry.getFields()); 158 | ostream.close(); 159 | stream.close(); 160 | } 161 | } catch (IOException e) { 162 | } 163 | 164 | firstLine = false; 165 | } 166 | 167 | if (typeFileToWrite) { 168 | Path path = new Path(outputPath + "/" + typeFileName); 169 | FileSystem fs = path.getFileSystem(new Configuration()); 170 | try { 171 | if (fs.createNewFile(path)) { 172 | FSDataOutputStream stream = fs.create(path, true); 173 | for (int i = 0; i < tupleEntry.size(); i++) { 174 | Comparable fieldName = null; 175 | if (tupleEntry.getFields().size() < tupleEntry.size()) { 176 | // We don't have names for the fields 177 | fieldName = ""; 178 | } else { 179 | fieldName = tupleEntry.getFields().get(i) + "\t"; 180 | } 181 | Object object = tupleEntry.getObject(i); 182 | Class objectClass = (object == null ? 
Object.class : object.getClass()); 183 | stream.writeBytes(fieldName + objectClass.getName() + "\n"); 184 | } 185 | stream.close(); 186 | } 187 | } catch (IOException e) { 188 | } 189 | typeFileToWrite = false; 190 | } 191 | scheme.sink(tupleEntry, outputCollector); 192 | } 193 | } 194 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/PythonEnvironment.java: -------------------------------------------------------------------------------- 1 | package com.twitter.pycascading; 2 | 3 | import org.python.util.PythonInterpreter; 4 | 5 | /** 6 | * This is the class that holds the Python environment running on a mapper or 7 | * reducer, including the Python interpreter. 8 | * 9 | * @author Gabor Szabo 10 | */ 11 | public class PythonEnvironment { 12 | private PythonInterpreter interpreter; 13 | 14 | /** 15 | * Start a new Jython interpreter if it's not started yet. 16 | * 17 | * @return the interpreter instance 18 | */ 19 | public PythonInterpreter getPythonInterpreter() { 20 | if (interpreter == null) 21 | interpreter = new PythonInterpreter(); 22 | return interpreter; 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/PythonObjectInputStream.java: -------------------------------------------------------------------------------- 1 | package com.twitter.pycascading; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.io.ObjectInputStream; 6 | 7 | import org.python.core.Py; 8 | import org.python.core.PyObject; 9 | import org.python.core.PyTuple; 10 | import org.python.util.PythonInterpreter; 11 | 12 | /** 13 | * When deserializing the job, this class reconstructs the Python functions 14 | * given by their name and/or source. 
15 | * 16 | * @author Gabor Szabo 17 | */ 18 | public class PythonObjectInputStream extends ObjectInputStream { 19 | 20 | private PythonInterpreter interpreter; 21 | 22 | public PythonObjectInputStream(InputStream in, PythonInterpreter interpreter) throws IOException { 23 | super(in); 24 | this.interpreter = interpreter; 25 | enableResolveObject(true); 26 | } 27 | 28 | @Override 29 | protected Object resolveObject(Object obj) throws IOException { 30 | // This method will reconstruct the PyFunction based on its name or its 31 | // source if it was a closure 32 | if (obj instanceof SerializedPythonFunction) { 33 | PyTuple serializedFunction = ((SerializedPythonFunction) obj).getSerializedFunction(); 34 | String functionType = (String) serializedFunction.get(0); 35 | String functionName = (String) serializedFunction.get(3); 36 | PyObject function = null; 37 | if ("global".equals(functionType)) { 38 | function = interpreter.get(functionName); 39 | } else if ("closure".equals(functionType)) { 40 | interpreter.exec((String) serializedFunction.get(4)); 41 | function = interpreter.get(functionName); 42 | } 43 | return function; 44 | } else 45 | return obj; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/PythonObjectOutputStream.java: -------------------------------------------------------------------------------- 1 | package com.twitter.pycascading; 2 | 3 | import java.io.IOException; 4 | import java.io.ObjectOutputStream; 5 | import java.io.OutputStream; 6 | 7 | import org.python.core.Py; 8 | import org.python.core.PyFunction; 9 | import org.python.core.PyNone; 10 | import org.python.core.PyObject; 11 | import org.python.core.PyTuple; 12 | 13 | /** 14 | * This class replaces every function object with a pointer to its name and/or 15 | * source, so that we can reconstruct the function when deserializing. We need 16 | * to do it this way as PyFunctions cannot be serialized (some nested Jython 17 | * objects don't implement Serializable). 18 | * 19 | * @author Gabor Szabo 20 | */ 21 | public class PythonObjectOutputStream extends ObjectOutputStream { 22 | 23 | private PyFunction callBack; 24 | 25 | public PythonObjectOutputStream(OutputStream out, PyFunction callBack) throws IOException { 26 | super(out); 27 | this.callBack = callBack; 28 | enableReplaceObject(true); 29 | } 30 | 31 | @Override 32 | protected Object replaceObject(Object obj) throws IOException { 33 | if (obj instanceof PyFunction) { 34 | PyObject replaced = callBack.__call__((PyObject) obj); 35 | if (!(replaced instanceof PyNone)) { 36 | return new SerializedPythonFunction((PyFunction) obj, (PyTuple) replaced); 37 | } 38 | } 39 | return obj; 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/SelectFields.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2011 Twitter, Inc. 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | package com.twitter.pycascading; 16 | 17 | import java.io.Serializable; 18 | 19 | import cascading.flow.FlowProcess; 20 | import cascading.operation.BaseOperation; 21 | import cascading.operation.Function; 22 | import cascading.operation.FunctionCall; 23 | import cascading.operation.OperationCall; 24 | import cascading.tuple.Fields; 25 | import cascading.tuple.Tuple; 26 | import cascading.tuple.TupleEntry; 27 | import cascading.tuple.TupleEntryCollector; 28 | 29 | /** 30 | * Simple Cascading function that keeps the specified fields only in the tuple 31 | * stream. 32 | * 33 | * @author Gabor Szabo 34 | */ 35 | public class SelectFields extends BaseOperation implements Function, Serializable { 36 | private static final long serialVersionUID = -6859909716154224842L; 37 | 38 | private Fields filteredFields; 39 | 40 | public SelectFields(Fields filteredFields) { 41 | super(filteredFields); 42 | this.filteredFields = filteredFields; 43 | } 44 | 45 | @Override 46 | public void prepare(FlowProcess flowProcess, OperationCall operationCall) { 47 | super.prepare(flowProcess, operationCall); 48 | } 49 | 50 | @Override 51 | public void operate(FlowProcess flowProcess, FunctionCall functionCall) { 52 | TupleEntry inputTuple = functionCall.getArguments(); 53 | TupleEntryCollector outputCollector = functionCall.getOutputCollector(); 54 | Tuple outputTuple = new Tuple(); 55 | 56 | for (Comparable field : filteredFields) { 57 | // We cannot use inputTuple.get(...) here, as that tries to convert 58 | // the field value to a Comparable. In case we have a complex Python 59 | // type as a field, that won't work. 60 | outputTuple.add(inputTuple.getObject(field)); 61 | } 62 | outputCollector.add(outputTuple); 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/SerializedPythonFunction.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2011 Twitter, Inc. 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | 16 | /* 17 | * This is a class the helps in serializing Jython functions. It seems that Jython 18 | * functions cannot be serialized, because on the remote end a Jython interpreter 19 | * has to also be invoked that can interpret the function. 20 | * 21 | * Thus when deserializing, we need to start a Jython interpreter and read the 22 | * source file where the function was defined in the first place. This also means 23 | * that we cannot use lambda functions as these cannot be referred to by name. 24 | * Referring to functions by name is important as it's the function's name and 25 | * source file that is sent through when serializing. 
26 | * 27 | * It only works with Jython >= 2.5.2 because of a previous bug with serializing 28 | * PyCode (http://bugs.jython.org/issue1601). 29 | * Still, I need to use a custom class loader, because there's a field in PyCode 30 | * whose class is called "org.python.pycode._pyx0", but no such class exists. 31 | * 32 | * When invoking a function, the globals are not restored for that function. Thus, 33 | * for instance, imports of Tuples etc. need to be done within the function. I tried 34 | * to serialize the globals together with func_code, but org.python.core.packagecache.SysPackageManager 35 | * in Jython is not serializable, and it apparently appears in the globals. I tried 36 | * to recompile Jython from source, but too many external libraries are missing. 37 | * 38 | * Unfortunately Cascading serializes Function objects, but Jython cannot 39 | * serialize PyFunctions due to bugs. Jython 2.5.2 can, however, serialize 40 | * func_codes, so we work around this by serializing those and keeping the globals 41 | * separately in a static variable. 42 | */ 43 | 44 | package com.twitter.pycascading; 45 | 46 | import java.io.IOException; 47 | import java.io.ObjectInputStream; 48 | import java.io.ObjectOutputStream; 49 | import java.io.Serializable; 50 | 51 | import org.python.core.PyFunction; 52 | import org.python.core.PyObject; 53 | import org.python.core.PyTuple; 54 | 55 | /** 56 | * Class that is primarily responsible for serializing and deserializing a 57 | * Jython function. It does this by storing the name of the function and 58 | * reloading the interpreter and source where the function was defined when it 59 | * becomes necessary to deserialize. 60 | * 61 | * @author Gabor Szabo 62 | */ 63 | public class SerializedPythonFunction implements Serializable { 64 | private static final long serialVersionUID = 4944819638591252128L; 65 | 66 | private PyObject pythonFunction; 67 | private PyTuple serializedFunction; 68 | 69 | /** 70 | * This constructor is necessary for the deserialization.
71 | */ 72 | public SerializedPythonFunction() { 73 | } 74 | 75 | public SerializedPythonFunction(PyFunction function, PyTuple serializedReturn) { 76 | serializedFunction = serializedReturn; 77 | pythonFunction = function; 78 | } 79 | 80 | private void writeObject(ObjectOutputStream stream) throws IOException { 81 | stream.writeObject(serializedFunction); 82 | } 83 | 84 | private void readObject(ObjectInputStream stream) throws IOException, ClassNotFoundException { 85 | serializedFunction = (PyTuple) stream.readObject(); 86 | } 87 | 88 | public PyObject getPythonFunction() { 89 | return pythonFunction; 90 | } 91 | 92 | public PyTuple getSerializedFunction() { 93 | return serializedFunction; 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/TemporaryHdfs.java: -------------------------------------------------------------------------------- 1 | package com.twitter.pycascading; 2 | 3 | import java.io.IOException; 4 | import java.util.Random; 5 | 6 | import org.apache.hadoop.conf.Configuration; 7 | import org.apache.hadoop.fs.FileSystem; 8 | import org.apache.hadoop.fs.Path; 9 | 10 | import cascading.flow.Flow; 11 | import cascading.flow.FlowListener; 12 | 13 | public class TemporaryHdfs implements FlowListener { 14 | private boolean tmpDirCreated = false; 15 | private String tmpDir; 16 | 17 | @Override 18 | public void onStarting(Flow flow) { 19 | } 20 | 21 | @Override 22 | public void onStopping(Flow flow) { 23 | removeTmpDir(); 24 | } 25 | 26 | @Override 27 | public void onCompleted(Flow flow) { 28 | removeTmpDir(); 29 | } 30 | 31 | @Override 32 | public boolean onThrowable(Flow flow, Throwable throwable) { 33 | removeTmpDir(); 34 | throwable.printStackTrace(); 35 | return false; 36 | } 37 | 38 | private String getRandomFileName() { 39 | String name = ""; 40 | Random rnd = new Random(); 41 | for (int i = 0; i < 6; i++) { 42 | name += (char) ((int) 'a' + rnd.nextInt((int) 'z' - (int) 'a')); 43 | } 44 | return name; 45 | } 46 | 47 | /** 48 | * Create a temporary folder on HDFS. The folder will be deleted after 49 | * execution or on an exception. 50 | * 51 | * @param conf 52 | * the jobconf 53 | * @throws IOException 54 | */ 55 | String createTmpFolder(Configuration conf) throws IOException { 56 | // Only fs.default.name and hadoop.tmp.dir are defined at the time of the 57 | // job initialization, we cannot use mapreduce.job.dir, mapred.working.dir, 58 | // or mapred.job.id 59 | // Possibly use Hfs.getTempDir later from Cascading. 60 | // In tmpDir, I cannot put a / in between the two variables, otherwise 61 | // Hadoop will fail to copy the archive to the temporary folder 62 | tmpDir = conf.get("fs.default.name") + conf.get("hadoop.tmp.dir"); 63 | tmpDir = tmpDir + "/" + "pycascading-" + getRandomFileName(); 64 | Path path = new Path(tmpDir); 65 | FileSystem fs = path.getFileSystem(new Configuration()); 66 | fs.mkdirs(path); 67 | tmpDirCreated = true; 68 | return tmpDir; 69 | } 70 | 71 | /** 72 | * Removes the temporary folder we created. 73 | */ 74 | private void removeTmpDir() { 75 | if (tmpDirCreated) { 76 | Path path = new Path(tmpDir); 77 | try { 78 | FileSystem fs = path.getFileSystem(new Configuration()); 79 | fs.delete(path, true); 80 | } catch (IOException e) { 81 | e.printStackTrace(); 82 | } 83 | } 84 | } 85 | 86 | private String getExtension(String path) { 87 | int i = path.lastIndexOf('.'); 88 | return (i >= 0 ? 
path.substring(i, path.length()) : ""); 89 | } 90 | 91 | /** 92 | * Copies a local file to HDFS, which is used as the distributed cache. The 93 | * distributed cache basically just takes this HDFS folder, and copies its 94 | * contents to the local disks for the mappers/reducers. Also, if the file is 95 | * a compressed archive, it will be extracted locally. We generate a random file 96 | * name for the destination, but keep the extension so that zip and tgz 97 | * archives are recognized. 98 | * 99 | * @param source 100 | * the path to the local file to be distributed 101 | * @return the path to the HDFS file 102 | * @throws IOException 103 | * if the copy was unsuccessful 104 | */ 105 | public String copyFromLocalFileToHDFS(String source) throws IOException { 106 | Path src = new Path(source); 107 | String destName = tmpDir + "/" + getRandomFileName() + getExtension(source); 108 | Path dest = new Path(destName); 109 | FileSystem fs = dest.getFileSystem(new Configuration()); 110 | fs.copyFromLocalFile(src, dest); 111 | return destName; 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/Util.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2011 Twitter, Inc. 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | package com.twitter.pycascading; 16 | 17 | import java.io.IOException; 18 | import java.net.URISyntaxException; 19 | import java.util.Map; 20 | import java.util.Properties; 21 | 22 | import org.apache.hadoop.conf.Configuration; 23 | 24 | import cascading.flow.Flow; 25 | import cascading.flow.FlowConnector; 26 | import cascading.flow.FlowListener; 27 | import cascading.pipe.Pipe; 28 | import cascading.tap.Tap; 29 | 30 | /** 31 | * Helper class that sets up the MR environment and runs a Cascading Flow. 32 | * 33 | * @author Gabor Szabo 34 | */ 35 | public class Util { 36 | // http://www.velocityreviews.com/forums/t147526-how-to-get-jar-file-name.html 37 | /** 38 | * Get the temporary folder where the job jar was extracted to by Hadoop. 39 | * 40 | * TODO: This only works if we distribute PyCascading as classes. If I switch 41 | * to using jars, I need to remove the last part of the path which is the jar 42 | * file. 43 | * 44 | * @return the temporary folder with the contents of the job jar 45 | */ 46 | public static String getJarFolder() { 47 | try { 48 | return Util.class.getProtectionDomain().getCodeSource().getLocation().toURI().getPath(); 49 | } catch (URISyntaxException e) { 50 | throw new RuntimeException("Could not get temporary job folder"); 51 | } 52 | } 53 | 54 | /** 55 | * Get the Cascading jar file on the local file system.
56 | * 57 | * @return the file location on the Hadoop worker for the Cascading jar 58 | */ 59 | public static String getCascadingJar() { 60 | try { 61 | return cascading.pipe.Pipe.class.getProtectionDomain().getCodeSource().getLocation().toURI() 62 | .getPath(); 63 | } catch (URISyntaxException e) { 64 | throw new RuntimeException("Could not get the location of the Cascading jar"); 65 | } 66 | } 67 | 68 | /** 69 | * We use the "pycascading.root" Java system property to store the location of 70 | * the Python sources for PyCascading. This is only used in local mode. This 71 | * is needed so that we know where to set the import path when we start up the 72 | * mappers and reducers. 73 | * 74 | * @param root 75 | * the location of the PyCascading sources on the local file system 76 | */ 77 | public static void setPycascadingRoot(String root) { 78 | System.setProperty("pycascading.root", root); 79 | } 80 | 81 | public static void run(int numReducers, Map config, Map sources, 82 | Map sinks, Pipe... tails) throws IOException, URISyntaxException { 83 | // String strClassPath = System.getProperty("java.class.path"); 84 | // System.out.println("Classpath is " + strClassPath); 85 | 86 | Properties properties = new Properties(); 87 | properties.put("mapred.reduce.tasks", numReducers); 88 | // Set this to change the default block size that is routed to one mapper 89 | // It won't help if the files are smaller than this as each file will go to 90 | // one mapper 91 | // properties.put("mapred.min.split.size", 20 * 1024 * 1024 * 1024L); 92 | // properties.put("mapred.map.tasks", 4000); 93 | // So that Thrift classes can be serialized 94 | // We need to add WritableSerialization otherwise sometimes Cascading and 95 | // Hadoop don't pick it up, and BigInteger serializations fail 96 | // See https://github.com/twitter/pycascading/issues/2 97 | // TODO: find the reason for this 98 | properties.put("io.serializations", 99 | "com.twitter.pycascading.bigintegerserialization.BigIntegerSerialization," 100 | + "org.apache.hadoop.io.serializer.WritableSerialization," 101 | + "com.twitter.pycascading.pythonserialization.PythonSerialization"); 102 | properties.put("mapred.jobtracker.completeuserjobs.maximum", 50000); 103 | properties.put("mapred.input.dir.recursive", "true"); 104 | 105 | // Set the running mode in the jobconf so that the mappers/reducers can 106 | // easily check this. 107 | String runningMode = (String) config.get("pycascading.running_mode"); 108 | properties.setProperty("pycascading.running_mode", runningMode); 109 | properties.setProperty("pycascading.main_file", (String) config.get("pycascading.main_file")); 110 | 111 | Configuration conf = new Configuration(); 112 | TemporaryHdfs tempDir = null; 113 | if ("hadoop".equals(runningMode)) { 114 | tempDir = new TemporaryHdfs(); 115 | // We put the files to be distributed into the distributed cache 116 | // The pycascading.distributed_cache.archives variable was set by 117 | // bootstrap.py, based on the command line parameters where we specified 118 | // the PyCascading & source archives 119 | Object archives = config.get("pycascading.distributed_cache.archives"); 120 | if (archives != null) { 121 | tempDir = new TemporaryHdfs(); 122 | String tempDirLocation = tempDir.createTmpFolder(conf); 123 | String dests = null; 124 | for (String archive : (Iterable) archives) { 125 | String dest = tempDir.copyFromLocalFileToHDFS(archive); 126 | dests = (dests == null ? 
dest : dests + "," + dest); 127 | } 128 | // Set the distributed cache to the files we just copied to HDFS 129 | // 130 | // This is an ugly hack, we should use DistributedCache. 131 | // DistributedCache however operates on a JobConf, and since 132 | // Cascading expects a Map, we cannot directly pass 133 | // in the parameters set into a JobConf. 134 | // TODO: see if a later version of Cascading can update its properties 135 | // using a JobConf 136 | properties.setProperty("mapred.cache.archives", dests); 137 | // This creates a symlink for each of the mappers/reducers to the 138 | // localized files, instead of copying them for each one. This way we 139 | // reduce the overhead for copying on one worker machine. 140 | // TODO: see the one just above 141 | properties.setProperty("mapred.create.symlink", "yes"); 142 | } 143 | } 144 | 145 | FlowConnector.setApplicationJarClass(properties, Main.class); 146 | FlowConnector flowConnector = new FlowConnector(properties); 147 | Flow flow = flowConnector.connect(sources, sinks, tails); 148 | if ("hadoop".equals(runningMode)) { 149 | try { 150 | flow.addListener(tempDir); 151 | } catch (Exception e) { 152 | e.printStackTrace(); 153 | } 154 | } else { 155 | try { 156 | flow.addListener(new FlowListener() { 157 | 158 | @Override 159 | public void onStarting(Flow flow) { 160 | } 161 | 162 | @Override 163 | public void onStopping(Flow flow) { 164 | } 165 | 166 | @Override 167 | public void onCompleted(Flow flow) { 168 | } 169 | 170 | @Override 171 | public boolean onThrowable(Flow flow, Throwable throwable) { 172 | throwable.printStackTrace(); 173 | return false; 174 | } 175 | }); 176 | } catch (Exception e) { 177 | e.printStackTrace(); 178 | } 179 | } 180 | flow.complete(); 181 | } 182 | } 183 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/bigintegerserialization/BigIntegerComparator.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2011 Twitter, Inc. 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | package com.twitter.pycascading.bigintegerserialization; 16 | 17 | import java.io.DataInputStream; 18 | import java.io.IOException; 19 | import java.io.Serializable; 20 | import java.math.BigInteger; 21 | import java.util.Comparator; 22 | 23 | import org.apache.hadoop.io.WritableUtils; 24 | 25 | import cascading.tuple.StreamComparator; 26 | import cascading.tuple.hadoop.BufferedInputStream; 27 | 28 | /** 29 | * Cascading in-stream comparator for Java BigIntegers. 
30 | * 31 | * @author Gabor Szabo 32 | */ 33 | public class BigIntegerComparator implements StreamComparator, 34 | Comparator, Serializable { 35 | private static final long serialVersionUID = 3846289449409826723L; 36 | 37 | public BigIntegerComparator(Class type) { 38 | } 39 | 40 | public int compare(BufferedInputStream lhsStream, BufferedInputStream rhsStream) { 41 | try { 42 | DataInputStream inLeft = new DataInputStream(lhsStream); 43 | DataInputStream inRight = new DataInputStream(rhsStream); 44 | 45 | long lhs = WritableUtils.readVLong(inLeft); 46 | long rhs = WritableUtils.readVLong(inRight); 47 | 48 | if (lhs < rhs) 49 | return -1; 50 | else if (lhs > rhs) 51 | return 1; 52 | else 53 | return 0; 54 | } catch (IOException ioe) { 55 | throw new RuntimeException(ioe); 56 | } 57 | } 58 | 59 | public int compare(BigInteger o1, BigInteger o2) { 60 | return o1.compareTo(o2); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/bigintegerserialization/BigIntegerDeserializer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2011 Twitter, Inc. 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | package com.twitter.pycascading.bigintegerserialization; 16 | 17 | import java.io.DataInputStream; 18 | import java.io.IOException; 19 | import java.io.InputStream; 20 | import java.math.BigInteger; 21 | 22 | import org.apache.hadoop.io.WritableUtils; 23 | import org.apache.hadoop.io.serializer.Deserializer; 24 | 25 | /** 26 | * Hadoop Deserializer for Java BigIntegers. 27 | * 28 | * @author Gabor Szabo 29 | */ 30 | public class BigIntegerDeserializer implements Deserializer { 31 | private DataInputStream in; 32 | 33 | public BigIntegerDeserializer(Class c) { 34 | } 35 | 36 | public void open(InputStream inStream) throws IOException { 37 | in = new DataInputStream(inStream); 38 | } 39 | 40 | public BigInteger deserialize(BigInteger i) throws IOException { 41 | return BigInteger.valueOf(WritableUtils.readVLong(in)); 42 | } 43 | 44 | public void close() throws IOException { 45 | if (in != null) { 46 | in.close(); 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/bigintegerserialization/BigIntegerSerialization.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2011 Twitter, Inc. 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | package com.twitter.pycascading.bigintegerserialization; 16 | 17 | import cascading.tuple.Comparison; 18 | 19 | import java.math.BigInteger; 20 | import java.util.Comparator; 21 | import org.apache.hadoop.io.serializer.Deserializer; 22 | import org.apache.hadoop.io.serializer.Serialization; 23 | import org.apache.hadoop.io.serializer.Serializer; 24 | 25 | /** 26 | * Hadoop Serialization class for Java BigIntegers. 27 | * 28 | * @author Gabor Szabo 29 | */ 30 | public class BigIntegerSerialization implements Serialization, Comparison { 31 | 32 | public boolean accept(Class c) { 33 | boolean ret = BigInteger.class.isAssignableFrom(c); 34 | return ret; 35 | } 36 | 37 | public Deserializer getDeserializer(Class c) { 38 | return new BigIntegerDeserializer(c); 39 | } 40 | 41 | public Serializer getSerializer(Class c) { 42 | return new BigIntegerSerializer(); 43 | } 44 | 45 | public Comparator getComparator(Class type) { 46 | return new BigIntegerComparator(type); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/bigintegerserialization/BigIntegerSerializer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2011 Twitter, Inc. 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | package com.twitter.pycascading.bigintegerserialization; 16 | 17 | import java.io.DataOutputStream; 18 | import java.io.IOException; 19 | import java.io.OutputStream; 20 | import java.math.BigInteger; 21 | 22 | import org.apache.hadoop.io.WritableUtils; 23 | import org.apache.hadoop.io.serializer.Serializer; 24 | 25 | /** 26 | * Hadoop Serializer for Java BigIntegers. 27 | * 28 | * @author Gabor Szabo 29 | */ 30 | public class BigIntegerSerializer implements Serializer { 31 | private DataOutputStream out; 32 | 33 | public void open(OutputStream outStream) throws IOException { 34 | out = new DataOutputStream(outStream); 35 | } 36 | 37 | public void serialize(BigInteger i) throws IOException { 38 | WritableUtils.writeVLong(out, i.longValue()); 39 | } 40 | 41 | public void close() throws IOException { 42 | if (out != null) { 43 | out.close(); 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/pythonserialization/PythonDeserializer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2011 Twitter, Inc. 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 
5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | package com.twitter.pycascading.pythonserialization; 16 | 17 | import java.io.DataInputStream; 18 | import java.io.IOException; 19 | import java.io.InputStream; 20 | import java.io.ObjectInputStream; 21 | 22 | import org.apache.hadoop.io.serializer.Deserializer; 23 | import org.python.core.PyObject; 24 | 25 | /** 26 | * Hadoop Deserializer for Python objects. It works, but it's slow so do not use 27 | * in serious production. 28 | * 29 | * @author Gabor Szabo 30 | */ 31 | public class PythonDeserializer implements Deserializer { 32 | private DataInputStream inStream; 33 | 34 | public PythonDeserializer(Class c) { 35 | } 36 | 37 | public void open(InputStream inStream) throws IOException { 38 | if (inStream instanceof DataInputStream) 39 | this.inStream = (DataInputStream) inStream; 40 | else 41 | this.inStream = new DataInputStream(inStream); 42 | } 43 | 44 | public PyObject deserialize(PyObject i) throws IOException { 45 | try { 46 | ObjectInputStream in = new ObjectInputStream(inStream); 47 | PyObject ret = (PyObject) in.readObject(); 48 | return ret; 49 | } catch (ClassNotFoundException e) { 50 | throw new IOException("Jython class not found"); 51 | } 52 | } 53 | 54 | public void close() throws IOException { 55 | if (inStream != null) { 56 | inStream.close(); 57 | inStream = null; 58 | } 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/pythonserialization/PythonSerialization.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2011 Twitter, Inc. 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | package com.twitter.pycascading.pythonserialization; 16 | 17 | import org.apache.hadoop.io.serializer.Deserializer; 18 | import org.apache.hadoop.io.serializer.Serialization; 19 | import org.apache.hadoop.io.serializer.Serializer; 20 | import org.python.core.PyObject; 21 | 22 | /** 23 | * Hadoop Serialization class for Python objects. 
24 | * 25 | * @author Gabor Szabo 26 | */ 27 | public class PythonSerialization implements Serialization { 28 | 29 | public boolean accept(Class c) { 30 | boolean ret = PyObject.class.isAssignableFrom(c); 31 | return ret; 32 | } 33 | 34 | public Deserializer getDeserializer(Class c) { 35 | return new PythonDeserializer(c); 36 | } 37 | 38 | public Serializer getSerializer(Class c) { 39 | return new PythonSerializer(); 40 | } 41 | 42 | } 43 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/pythonserialization/PythonSerializer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2011 Twitter, Inc. 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | package com.twitter.pycascading.pythonserialization; 16 | 17 | import java.io.DataOutputStream; 18 | import java.io.IOException; 19 | import java.io.ObjectOutputStream; 20 | import java.io.OutputStream; 21 | 22 | import org.apache.hadoop.io.serializer.Serializer; 23 | import org.python.core.PyObject; 24 | 25 | /** 26 | * Hadoop Serializer for Python objects. 27 | * 28 | * This is suboptimal, slow, and produces bloated streams, and should not be 29 | * used in production. In other words it just demonstrates the use of serialized 30 | * Python objects. 31 | * 32 | * @author Gabor Szabo 33 | */ 34 | public class PythonSerializer implements Serializer { 35 | private DataOutputStream outStream; 36 | 37 | public void open(OutputStream outStream) throws IOException { 38 | if (outStream instanceof DataOutputStream) 39 | this.outStream = (DataOutputStream) outStream; 40 | else 41 | this.outStream = new DataOutputStream(outStream); 42 | } 43 | 44 | public void serialize(PyObject i) throws IOException { 45 | // We have to create an ObjectOutputStream here. If we do it in open(...), 46 | // the following exception will be thrown on the reducers from 47 | // PythonDeserializer with large jobs: 48 | // java.io.StreamCorruptedException: invalid stream header: 7371007E 49 | // TODO: check if a flush wouldn't be enough 50 | ObjectOutputStream out = new ObjectOutputStream(outStream); 51 | out.writeObject(i); 52 | // We need to flush the stream, otherwise we get corrupted object stream 53 | // header exceptions as above. 54 | // Also do not use close(), as that would close result in 55 | // java.io.IOException: write beyond end of stream exceptions on spilled 56 | // cogroups. 57 | out.flush(); 58 | } 59 | 60 | public void close() throws IOException { 61 | if (outStream != null) { 62 | outStream.close(); 63 | outStream = null; 64 | } 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /local_run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Copyright 2011 Twitter, Inc. 
5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # 19 | # Runs the PyCascading locally without Hadoop 20 | # 21 | 22 | 23 | usage() 24 | { 25 | cat < [parameters] 29 | 30 | Runs the PyCascading script locally, without a Hadoop cluster. 31 | 32 | Options: 33 | -h Show this message 34 | -j Additional jar files and Python import folders to be added 35 | to the classpath. cp is a list of file and folder locations 36 | separated by ":"s 37 | 38 | EOF 39 | } 40 | 41 | 42 | while getopts ":hj:" OPTION; do 43 | case $OPTION in 44 | h) usage 45 | exit 1 46 | ;; 47 | j) additional_jars="$OPTARG" 48 | ;; 49 | esac 50 | done 51 | shift $((OPTIND-1)) 52 | 53 | main_file="$1" 54 | if [ "$main_file" == "" ]; then 55 | usage 56 | exit 1 57 | fi 58 | 59 | home_dir=$(dirname "$0") 60 | source "$home_dir/java/dependencies.properties" 61 | 62 | classpath="$home_dir/build/classes" 63 | 64 | function add2classpath 65 | { 66 | for lib in $1; do 67 | for file in $(ls $2/$lib); do 68 | classpath="$classpath:$file" 69 | done 70 | done 71 | } 72 | 73 | # Jython jars 74 | jython_libs='jython.jar' 75 | add2classpath "$jython_libs" "$jython" 76 | 77 | # Cascading jars 78 | cascading_libs='cascading-[0-9].*.jar lib/jgrapht-*.jar' 79 | add2classpath "$cascading_libs" "$cascading" 80 | 81 | # Hadoop jars 82 | hadoop_libs='hadoop-*core*.jar lib/*.jar' 83 | add2classpath "$hadoop_libs" "$hadoop" 84 | 85 | if [ "$additional_jars" != "" ]; then 86 | classpath="$classpath:$additional_jars" 87 | fi 88 | 89 | # sys.path will be initialized from JYTHONPATH 90 | JYTHONPATH="$home_dir/python" java -classpath "$classpath" \ 91 | com.twitter.pycascading.Main "$home_dir/python/pycascading/bootstrap.py" \ 92 | local "$home_dir" "$@" 93 | -------------------------------------------------------------------------------- /python/pycascading/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """ 17 | PyCascading is a Python frontend to build and execute MapReduce flows 18 | in Cascading. 19 | """ 20 | -------------------------------------------------------------------------------- /python/pycascading/bootstrap.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 
3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Bootstrap the PyCascading script. 17 | 18 | This is the main Python module that gets executed by Hadoop or in local mode. 19 | The first command line argument is either 'local' or 'hadoop'. This determines 20 | whether we're running the script in local mode or with Hadoop. For Hadoop we 21 | need to pack the sources into a jar, which are extracted later to a temporary 22 | directory, so we need to set up the search paths differently in this case. 23 | """ 24 | 25 | __author__ = 'Gabor Szabo' 26 | 27 | 28 | import sys, imp 29 | 30 | 31 | if __name__ == "__main__": 32 | # The first command line parameter must be 'hadoop' or 'local' 33 | # to indicate the running mode 34 | running_mode = sys.argv[1] 35 | 36 | # The second is the location of the PyCascading Python sources in local 37 | # mode, and the PyCascading tarball in Hadoop mode 38 | python_dir = sys.argv[2] 39 | 40 | # Remove the first two arguments so that sys.argv will look like as 41 | # if it was coming from a simple command line execution 42 | # The further parameters are the command line parameters to the script 43 | sys.argv = sys.argv[3:] 44 | 45 | from com.twitter.pycascading import Util 46 | 47 | cascading_jar = Util.getCascadingJar() 48 | # This is the folder where Hadoop extracted the jar file for execution 49 | tmp_dir = Util.getJarFolder() 50 | 51 | Util.setPycascadingRoot(python_dir) 52 | 53 | # The initial value of sys.path is JYTHONPATH plus whatever Jython appends 54 | # to it (normally the Python standard libraries the come with Jython) 55 | sys.path.extend((cascading_jar, '.', tmp_dir, python_dir + '/python', 56 | python_dir + '/python/Lib')) 57 | 58 | # Allow the importing of user-installed Jython packages 59 | import site 60 | site.addsitedir(python_dir + 'python/Lib/site-packages') 61 | 62 | import os 63 | import encodings 64 | import pycascading.pipe, getopt 65 | 66 | # This holds some global configuration parameters 67 | pycascading.pipe.config = dict() 68 | 69 | opts, args = getopt.getopt(sys.argv, 'a:') 70 | pycascading.pipe.config['pycascading.distributed_cache.archives'] = [] 71 | for opt in opts: 72 | if opt[0] == '-a': 73 | pycascading.pipe.config['pycascading.distributed_cache.archives'] \ 74 | .append(opt[1]) 75 | 76 | # This is going to be seen by main() 77 | sys.argv = args 78 | 79 | # It's necessary to put this import here, otherwise simplejson won't work. 80 | # Maybe it's automatically imported in the beginning of a Jython program, 81 | # but since at that point the sys.path is not set yet to Lib, it will fail? 82 | # Instead, we can use Java's JSON decoder... 
83 | # import encodings 84 | 85 | # pycascading.pipe.config is a dict with configuration parameters 86 | pycascading.pipe.config['pycascading.running_mode'] = running_mode 87 | pycascading.pipe.config['pycascading.main_file'] = args[0] 88 | 89 | # Import and run the user's script 90 | _main_module_ = imp.load_source('__main__', \ 91 | pycascading.pipe.config['pycascading.main_file']) 92 | _main_module_.main() 93 | -------------------------------------------------------------------------------- /python/pycascading/cogroup.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Operations related to a CoGroup pipe.""" 17 | 18 | __author__ = 'Gabor Szabo' 19 | 20 | 21 | import cascading.pipe 22 | import cascading.pipe.cogroup 23 | import cascading.operation 24 | 25 | from pycascading.pipe import Operation, coerce_to_fields, _Stackable 26 | 27 | 28 | class CoGroup(Operation): 29 | 30 | """CoGroup two or more streams on common fields. 31 | 32 | This is a PyCascading wrapper around a Cascading CoGroup. 33 | """ 34 | 35 | def __init__(self, *args, **kwargs): 36 | """Create a Cascading CoGroup pipe. 
37 | 38 | Arguments: 39 | args[0] -- the fields on which to join 40 | 41 | Keyword arguments: 42 | group_name -- the groupName parameter for Cascading 43 | group_fields -- the fields on which to group 44 | declared_fields -- the declaredFields parameter for Cascading 45 | result_group_fields -- the resultGroupFields parameter for Cascading 46 | joiner -- the joiner parameter for Cascading 47 | num_self_joins -- the numSelfJoins parameter for Cascading 48 | lhs -- the lhs parameter for Cascading 49 | lhs_group_fields -- the lhsGroupFields parameter for Cascading 50 | rhs -- the rhs parameter for Cascading 51 | rhs_group_fields -- the rhsGroupFields parameter for Cascading 52 | """ 53 | Operation.__init__(self) 54 | self.__args = args 55 | self.__kwargs = kwargs 56 | 57 | def __create_args(self, 58 | group_name=None, 59 | pipes=None, group_fields=None, declared_fields=None, 60 | result_group_fields=None, joiner=None, 61 | pipe=None, num_self_joins=None, 62 | lhs=None, lhs_group_fields=None, 63 | rhs=None, rhs_group_fields=None): 64 | # We can use an unnamed parameter only for group_fields 65 | if self.__args: 66 | group_fields = [coerce_to_fields(f) for f in self.__args[0]] 67 | args = [] 68 | if group_name: 69 | args.append(str(group_name)) 70 | if lhs: 71 | args.append(lhs.get_assembly()) 72 | args.append(coerce_to_fields(lhs_group_fields)) 73 | args.append(rhs.get_assembly()) 74 | args.append(coerce_to_fields(rhs_group_fields)) 75 | if declared_fields: 76 | args.append(coerce_to_fields(declared_fields)) 77 | if result_group_fields: 78 | args.append(coerce_to_fields(result_group_fields)) 79 | if joiner: 80 | args.append(joiner) 81 | elif pipes: 82 | args.append([p.get_assembly() for p in pipes]) 83 | if group_fields: 84 | args.append([coerce_to_fields(f) for f in group_fields]) 85 | if declared_fields: 86 | args.append(coerce_to_fields(declared_fields)) 87 | if result_group_fields: 88 | args.append(coerce_to_fields(result_group_fields)) 89 | else: 90 | args.append(None) 91 | if joiner is None: 92 | joiner = cascading.pipe.cogroup.InnerJoin() 93 | args.append(joiner) 94 | elif pipe: 95 | args.append(pipe.get_assembly()) 96 | args.append(coerce_to_fields(group_fields)) 97 | args.append(int(num_self_joins)) 98 | if declared_fields: 99 | args.append(coerce_to_fields(declared_fields)) 100 | if result_group_fields: 101 | args.append(coerce_to_fields(result_group_fields)) 102 | if joiner: 103 | args.append(joiner) 104 | return args 105 | 106 | def _create_with_parent(self, parent): 107 | if isinstance(parent, _Stackable): 108 | args = self.__create_args(pipes=parent.stack, **self.__kwargs) 109 | else: 110 | args = self.__create_args(pipe=parent, **self.__kwargs) 111 | return cascading.pipe.CoGroup(*args) 112 | 113 | 114 | def inner_join(*args, **kwargs): 115 | """Shortcut for an inner join.""" 116 | kwargs['joiner'] = cascading.pipe.cogroup.InnerJoin() 117 | if not 'declared_fields' in kwargs: 118 | kwargs['declared_fields'] = None 119 | return CoGroup(*args, **kwargs) 120 | 121 | 122 | def outer_join(*args, **kwargs): 123 | """Shortcut for an outer join.""" 124 | kwargs['joiner'] = cascading.pipe.cogroup.OuterJoin() 125 | if not 'declared_fields' in kwargs: 126 | kwargs['declared_fields'] = None 127 | return CoGroup(*args, **kwargs) 128 | 129 | 130 | def left_outer_join(*args, **kwargs): 131 | """Shortcut for a left outer join.""" 132 | # The documentation says a Cascading RightJoin is a right inner join, but 133 | # that's not true, it's really an outer join as it should be. 
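    # A minimal usage sketch (field names are illustrative), assuming 'lhs'
    # and 'rhs' are PyCascading pipes that each carry an 'id' field:
    #   joined = (lhs & rhs) | left_outer_join(
    #       ['id', 'id'],
    #       declared_fields=['id', 'lhs_value', 'rhs_id', 'rhs_value'])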
134 | kwargs['joiner'] = cascading.pipe.cogroup.LeftJoin() 135 | if not 'declared_fields' in kwargs: 136 | kwargs['declared_fields'] = None 137 | return CoGroup(*args, **kwargs) 138 | 139 | 140 | def right_outer_join(*args, **kwargs): 141 | """Shortcut for a right outer join.""" 142 | kwargs['joiner'] = cascading.pipe.cogroup.RightJoin() 143 | if not 'declared_fields' in kwargs: 144 | kwargs['declared_fields'] = None 145 | return CoGroup(*args, **kwargs) 146 | -------------------------------------------------------------------------------- /python/pycascading/decorators.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """ 17 | PyCascading function decorators to be used with user-defined functions. 18 | 19 | A user-defined function is a function that gets applied as a filter or an 20 | Each function for each tuple, or the reduce-side function for tuples in a 21 | grouping in an Every Cascading operation. 22 | 23 | UDFs can emit a new set of tuples (as in a Function after an Each operation), 24 | keep or filter out tuples (a Filter after an Each), or emit aggregate values 25 | (an Aggregator or Buffer for a group after an Every). 26 | 27 | We use globally or locally scoped Python functions to perform these 28 | user-defined operations. When building the data processing pipeline, we can 29 | simply stream data through a Python function with PyCascading if it was 30 | decorated by one of the decorators. 31 | 32 | * A udf_'map' function is executed for each input tuple, and returns no, one, or 33 | several new output tuples. 34 | 35 | * A 'udf_filter' is a boolean-valued function, which should return true if the 36 | input tuple should be kept for the output, and false if not. 37 | 38 | * A 'udf_buffer' is a function that is applied to groups of tuples, and is the 39 | equivalent of a Cascading Buffer. It returns an aggregate after iterating 40 | through the tuples in the group. 41 | 42 | Exports the following: 43 | udf 44 | yields 45 | numargs_expected 46 | python_list_expected 47 | python_dict_expected 48 | collects_output 49 | produces_python_list 50 | produces_tuples 51 | udf_filter 52 | udf_map 53 | udf_buffer 54 | """ 55 | 56 | __author__ = 'Gabor Szabo' 57 | 58 | import inspect 59 | 60 | from pycascading.pipe import DecoratedFunction 61 | from com.twitter.pycascading import CascadingBaseOperationWrapper 62 | from com.twitter.pycascading import CascadingRecordProducerWrapper 63 | 64 | 65 | def _function_decorator(args, kwargs, defaults={}): 66 | """ 67 | A decorator to recursively decorate a function with arbitrary attributes. 
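
    For example (a sketch), @udf(produces=['word']) wraps the target function
    in a DecoratedFunction and records {'produces': ['word']} among its
    decorators; PyCascading reads these attributes later when it builds the
    Cascading operation. Stacking several of these decorators simply merges
    their attribute dictionaries.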
68 | """ 69 | 70 | def fun_decorator(function_or_callabledict): 71 | if isinstance(function_or_callabledict, DecoratedFunction): 72 | # Another decorator is next 73 | dff = function_or_callabledict 74 | else: 75 | # The original function comes next 76 | dff = DecoratedFunction.decorate_function(function_or_callabledict) 77 | # Add the attributes to the decorated function 78 | dff.decorators.update(additional_parameters) 79 | return dff 80 | 81 | additional_parameters = dict(defaults) 82 | additional_parameters.update(kwargs) 83 | if len(args) == 1 and not kwargs and (inspect.isroutine(args[0]) or isinstance(args[0], DecoratedFunction)): 84 | # We used the decorator without ()s, the first argument is the 85 | # function. We cannot use additional parameters in this case. 86 | return fun_decorator(args[0]) 87 | else: 88 | return fun_decorator 89 | 90 | 91 | def udf(*args, **kwargs): 92 | """The function can receive tuples or groups of tuples from Cascading. 93 | 94 | This is the decorator to use when we have a function that we want to use 95 | in a Cascading job after an Each or Every. 96 | """ 97 | return _function_decorator(args, kwargs) 98 | 99 | 100 | def yields(*args, **kwargs): 101 | """The function is a generator that yields output tuples. 102 | 103 | PyCascading considers this function a generator that yields one or more 104 | output tuples before returning. If this decorator is not used, the way the 105 | function emits tuples is determined automatically at runtime the first time 106 | the funtion is called. The alternative to yielding values is to return 107 | one tuple with return. 108 | 109 | We can safely yield Nones or not yield anything at all; no output tuples 110 | will be emitted in this case. 111 | """ 112 | return _function_decorator(args, kwargs, \ 113 | { 'output_method' : CascadingRecordProducerWrapper.OutputMethod.YIELDS }) 114 | 115 | 116 | def numargs_expected(num, *args, **kwargs): 117 | """The function expects a num number of fields in the input tuples. 118 | 119 | Arguments: 120 | num -- the exact number of fields that the input tuples must have 121 | """ 122 | return _function_decorator(args, kwargs, { 'numargs_expected' : num }) 123 | 124 | 125 | def python_list_expected(*args, **kwargs): 126 | """PyCascading will pass in the input tuples as Python lists. 127 | 128 | There is some performance penalty as all the incoming tuples need to be 129 | converted to Python lists. 130 | """ 131 | params = dict(kwargs) 132 | params.update() 133 | return _function_decorator(args, kwargs, { 'input_conversion' : \ 134 | CascadingBaseOperationWrapper.ConvertInputTuples.PYTHON_LIST }) 135 | 136 | 137 | def python_dict_expected(*args, **kwargs): 138 | """The input tuples are converted to Python dicts for this function. 139 | 140 | PyCascading will convert all input tuples to a Python dict for this 141 | function. The keys of the dict are the Cascading field names and the values 142 | are the values read from the tuple. 143 | 144 | There is some performance penalty as all the incoming tuples need to be 145 | converted to Python dicts. 146 | """ 147 | return _function_decorator(args, kwargs, { 'input_conversion' : \ 148 | CascadingBaseOperationWrapper.ConvertInputTuples.PYTHON_DICT }) 149 | 150 | 151 | def collects_output(*args, **kwargs): 152 | """The function expects an output collector where output tuples are added. 153 | 154 | PyCascading will pass in a Cascading TupleEntryCollector to which the 155 | function can add output tuples by calling its 'add' method. 
156 | 157 | Use this if performance is important, as no conversion takes place between 158 | Python objects and Cascading tuples. 159 | """ 160 | return _function_decorator(args, kwargs, { 'output_method' : \ 161 | CascadingRecordProducerWrapper.OutputMethod.COLLECTS }) 162 | 163 | 164 | def produces_python_list(*args, **kwargs): 165 | """The function emits Python lists as tuples. 166 | 167 | These will be converted by PyCascading to Cascading Tuples, so this impacts 168 | performance somewhat. 169 | """ 170 | return _function_decorator(args, kwargs, { 'output_type' : \ 171 | CascadingRecordProducerWrapper.OutputType.PYTHON_LIST }) 172 | 173 | 174 | def produces_tuples(*args, **kwargs): 175 | """The function emits native Cascading Tuples or TupleEntrys. 176 | 177 | No conversion takes place so this is a fast way to add tuples to the 178 | output. 179 | """ 180 | return _function_decorator(args, kwargs, { 'output_type' : \ 181 | CascadingRecordProducerWrapper.OutputType.TUPLE }) 182 | 183 | 184 | def udf_filter(*args, **kwargs): 185 | """This makes the function a filter. 186 | 187 | The function should return 'true' for each input tuple that should stay 188 | in the output stream, and 'false' if it is to be removed. 189 | 190 | IMPORTANT: this behavior is the opposite of what Cascading expects, but 191 | similar to how the Python filter works! 192 | 193 | Note that the same effect can be attained by a map that returns the tuple 194 | itself or None if it should be filtered out. 195 | """ 196 | return _function_decorator(args, kwargs, { 'type' : 'filter' }) 197 | 198 | 199 | def udf_map(*args, **kwargs): 200 | """The function decorated with this emits output tuples for each input one. 201 | 202 | The function is called for all the tuples in the input stream as happens 203 | in a Cascading Each. The function input tuple is passed in to the function 204 | as the first parameter and is a native Cascading TupleEntry unless the 205 | python_list_expected or python_dict_expected decorators are also used. 206 | 207 | If collects_output is used, the 2nd parameter is a Cascading 208 | TupleEntryCollector to which Tuples or TupleEntrys can be added. Otherwise, 209 | the function may return an output tuple or yield any number of tuples if 210 | it is a generator. 211 | 212 | Whether the function yields or returns will be determined automatically if 213 | no decorators used that specify this, and so will be the output tuple type 214 | (it can be Python list or a Cascading Tuple). 215 | 216 | Note that the meaning of 'map' used here is closer to the Python map() 217 | builtin than the 'map' in MapReduce. It essentially means that each input 218 | tuple needs to be transformed (mapped) by a custom function. 219 | 220 | Arguments: 221 | produces -- a list of output field names 222 | """ 223 | return _function_decorator(args, kwargs, { 'type' : 'map' }) 224 | 225 | 226 | def udf_buffer(*args, **kwargs): 227 | """The function decorated with this takes a group and emits aggregates. 228 | 229 | A udf_buffer function must follow a Cascading Every operation, which comes 230 | after a GroupBy. The function will be called for each grouping on a 231 | different reducer. The first parameter passed to the function is the 232 | value of the grouping field for this group, and the second is an iterator 233 | to the tuples belonging to this group. 
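
    A minimal sketch (assuming the stream was grouped on a single field;
    names are illustrative):

        @udf_buffer(produces=['group_value', 'group_size'])
        def group_size(group, tuples):
            size = 0
            for tuple in tuples:
                size += 1
            yield [group, size]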
234 | 235 | Note that the iterator always points to a static variable in Cascading 236 | that holds a copy of the current TupleEntry, thus we cannot cache this for 237 | subsequent operations in the function. Instead, take iterator.getTuple() or 238 | create a new TupleEntry by deep copying the item in the loop. 239 | 240 | Cascading also doesn't automatically add the group field to the output 241 | tuples, so we need to do it manually. In fact a Cascading Buffer is more 242 | powerful than an aggregator, although it can be used as one. It acts more 243 | like a function emitting arbitrary tuples for groups, rather than just a 244 | simple aggregator. 245 | 246 | By default the output tuples will be what the buffer returns or yields, 247 | and the grouping fields won't be included. This is different from the 248 | aggregators' behavior, which add the output fields to the grouping fields. 249 | 250 | Also, only one buffer may follow a GroupBy, in contrast to aggregators, of 251 | which many may be present. 252 | 253 | See http://groups.google.com/group/cascading-user/browse_thread/thread/f5e5f56f6500ed53/f55fdd6bba399dcf?lnk=gst&q=scope#f55fdd6bba399dcf 254 | """ 255 | return _function_decorator(args, kwargs, { 'type' : 'buffer' }) 256 | 257 | 258 | def unwrap(*args, **kwargs): 259 | """Unwraps the tuple into function parameters before calling the function. 260 | 261 | This is not implemented on the Java side yet. 262 | """ 263 | return _function_decorator(args, kwargs, { 'parameters' : 'unwrap' }) 264 | 265 | def tuplein(*args, **kwargs): 266 | return _function_decorator(args, kwargs, { 'parameters' : 'tuple' }) 267 | -------------------------------------------------------------------------------- /python/pycascading/each.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Operations related to an Each pipe. 17 | 18 | * Add fields to the stream: map_add 19 | * Map fields to new fields: map_replace 20 | * Map the whole tuple to the new tuple: map_to 21 | * Filter tuples: filter_by 22 | """ 23 | 24 | __author__ = 'Gabor Szabo' 25 | 26 | 27 | import inspect 28 | 29 | import cascading.pipe 30 | from cascading.tuple import Fields 31 | 32 | from com.twitter.pycascading import CascadingFunctionWrapper, \ 33 | CascadingFilterWrapper 34 | 35 | from pycascading.pipe import Operation, coerce_to_fields, wrap_function, \ 36 | random_pipe_name, DecoratedFunction 37 | from pycascading.decorators import udf 38 | 39 | 40 | class _Each(Operation): 41 | 42 | """The equivalent of Each in Cascading. 43 | 44 | We need to wrap @maps and @filters with different Java classes, but 45 | the constructors for Each are built similarly. This class provides this 46 | functionality. 47 | """ 48 | 49 | def __init__(self, function_type, *args): 50 | """Build the Each constructor for the Python function. 
51 | 52 | Arguments: 53 | function_type -- CascadingFunctionWrapper or CascadingFilterWrapper, 54 | whether we are calling Each with a function or filter 55 | *args -- the arguments passed on to Cascading Each 56 | """ 57 | Operation.__init__(self) 58 | 59 | self.__function = None 60 | # The default argument selector is Fields.ALL (per Cascading sources 61 | # for Operator.java) 62 | self.__argument_selector = None 63 | # The default output selector is Fields.RESULTS (per Cascading sources 64 | # for Operator.java) 65 | self.__output_selector = None 66 | 67 | if len(args) == 1: 68 | self.__function = args[0] 69 | elif len(args) == 2: 70 | (self.__argument_selector, self.__function) = args 71 | elif len(args) == 3: 72 | (self.__argument_selector, self.__function, 73 | self.__output_selector) = args 74 | else: 75 | raise Exception('The number of parameters to Apply/Filter ' \ 76 | 'should be between 1 and 3') 77 | # This is the Cascading Function type 78 | self.__function = wrap_function(self.__function, function_type) 79 | 80 | def _create_with_parent(self, parent): 81 | args = [] 82 | if self.__argument_selector: 83 | args.append(coerce_to_fields(self.__argument_selector)) 84 | args.append(self.__function) 85 | if self.__output_selector: 86 | args.append(coerce_to_fields(self.__output_selector)) 87 | # We need to put another Pipe after the Each since otherwise 88 | # joins may not work as the names of pipes apparently have to be 89 | # different for Cascading. 90 | each = cascading.pipe.Each(parent.get_assembly(), *args) 91 | return cascading.pipe.Pipe(random_pipe_name('each'), each) 92 | 93 | 94 | class Apply(_Each): 95 | """Apply the given user-defined function to each tuple in the stream. 96 | 97 | The corresponding class in Cascading is Each called with a Function. 98 | """ 99 | def __init__(self, *args): 100 | _Each.__init__(self, CascadingFunctionWrapper, *args) 101 | 102 | 103 | class Filter(_Each): 104 | """Filter the tuple stream through the user-defined function. 105 | 106 | The corresponding class in Cascading is Each called with a Filter. 
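
    A usage sketch (the 'count' field is illustrative; scripts normally use
    the filter_by() helper below rather than constructing Filter directly):

        @udf_filter
        def frequent(tuple):
            return tuple.getInteger('count') >= 10

        stream | Filter(frequent)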
107 | """ 108 | def __init__(self, *args): 109 | _Each.__init__(self, CascadingFilterWrapper, *args) 110 | 111 | 112 | def _any_instance(var, classes): 113 | """Check if var is an instance of any class in classes.""" 114 | for cl in classes: 115 | if isinstance(var, cl): 116 | return True 117 | return False 118 | 119 | 120 | def _map(output_selector, *args): 121 | """Maps the given input fields to output fields.""" 122 | if len(args) == 1: 123 | (input_selector, function, output_field) = \ 124 | (Fields.ALL, args[0], Fields.UNKNOWN) 125 | elif len(args) == 2: 126 | if inspect.isfunction(args[0]) or _any_instance(args[0], \ 127 | (DecoratedFunction, cascading.operation.Function, cascading.operation.Filter)): 128 | # The first argument is a function, the second is the output fields 129 | (input_selector, function, output_field) = \ 130 | (Fields.ALL, args[0], args[1]) 131 | else: 132 | # The first argument is the input tuple argument selector, 133 | # the second one is the function 134 | (input_selector, function, output_field) = \ 135 | (args[0], args[1], Fields.UNKNOWN) 136 | elif len(args) == 3: 137 | (input_selector, function, output_field) = args 138 | else: 139 | raise Exception('map_{add,replace} needs to be called with 1 to 3 parameters') 140 | if isinstance(function, DecoratedFunction): 141 | # By default we take everything from the UDF's decorators 142 | df = function 143 | if output_field != Fields.UNKNOWN: 144 | # But if we specified the output fields for the map, use that 145 | df = DecoratedFunction.decorate_function(function.decorators['function']) 146 | df.decorators = dict(function.decorators) 147 | df.decorators['produces'] = output_field 148 | elif inspect.isfunction(function): 149 | df = udf(produces=output_field)(function) 150 | else: 151 | df = function 152 | return Apply(input_selector, df, output_selector) 153 | 154 | 155 | def map_add(*args): 156 | """Map the defined fields (or all fields), and add the results to the tuple. 157 | 158 | Note that the new field names we are adding to the tuple cannot overlap 159 | with existing field names, or Cascading will complain. 160 | """ 161 | return _map(Fields.ALL, *args) 162 | 163 | 164 | def map_replace(*args): 165 | """Map the tuple, remove the mapped fields, and add the new fields. 166 | 167 | This mapping replaces the fields mapped with the new fields that the 168 | mapping operation adds. 169 | 170 | The number of arguments to this function is between 1 and 3: 171 | * One argument: it's the map function. The output fields will be named 172 | after the 'produces' parameter if the map function is decorated, or 173 | will be Fields.UNKNOWN if it's not defined. Note that after UNKNOW field 174 | names are introduced to the tuple, all the other field names are also 175 | lost. 176 | * Two arguments: it's either the input field selector and the map function, 177 | or the map function and the output fields' names. 178 | * Three arguments: they are interpreted as the input field selector, the 179 | map function, and finally the output fields' names. 
180 | """ 181 | return _map(Fields.SWAP, *args) 182 | 183 | 184 | def map_to(*args): 185 | """Map the tuple, and keep only the results returned by the function.""" 186 | return _map(Fields.RESULTS, *args) 187 | 188 | 189 | def filter_by(function): 190 | if isinstance(function, DecoratedFunction): 191 | # We make sure we will treat the function as a filter 192 | # Here we make a copy of the decorators so that we don't overwrite 193 | # the original parameters 194 | if function.decorators['type'] not in ('filter', 'auto'): 195 | raise Exception('Function is not a filter') 196 | df = DecoratedFunction.decorate_function(function.decorators['function']) 197 | df.decorators = dict(function.decorators) 198 | df.decorators['type'] = 'filter' 199 | else: 200 | df = udf(type='filter')(function) 201 | return Filter(df) 202 | -------------------------------------------------------------------------------- /python/pycascading/every.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Operations related to an Every pipe.""" 17 | 18 | __author__ = 'Gabor Szabo' 19 | 20 | 21 | import inspect 22 | 23 | import cascading.pipe 24 | import cascading.operation 25 | from cascading.tuple import Fields 26 | 27 | from com.twitter.pycascading import CascadingAggregatorWrapper, \ 28 | CascadingBufferWrapper 29 | 30 | from pycascading.pipe import Operation, coerce_to_fields, wrap_function, \ 31 | random_pipe_name, DecoratedFunction, _Stackable 32 | 33 | 34 | class Every(Operation): 35 | 36 | """Apply an operation to a group of tuples. 37 | 38 | This operation is similar to Apply, but can only follow a GroupBy or 39 | CoGroup. It runs a Cascading Aggregator or Buffer on every grouping. 40 | Native Java aggregators or buffers may be used, and also PyCascading 41 | @reduces. 42 | 43 | By default the tuples contain only the values in a group, but not the 44 | grouping field. This can be had from the group first parameter. 45 | """ 46 | 47 | def __init__(self, *args, **kwargs): 48 | """Create a Cascading Every pipe. 
49 | 50 | Keyword arguments: 51 | aggregator -- a Cascading aggregator (only either aggregator or buffer 52 | should be used) 53 | buffer -- a Cascading Buffer or a PyCascading @reduce function 54 | output_selector -- the outputSelector parameter for Cascading 55 | argument_selector -- the argumentSelector parameter for Cascading 56 | assertion_level -- the assertionLevel parameter for Cascading 57 | assertion -- the assertion parameter for Cascading 58 | """ 59 | Operation.__init__(self) 60 | self.__args = args 61 | self.__kwargs = kwargs 62 | 63 | def __create_args(self, 64 | pipe=None, 65 | aggregator=None, output_selector=None, 66 | assertion_level=None, assertion=None, 67 | buffer=None, 68 | argument_selector=None): 69 | if self.__args: 70 | # If we pass in an unnamed argument, try to determine its type 71 | if isinstance(self.__args[0], cascading.operation.Aggregator): 72 | aggregator = self.__args[0] 73 | else: 74 | buffer = self.__args[0] 75 | # Set up some defaults 76 | if argument_selector is None: 77 | argument_selector = cascading.tuple.Fields.ALL 78 | if output_selector is None: 79 | if aggregator is not None: 80 | # In the case of aggregators, we want to return both the 81 | # groupings and the results 82 | output_selector = cascading.tuple.Fields.ALL 83 | else: 84 | output_selector = cascading.tuple.Fields.RESULTS 85 | 86 | args = [] 87 | args.append(pipe.get_assembly()) 88 | if argument_selector is not None: 89 | args.append(coerce_to_fields(argument_selector)) 90 | if aggregator is not None: 91 | # for now we assume it's a Cascading aggregator straight 92 | args.append(wrap_function(aggregator, CascadingAggregatorWrapper)) 93 | if output_selector: 94 | args.append(coerce_to_fields(output_selector)) 95 | if assertion_level is not None: 96 | args.append(assertion_level) 97 | args.append(assertion) 98 | if buffer is not None: 99 | args.append(wrap_function(buffer, CascadingBufferWrapper)) 100 | if output_selector: 101 | args.append(coerce_to_fields(output_selector)) 102 | return args 103 | 104 | def _create_with_parent(self, parent): 105 | args = self.__create_args(pipe=parent, **self.__kwargs) 106 | return cascading.pipe.Every(*args) 107 | 108 | 109 | class GroupBy(Operation): 110 | 111 | """GroupBy first merges the given pipes, then groups by the fields given. 112 | 113 | This class does the same as the corresponding Cascading GroupBy. 114 | """ 115 | 116 | def __init__(self, *args, **kwargs): 117 | """Create a Cascading GroupBy pipe. 
118 | 119 | Arguments: 120 | args[0] -- the fields on which to group 121 | 122 | Keyword arguments: 123 | group_name -- the groupName parameter for Cascading 124 | group_fields -- the fields on which to group 125 | sort_fields -- the sortFields parameter for Cascading 126 | reverse_order -- the reverseOrder parameter for Cascading 127 | lhs_pipe -- the lhsPipe parameter for Cascading 128 | rhs_pipe -- the rhsPipe parameter for Cascading 129 | """ 130 | Operation.__init__(self) 131 | self.__args = args 132 | self.__kwargs = kwargs 133 | 134 | def __create_args(self, 135 | group_name=None, 136 | pipes=None, group_fields=None, sort_fields=None, 137 | reverse_order=None, 138 | pipe=None, 139 | lhs_pipe=None, rhs_pipe=None): 140 | # We can use an unnamed parameter only for group_fields 141 | if self.__args: 142 | group_fields = coerce_to_fields(self.__args[0]) 143 | args = [] 144 | if group_name: 145 | args.append(group_name) 146 | if pipes: 147 | args.append([p.get_assembly() for p in pipes]) 148 | if group_fields: 149 | args.append(coerce_to_fields(group_fields)) 150 | if sort_fields: 151 | args.append(coerce_to_fields(sort_fields)) 152 | if reverse_order: 153 | args.append(reverse_order) 154 | elif pipe: 155 | args.append(pipe.get_assembly()) 156 | if group_fields: 157 | args.append(coerce_to_fields(group_fields)) 158 | if sort_fields: 159 | args.append(coerce_to_fields(sort_fields)) 160 | if reverse_order: 161 | args.append(reverse_order) 162 | elif lhs_pipe: 163 | args.append(lhs_pipe.get_assembly()) 164 | args.append(rhs_pipe.get_assembly()) 165 | args.append(coerce_to_fields(group_fields)) 166 | return args 167 | 168 | def _create_with_parent(self, parent): 169 | if isinstance(parent, _Stackable): 170 | # We're chaining with a _Stackable object 171 | args = self.__create_args(pipes=parent.stack, **self.__kwargs) 172 | else: 173 | # We're chaining with a Chainable object 174 | args = self.__create_args(pipe=parent, **self.__kwargs) 175 | return cascading.pipe.GroupBy(*args) 176 | 177 | 178 | class _DelayedInitialization(Operation): 179 | def __init__(self, callback): 180 | Operation.__init__(self) 181 | self.__callback = callback 182 | 183 | def _create_with_parent(self, parent): 184 | return self.__callback(parent).get_assembly() 185 | 186 | 187 | def group_by(*args, **kwargs): 188 | if len(args) == 0: 189 | grouping_fields = None 190 | parameters = () 191 | elif len(args) == 1: 192 | grouping_fields = args[0] 193 | parameters = () 194 | elif len(args) == 2: 195 | grouping_fields = args[0] 196 | parameters = (Fields.ALL, args[1], Fields.UNKNOWN) 197 | elif len(args) == 3: 198 | grouping_fields = args[0] 199 | if inspect.isfunction(args[1]) or isinstance(args[1], \ 200 | (DecoratedFunction, cascading.operation.Aggregator, cascading.operation.Buffer)): 201 | # The first argument is an aggregator/buffer, 202 | # the second is the output fields 203 | parameters = (Fields.ALL, args[1], args[2]) 204 | else: 205 | parameters = (args[1], args[2], Fields.UNKNOWN) 206 | elif len(args) == 4: 207 | grouping_fields = args[0] 208 | parameters = (args[1], args[2], args[3]) 209 | else: 210 | raise Exception('group_by needs to be called with 1 to 4 parameters') 211 | 212 | if parameters: 213 | (input_selector, function, output_field) = parameters 214 | if isinstance(function, DecoratedFunction): 215 | # By default we take everything from the UDF's decorators 216 | df = function 217 | if output_field != Fields.UNKNOWN: 218 | # But if we specified the output fields for the map, use that 219 | df = 
DecoratedFunction.decorate_function(function.decorators['function']) 220 | df.decorators = dict(function.decorators) 221 | df.decorators['produces'] = output_field 222 | elif inspect.isfunction(function): 223 | df = udf(produces=output_field)(function) 224 | else: 225 | df = function 226 | def pipe(parent): 227 | if grouping_fields: 228 | return parent | GroupBy(grouping_fields, **kwargs) | \ 229 | Every(df, argument_selector=input_selector) 230 | else: 231 | return parent | GroupBy(**kwargs) | \ 232 | Every(df, argument_selector=input_selector) 233 | return _DelayedInitialization(pipe) 234 | else: 235 | def pipe(parent): 236 | if grouping_fields: 237 | return parent | GroupBy(grouping_fields, **kwargs) 238 | else: 239 | return parent | GroupBy(**kwargs) 240 | return _DelayedInitialization(pipe) 241 | -------------------------------------------------------------------------------- /python/pycascading/helpers.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Helper functions for a PyCascading script. 17 | 18 | This module imports the PyCascading modules so that we don't have to import 19 | them manually all the time. It also imports the Java classes wrapping the 20 | primitive types (Integer, Long, Float, Double, String), so that casts are made 21 | easy. Furthermore frequently used Cascading classes are also imported, such as 22 | Fields, Tuple, and TupleEntry, and the pre-defined aggregators, filters, 23 | assemblies, and schemes. 24 | """ 25 | 26 | __author__ = 'Gabor Szabo' 27 | 28 | 29 | import time, struct, subprocess 30 | 31 | # Import frequently used Cascading classes 32 | # We import these first so that we can override some global names (like Rename) 33 | from cascading.tuple import Fields, Tuple, TupleEntry 34 | from cascading.operation.aggregator import * 35 | from cascading.operation.filter import * 36 | from cascading.pipe.assembly import * 37 | from cascading.scheme import * 38 | from cascading.tap import * 39 | 40 | # Import all important PyCascading modules so we don't have to in the scripts 41 | from pycascading.decorators import * 42 | from pycascading.tap import * 43 | from pycascading.operators import * 44 | from pycascading.each import * 45 | from pycascading.every import * 46 | from pycascading.cogroup import * 47 | # We don't import * as the name of some functions (sum) collides with Python 48 | import pycascading.native as native 49 | 50 | # Import Java basic types for conversions 51 | from java.lang import Integer, Long, Float, Double, String 52 | 53 | import com.twitter.pycascading.SelectFields 54 | from pycascading.pipe import coerce_to_fields 55 | 56 | 57 | class Getter(): 58 | 59 | """A wrapper for an object with 'get' and 'set' methods. 60 | 61 | If the object has a .get(key) method and a .set(key, value) method, 62 | these can be replaced by referencing the key with []s. 
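
    For example (a sketch; 'job_conf' is an assumed object with get/set
    methods):

        props = Getter(job_conf)
        reducers = props['mapred.reduce.tasks']  # calls job_conf.get(...)
        props['mapred.reduce.tasks'] = '4'       # calls job_conf.set(..., '4')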
63 | """ 64 | 65 | def __init__(self, object): 66 | self.object = object 67 | 68 | def __getitem__(self, key): 69 | return self.object.get(key) 70 | 71 | def __setitem__(self, key, value): 72 | return self.object.set(key, value) 73 | 74 | 75 | def time2epoch(t): 76 | """Converts times in UTC to seconds since the UNIX epoch, 1/1/1970 00:00. 77 | 78 | Arguments: 79 | t -- the time string in 'YYYY-MM-DD hh:mm:ss' format 80 | 81 | Exceptions: 82 | Throws an exception if t is not in the right format. 83 | """ 84 | t = time.strptime(t + ' UTC', '%Y-%m-%d %H:%M:%S.0 %Z') 85 | return int(time.mktime(t)) - time.timezone 86 | 87 | 88 | def bigendian2long(b): 89 | """Converts a series of 4 bytes in big-endian format to a Java Long. 90 | 91 | Arguments: 92 | b -- a string of 4 bytes that represent a word 93 | """ 94 | return Long(struct.unpack('>I', b)[0]) 95 | 96 | 97 | def bigendian2int(b): 98 | """Converts a series of 4 bytes in big-endian format to a Python int. 99 | 100 | Arguments: 101 | b -- a string of 4 bytes that represent a word 102 | """ 103 | return struct.unpack('>i', b)[0] 104 | 105 | 106 | def SelectFields(fields): 107 | """Keeps only some fields in the tuple stream. 108 | 109 | Arguments: 110 | fields -- a list of fields to keep, or a Cascading Fields wildcard 111 | """ 112 | return com.twitter.pycascading.SelectFields(coerce_to_fields(fields)) 113 | 114 | 115 | def read_hdfs_tsv_file(path): 116 | """Read a tab-separated HDFS folder and yield the records. 117 | 118 | The first line of the file should contain the name of the fields. Each 119 | record contains columns separated by tabs. 120 | 121 | Arguments: 122 | path -- path to a tab-separated folder containing the data files 123 | """ 124 | pipe = subprocess.Popen('hdfs -cat "%s/.pycascading_header" "%s/part-*"' \ 125 | % (path, path), shell=True, stdout=subprocess.PIPE).stdout 126 | first_line = True 127 | for line in pipe: 128 | line = line[0 : (len(line) - 1)] 129 | fields = line.split('\t') 130 | if first_line: 131 | field_names = fields 132 | first_line = False 133 | else: 134 | yield dict(zip(field_names, fields)) 135 | -------------------------------------------------------------------------------- /python/pycascading/init_module.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Used internally. PyCascading module to set up the paths for the sources. 17 | 18 | The module that gets loaded first when a Cascading pipeline is deserialized. 19 | PyCascading needs to start a Jython interpreter whenever a mapper or reducer 20 | executes Python code, so we need to start an interpreter, set up the 21 | environment, and load the job's source code. 22 | """ 23 | 24 | __author__ = 'Gabor Szabo' 25 | 26 | 27 | import sys 28 | 29 | 30 | def setup_paths(module_paths): 31 | """Set up sys.path on the mappers and reducers. 
32 | 33 | module_paths is an array of path names where the sources or other 34 | supporting files are found. In particular, module_paths[0] is the location 35 | of the PyCascading Python sources, and modules_paths[1] is the location of 36 | the source file defining the function. 37 | 38 | In Hadoop mode (with remote_deploy.sh), the first two -a options must 39 | specify the archives of the PyCascading sources and the job sources, 40 | respectively. 41 | 42 | Arguments: 43 | module_paths -- the locations of the Python sources 44 | """ 45 | from com.twitter.pycascading import Util 46 | 47 | cascading_jar = Util.getCascadingJar() 48 | jython_dir = module_paths[0] 49 | 50 | sys.path.extend((cascading_jar, jython_dir + '/python', 51 | jython_dir + '/python/Lib')) 52 | sys.path.extend(module_paths[1 : ]) 53 | 54 | # Allow importing of user-installed Jython packages 55 | # Thanks to Simon Radford 56 | import site 57 | site.addsitedir(jython_dir + 'python/Lib/site-packages') 58 | 59 | # Haha... it's necessary to put this here, otherwise simplejson won't work. 60 | # Maybe it's automatically imported in the beginning of a Jython program, 61 | # but since at that point the sys.path is not set yet to Lib, it will fail? 62 | #import encodings 63 | -------------------------------------------------------------------------------- /python/pycascading/native.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Aggregators, filters, functions, and assemblies adapted to PyCascading. 17 | 18 | These useful operations are provided by Cascading. 
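
For example (a sketch; the 'word' field name is illustrative), counting the
size of each group with Cascading's Count aggregator (scripts see this module
as 'native', since pycascading.helpers imports it under that name):

    stream | group_by('word', native.count()) | output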
19 | """ 20 | 21 | __author__ = 'Gabor Szabo' 22 | 23 | 24 | import cascading.operation.aggregator as aggregator 25 | import cascading.operation.filter as filter 26 | import cascading.operation.function as function 27 | import cascading.pipe.assembly as assembly 28 | 29 | from pycascading.pipe import coerce_to_fields, SubAssembly 30 | 31 | 32 | def average(*args): 33 | args = list(args) 34 | if args: 35 | args[0] = coerce_to_fields(args[0]) 36 | return aggregator.Average(*args) 37 | 38 | 39 | def count(*args): 40 | args = list(args) 41 | if args: 42 | args[0] = coerce_to_fields(args[0]) 43 | return aggregator.Count(*args) 44 | 45 | 46 | def first(*args): 47 | args = list(args) 48 | if args: 49 | args[0] = coerce_to_fields(args[0]) 50 | return aggregator.First(*args) 51 | 52 | 53 | def last(*args): 54 | args = list(args) 55 | if args: 56 | args[0] = coerce_to_fields(args[0]) 57 | return aggregator.Last(*args) 58 | 59 | 60 | def max(*args): 61 | args = list(args) 62 | if args: 63 | args[0] = coerce_to_fields(args[0]) 64 | return aggregator.Max(*args) 65 | 66 | 67 | def min(*args): 68 | args = list(args) 69 | if args: 70 | args[0] = coerce_to_fields(args[0]) 71 | return aggregator.Min(*args) 72 | 73 | 74 | def sum(*args): 75 | args = list(args) 76 | if args: 77 | args[0] = coerce_to_fields(args[0]) 78 | return aggregator.Sum(*args) 79 | 80 | 81 | def limit(lim): 82 | return filter.Limit(lim) 83 | 84 | 85 | def sample(*args): 86 | return filter.Sample(lim) 87 | 88 | 89 | def un_group(*args): 90 | args = list(args) 91 | if args: 92 | args[0] = coerce_to_fields(args[0]) 93 | if len(args) > 1: 94 | if isinstance(args[1], (list, tuple)): 95 | new_arg = [] 96 | for f in args[1]: 97 | new_arg.append(coerce_to_fields(f)) 98 | args[1] = new_arg 99 | else: 100 | args[1] = coerce_to_fields(args[1]) 101 | if len(args) > 2: 102 | if isinstance(args[2], (list, tuple)): 103 | new_arg = [] 104 | for f in args[2]: 105 | new_arg.append(coerce_to_fields(f)) 106 | args[2] = new_arg 107 | return function.UnGroup(*args) 108 | 109 | 110 | def average_by(*args): 111 | args = list(args) 112 | if len(args) > 0: 113 | args[0] = coerce_to_fields(args[0]) 114 | if len(args) > 1: 115 | args[1] = coerce_to_fields(args[1]) 116 | if len(args) > 2: 117 | args[2] = coerce_to_fields(args[2]) 118 | return SubAssembly(assembly.AverageBy, *args) 119 | 120 | 121 | def count_by(*args): 122 | args = list(args) 123 | if len(args) > 0: 124 | args[0] = coerce_to_fields(args[0]) 125 | if len(args) > 1: 126 | args[1] = coerce_to_fields(args[1]) 127 | return SubAssembly(assembly.CountBy, *args) 128 | 129 | 130 | def sum_by(*args): 131 | # SumBy has at least 3 parameters 132 | args = list(args) 133 | for i in xrange(0, 3): 134 | args[i] = coerce_to_fields(args[i]) 135 | return SubAssembly(assembly.SumBy, *args) 136 | 137 | 138 | def unique(*args): 139 | args = list(args) 140 | args[0] = coerce_to_fields(args[0]) 141 | return SubAssembly(assembly.Unique, *args) 142 | -------------------------------------------------------------------------------- /python/pycascading/operators.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Various operations acting on the tuples. 17 | 18 | * Select fields from the stream: retain 19 | * Remove fields from the stream: discard (not implemented in Cascading 1.2.*) 20 | * Rename fields: rename 21 | """ 22 | 23 | __author__ = 'Gabor Szabo' 24 | 25 | 26 | import itertools 27 | 28 | from cascading.tuple import Fields 29 | from cascading.operation import Identity 30 | import cascading.pipe.assembly.Rename 31 | 32 | from pycascading.pipe import SubAssembly, coerce_to_fields 33 | from pycascading.each import Apply 34 | 35 | 36 | def retain(*fields_to_keep): 37 | """Retain only the given fields. 38 | 39 | The fields can be given in array or by separate parameters. 40 | """ 41 | if len(fields_to_keep) > 1: 42 | fields_to_keep = list(itertools.chain(fields_to_keep)) 43 | else: 44 | fields_to_keep = fields_to_keep[0] 45 | return Apply(fields_to_keep, Identity(Fields.ARGS), Fields.RESULTS) 46 | 47 | 48 | def _discard(fields_to_discard): 49 | # In 2.0 there's a builtin function this, Discard 50 | # In 1.2 there is nothing for this 51 | raise Exception('Discard only works with Cascading 2.0') 52 | 53 | 54 | def rename(*args): 55 | """Rename the fields to new names. 56 | 57 | If only one argument (a list of names) is given, it is assumed that the 58 | user wants to rename all the fields. If there are two arguments, the first 59 | list is the set of fields to be renamed, and the second is a list of the 60 | new names. 61 | """ 62 | if len(args) == 1: 63 | (fields_from, fields_to) = (Fields.ALL, args[0]) 64 | else: 65 | (fields_from, fields_to) = (args[0], args[1]) 66 | return SubAssembly(cascading.pipe.assembly.Rename, \ 67 | coerce_to_fields(fields_from), \ 68 | coerce_to_fields(fields_to)) 69 | -------------------------------------------------------------------------------- /python/pycascading/pipe.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Build and execute Cascading flows in Python. 17 | 18 | Flows are built from Cascading operations that reshape, join, and split 19 | streams. Some operations make use of user-defined functions, for instance, the 20 | Each operation applies an UDF to each tuple seen in the stream. 
21 | 22 | Exports the following: 23 | Pipe 24 | CoGroup 25 | Join 26 | OuterJoin 27 | LeftOuterJoin 28 | RightOuterJoin 29 | SubAssembly 30 | coerce_to_fields 31 | random_pipe_name 32 | """ 33 | 34 | __author__ = 'Gabor Szabo' 35 | 36 | 37 | import types, inspect, pickle 38 | 39 | import cascading.pipe 40 | import cascading.tuple 41 | import cascading.operation 42 | import cascading.pipe.cogroup 43 | from com.twitter.pycascading import CascadingBaseOperationWrapper, \ 44 | CascadingRecordProducerWrapper 45 | 46 | import serializers 47 | 48 | from java.io import ObjectOutputStream 49 | 50 | 51 | import java.lang.Integer 52 | 53 | 54 | def coerce_to_fields(obj): 55 | """ 56 | Utility function to convert a list or field name to cascading.tuple.Fields. 57 | 58 | Arguments: 59 | obj -- a cascading.tuple.Fields, an integer, or a string, or a list of 60 | integers and/or strings identifying fields 61 | 62 | Return: 63 | obj coerced to a cascading.tuple.Fields object 64 | """ 65 | if isinstance(obj, list): 66 | # For some reason integers will not be cast to Comparables by Jython, 67 | # so we need to do it manually before calling the Fields constructor 68 | for i in xrange(len(obj)): 69 | if isinstance(obj[i], int): 70 | obj[i] = java.lang.Integer(obj[i]) 71 | return cascading.tuple.Fields(obj) 72 | elif isinstance(obj, str) or isinstance(obj, int): 73 | if isinstance(obj, int): 74 | obj = java.lang.Integer(obj) 75 | return cascading.tuple.Fields([obj]) 76 | else: 77 | # obj is assumed to be Fields already 78 | return obj 79 | 80 | 81 | def random_pipe_name(prefix): 82 | """Generate a random string that can be used to name pipes. 83 | 84 | Otherwise Cascading always gets confused. 85 | """ 86 | import random, re, traceback 87 | stack = traceback.extract_stack() 88 | stack.reverse() 89 | file = None 90 | for s in stack: 91 | if not re.match(r'.*/pycascading/[^/]+\.py$', s[0]) and \ 92 | not re.match(r'.*/bootstrap.py$', s[0]): 93 | file = s[0] 94 | line = s[1] 95 | i = file.rfind('/') 96 | if i >= 0: 97 | file = file[i + 1 :] 98 | break 99 | name = prefix 100 | if file: 101 | name = name + '/' + str(line) + ':' + file 102 | name += ' ' 103 | id = '' 104 | for i in xrange(0, 4): 105 | name += chr(random.randint(ord('a'), ord('z'))) 106 | return name 107 | 108 | 109 | def wrap_function(function, casc_function_type): 110 | """Wrap a Python function into a Serializable and callable Java object. 111 | This wrapping is necessary as Cascading serializes the job pipeline before 112 | it sends the job to the workers. We need to in essence reconstruct the 113 | Python function from source on the receiving end when we deserialize the 114 | function, as Python is an interpreted language. 
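# coerce_to_fields in a nutshell -- these calls run under Jython with the
# Cascading jars on the classpath; the field names are illustrative only.

from pycascading.pipe import coerce_to_fields

coerce_to_fields('user')        # a Fields instance selecting the 'user' field
coerce_to_fields(['user', 2])   # mixed names and positions; 2 becomes java.lang.Integer(2)
coerce_to_fields(0)             # positional selector for the first field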
115 | 116 | Arguments: 117 | function -- either a Cascading Operation, a PyCascading-decorated Python 118 | function, or a native Python function 119 | casc_function_type -- the Cascading Operation that this Python function 120 | will be called by in its operate method 121 | """ 122 | if isinstance(function, cascading.operation.Operation): 123 | return function 124 | if isinstance(function, DecoratedFunction): 125 | # Build the arguments for the constructor 126 | args = [] 127 | decorators = function.decorators 128 | if 'numargs_expected' in decorators: 129 | args.append(decorators['numargs_expected']) 130 | if 'produces' in decorators and decorators['produces']: 131 | args.append(coerce_to_fields(decorators['produces'])) 132 | # Create the appropriate type (function or filter) 133 | fw = casc_function_type(*args) 134 | function = decorators['function'] 135 | fw.setConvertInputTuples(decorators['input_conversion']) 136 | if decorators['type'] in set(['map', 'buffer', 'auto']): 137 | fw.setOutputMethod(decorators['output_method']) 138 | fw.setOutputType(decorators['output_type']) 139 | fw.setContextArgs(decorators['args']) 140 | fw.setContextKwArgs(decorators['kwargs']) 141 | else: 142 | # When function is a pure Python function, declared without decorators 143 | fw = casc_function_type() 144 | fw.setFunction(function) 145 | fw.setWriteObjectCallBack(serializers.replace_object) 146 | return fw 147 | 148 | 149 | class _Stackable(object): 150 | 151 | """An object that can be chained with '&' operations.""" 152 | 153 | def __init__(self): 154 | self.stack = [self] 155 | 156 | def __and__(self, other): 157 | result = _Stackable() 158 | result.stack = self.stack + other.stack 159 | return result 160 | 161 | def __or__(self, other): 162 | result = Chainable() 163 | result._assembly = other._create_with_parent(self) 164 | for s in self.stack: 165 | result.add_context(s.context) 166 | return result 167 | 168 | 169 | class Chainable(_Stackable): 170 | 171 | """An object that can be chained with '|' operations.""" 172 | 173 | def __init__(self): 174 | _Stackable.__init__(self) 175 | self._assembly = None 176 | self.context = set() 177 | self.hash = 0 178 | 179 | def add_context(self, ctx): 180 | # TODO: see if context is indeed needed 181 | """ 182 | This is used to keep track of the sources connected to this pipeline 183 | so that a possible cache can remove them for Cascading. 184 | """ 185 | # Cannot use extend because of the strings 186 | self.context.update(ctx) 187 | 188 | def get_assembly(self): 189 | """Return the Cascading Pipe instance that this object represents.""" 190 | if self._assembly == None: 191 | self._assembly = self._create_without_parent() 192 | return self._assembly 193 | 194 | def __or__(self, other): 195 | result = Chainable() 196 | if isinstance(other, cascading.operation.Aggregator): 197 | import every 198 | other = every.Every(aggregator=other) 199 | elif isinstance(other, cascading.operation.Function): 200 | import each 201 | other = each.Apply(other) 202 | elif isinstance(other, cascading.operation.Filter): 203 | import each 204 | other = each.Apply(other) 205 | elif inspect.isroutine(other): 206 | other = DecoratedFunction.decorate_function(other) 207 | if isinstance(other, Chainable): 208 | result._assembly = other._create_with_parent(self) 209 | result.add_context(self.context) 210 | result.hash = self.hash ^ hash(result._assembly) 211 | return result 212 | 213 | def _create_without_parent(self): 214 | """Called when the Chainable is the first member of a chain. 
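# The two operators defined above are the whole chaining API: '|' appends an
# operation to a pipe, and '&' bundles pipes so that a multi-input operation
# (CoGroup, Join, etc. from the export list) can consume them together.
# Minimal sketch:

from pycascading.pipe import Pipe

lhs = Pipe('lhs')
rhs = Pipe('rhs')
renamed = lhs | Pipe('renamed')   # a new Cascading Pipe whose parent is lhs
both = lhs & rhs                  # a _Stackable holding [lhs, rhs], ready to be
                                  # piped into a multi-input operation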
215 | 216 | We want to initialize the chain with this operation as the first 217 | member. 218 | """ 219 | raise Exception('Cannot create without parent') 220 | 221 | def _create_with_parent(self, parent): 222 | """Called when the Chainable is NOT the first member of a chain. 223 | 224 | Takes a PyCascading Pipe object, or a list thereof, and returns a 225 | corresponding Cascading Pipe instance. 226 | 227 | Arguments: 228 | parent -- the PyCascading pipe that we need to append this operation to 229 | """ 230 | raise Exception('Cannot create with parent') 231 | 232 | 233 | class Pipe(Chainable): 234 | 235 | """The basic PyCascading Pipe object. 236 | 237 | This represents an operation on the tuple stream. A Pipe object can have an 238 | upstream parent (unless it is a source), and a downstream child (unless it 239 | is a sink). 240 | """ 241 | 242 | def __init__(self, name=None, *args): 243 | Chainable.__init__(self) 244 | if name: 245 | self.__name = name 246 | else: 247 | self.__name = 'unnamed' 248 | 249 | def _create_without_parent(self): 250 | """ 251 | Create the Cascading operation when this is the first element of a 252 | chain. 253 | """ 254 | return cascading.pipe.Pipe(self.__name) 255 | 256 | def _create_with_parent(self, parent): 257 | """ 258 | Create the Cascading operation when this is not the first element 259 | of a chain. 260 | """ 261 | return cascading.pipe.Pipe(self.__name, parent.get_assembly()) 262 | 263 | 264 | class Operation(Chainable): 265 | 266 | """A common base class for all operations (Functions, Filters, etc.). 267 | 268 | It doesn't do anything, just provides the class. 269 | """ 270 | 271 | def __init__(self): 272 | Chainable.__init__(self) 273 | 274 | 275 | class DecoratedFunction(Operation): 276 | 277 | """Decorates Python functions with arbitrary attributes. 278 | 279 | Additional attributes and the original functions are stored in a dict 280 | self.decorators. 281 | """ 282 | 283 | def __init__(self): 284 | Operation.__init__(self) 285 | self.decorators = {} 286 | 287 | def __call__(self, *args, **kwargs): 288 | """ 289 | When we call the function we don't actually want to execute it, just 290 | to store the parameters passed to it so that we can distribute them 291 | to workers as a shared context. 292 | """ 293 | args, kwargs = self._wrap_argument_functions(args, kwargs) 294 | if args: 295 | self.decorators['args'] = args 296 | if kwargs: 297 | self.decorators['kwargs'] = kwargs 298 | return self 299 | 300 | def _create_with_parent(self, parent): 301 | """ 302 | Use the appropriate operation when the function is used in the pipe.
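# Calling a DecoratedFunction therefore never executes the wrapped Python
# function on the spot; it only records the call's arguments so they can travel
# to the workers as shared context. Hedged sketch -- the @udf_map decorator and
# the UDF calling convention (tuple first, context arguments after) are assumed
# from pycascading.decorators, and `numbers` is a hypothetical upstream pipe:

from pycascading.decorators import udf_map

@udf_map
def scale(tuple, factor):
    yield [tuple.get(0) * factor]   # output may be yielded or returned (YIELDS_OR_RETURNS)

scaled = numbers | scale(10)        # 10 is stored in decorators['args'], not applied yet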
303 | """ 304 | my_type = self.decorators['type'] 305 | if my_type == 'auto': 306 | # Determine the type of function automatically based on the parent 307 | if isinstance(parent, Chainable) and \ 308 | isinstance(parent.get_assembly(), cascading.pipe.GroupBy): 309 | my_type = 'buffer' 310 | else: 311 | raise Exception('Function was not decorated with @udf_map or' \ 312 | ' @udf_filter, and I cannot decide if it is' \ 313 | ' a map or a filter') 314 | if my_type == 'map': 315 | import each 316 | return each.Apply(self)._create_with_parent(parent) 317 | elif my_type == 'filter': 318 | import pycascading.each 319 | return pycascading.each.Filter(self)._create_with_parent(parent) 320 | elif my_type == 'buffer': 321 | import every 322 | return every.Every(buffer=self)._create_with_parent(parent) 323 | else: 324 | raise Exception('Function was not annotated with ' \ 325 | '@udf_map(), @udf_filter(), or @udf_buffer()') 326 | 327 | def _wrap_argument_functions(self, args, kwargs): 328 | """ 329 | Just like the nested function, any arguments that are functions 330 | have to be wrapped. 331 | """ 332 | args_out = [] 333 | for arg in args: 334 | if type(arg) == types.FunctionType: 335 | # args_out.append(_python_function_to_java(arg)) 336 | args_out.append(arg) 337 | else: 338 | args_out.append(arg) 339 | for key in kwargs: 340 | if type(kwargs[key]) == types.FunctionType: 341 | # kwargs[key] = _python_function_to_java(kwargs[key]) 342 | pass 343 | return (tuple(args_out), kwargs) 344 | 345 | @classmethod 346 | def decorate_function(cls, function): 347 | """Return a DecoratedFunction with the default parameters set.""" 348 | dff = DecoratedFunction() 349 | # This is the user-defined Python function 350 | dff.decorators['function'] = function 351 | # If it's used as an Each, Every, or Filter function 352 | dff.decorators['type'] = 'auto' 353 | dff.decorators['input_conversion'] = \ 354 | CascadingBaseOperationWrapper.ConvertInputTuples.NONE 355 | dff.decorators['output_method'] = \ 356 | CascadingRecordProducerWrapper.OutputMethod.YIELDS_OR_RETURNS 357 | dff.decorators['output_type'] = \ 358 | CascadingRecordProducerWrapper.OutputType.AUTO 359 | dff.decorators['args'] = None 360 | dff.decorators['kwargs'] = None 361 | return dff 362 | 363 | 364 | class SubAssembly(Operation): 365 | 366 | """Pipe for a Cascading SubAssembly. 367 | 368 | We can use it in PyCascading to make use of existing subassemblies, 369 | such as Unique. 370 | """ 371 | 372 | def __init__(self, sub_assembly_class, *args): 373 | """Create a pipe for a Cascading SubAssembly. 374 | 375 | This makes use of a cascading.pipe.SubAssembly class. 376 | 377 | Arguments: 378 | sub_assembly_class -- the Cascading SubAssembly class 379 | *args -- parameters passed on to the subassembly's constructor when 380 | it's initialized 381 | """ 382 | self.__sub_assembly_class = sub_assembly_class 383 | self.__args = args 384 | 385 | def _create_with_parent(self, parent): 386 | pipe = self.__sub_assembly_class(parent.get_assembly(), *self.__args) 387 | tails = pipe.getTails() 388 | if len(tails) == 1: 389 | result = tails[0] 390 | else: 391 | result = _Stackable() 392 | result.stack = tails 393 | return result 394 | -------------------------------------------------------------------------------- /python/pycascading/serializers.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 
3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Serialize a Python function. 17 | 18 | This module will serialize a Python function in one of two ways: 19 | * if the function is globally scoped, or a method of a class, it will 20 | serialize it by its name, the module, and class it was defined in. Note that 21 | methods of nested classes cannot be serialized, as nested classes don't hold 22 | references to their nesting class, so they cannot be reloaded from sources. 23 | * if the function is scoped locally (nested), we grab its source so that it 24 | can be reloaded on deserialization. 25 | 26 | Exports the following: 27 | replace_object 28 | """ 29 | 30 | 31 | import inspect, re, types 32 | 33 | import pipe 34 | 35 | 36 | def _remove_indents_from_function(code): 37 | """Remove leading indents from the function's source code. 38 | 39 | Otherwise an exec later when running the function would complain about 40 | the indents. 41 | """ 42 | 43 | def swap_tabs_to_spaces(line): 44 | new_line = '' 45 | for i in xrange(0, len(line)): 46 | if line[i] == ' ': 47 | new_line += line[i] 48 | elif line[i] == '\t': 49 | new_line += ' ' * 8 50 | else: 51 | new_line += line[i : len(line)] 52 | break 53 | return new_line 54 | 55 | lines = code.split('\n') 56 | indent = -1 57 | for line in lines: 58 | m = re.match('^([ \t]*)def\s.*$', line) 59 | if m: 60 | #print line, 'x', m.group(1), 'x' 61 | indent = len(swap_tabs_to_spaces(m.group(1))) 62 | break 63 | if indent < 0: 64 | raise Exception('No def found for function source') 65 | #print 'indent', indent 66 | result = '' 67 | for line in lines: 68 | line = swap_tabs_to_spaces(line) 69 | i = 0 70 | while i < len(line): 71 | if i < indent and line[i] == ' ': 72 | i += 1 73 | else: 74 | break 75 | result += line[i : len(line)] + '\n' 76 | return result 77 | 78 | 79 | def _get_source(func): 80 | """Return the source code for func.""" 81 | return _remove_indents_from_function(inspect.getsource(func)) 82 | 83 | 84 | def function_scope(func): 85 | if (not inspect.isfunction(func)) and (not inspect.ismethod(func)): 86 | raise Exception('Expecting a (non-built-in) function or method') 87 | name = func.func_name 88 | module = inspect.getmodule(func) 89 | module_name = module.__name__ 90 | if module_name == '__main__': 91 | module_name = '' 92 | enclosing_object = None 93 | if inspect.ismethod(func): 94 | if func.im_class == types.ClassType: 95 | # Function is a classmethod 96 | class_name = func.im_self.__name__ 97 | if class_name in dir(module): 98 | # Class is a top-level class in the module 99 | type = 'classmethod' 100 | source = None 101 | else: 102 | raise Exception('Class for @classmethod is nested, and Python ' 103 | 'cannot determine the nesting class, ' 104 | 'thus it\'s not allowed') 105 | else: 106 | # Function is a normal method 107 | class_name = func.im_class.__name__ 108 | enclosing_object = func.im_self 109 | if class_name in dir(module): 110 | # Class is a top-level class in the module 111 | type = 
'method' 112 | source = None 113 | else: 114 | raise Exception('The method\'s class is not top-level') 115 | else: 116 | # The function is a global or nested function, but not a method in a class 117 | class_name = None 118 | if name in dir(module): 119 | # Function is a global function 120 | type = 'global' 121 | source = None 122 | else: 123 | # Function is a closure 124 | type = 'closure' 125 | source = _get_source(func) 126 | return (type, module_name, class_name, name, source) 127 | 128 | 129 | def replace_object(obj): 130 | if inspect.isfunction(obj): 131 | return function_scope(obj) 132 | else: 133 | return None 134 | -------------------------------------------------------------------------------- /python/pycascading/tap.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Taps (sources and sinks) in PyCascading. 17 | 18 | All taps need to be registered using this module because Cascading expects 19 | them to be named by strings when running the flow. 20 | 21 | Exports the following: 22 | Flow 23 | read_hdfs_tsv_file 24 | """ 25 | 26 | __author__ = 'Gabor Szabo' 27 | 28 | 29 | from pycascading.pipe import random_pipe_name, Chainable, Pipe 30 | from com.twitter.pycascading import Util, MetaScheme 31 | 32 | import cascading.tap 33 | import cascading.scheme 34 | from cascading.tuple import Fields 35 | 36 | from org.apache.hadoop.fs import Path 37 | from org.apache.hadoop.conf import Configuration 38 | 39 | from pipe import random_pipe_name, Operation 40 | 41 | 42 | def expand_path_with_home(output_folder): 43 | """Prepend the home folder to a relative location on HDFS if necessary. 44 | 45 | Only if we specified a relative path and no scheme, prepend it with the 46 | home folder of the user on HDFS. This behavior is similar to how 47 | "hadoop fs" works. If we are running in local mode, don't do anything. 48 | 49 | Arguments: 50 | output_folder -- the absolute or relative path of the output HDFS folder 51 | """ 52 | import pycascading.pipe 53 | if pycascading.pipe.config['pycascading.running_mode'] == 'hadoop': 54 | if not any(map(lambda scheme: output_folder.startswith(scheme), \ 55 | ['hdfs:', 'file:', 's3:', 's3n:', '/'])): 56 | fs = Path('/').getFileSystem(Configuration()) 57 | home_folder = fs.getHomeDirectory().toString() 58 | return home_folder + '/' + output_folder 59 | return output_folder 60 | 61 | 62 | class Flow(object): 63 | 64 | """Define sources and sinks for the flow. 65 | 66 | This associates all sources and sinks with their head pipe mappings. 67 | The default number of reducers is 100. Set this in the num_reducers 68 | parameter when starting the flow with run(). 69 | """ 70 | 71 | def __init__(self): 72 | self.source_map = {} 73 | self.sink_map = {} 74 | self.tails = [] 75 | 76 | def _connect_source(self, pipe_name, cascading_tap): 77 | """Add a source to the flow. 
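# What the helpers in serializers.py above produce: function_scope classifies a
# function so the Java side can rebuild it on the workers, either by name or
# from source. Sketch (the module name is shown as '<module>' for brevity):

from pycascading.serializers import function_scope

def top_level(x):
    return x + 1

def make_adder(n):
    def adder(x):                   # nested: can only travel as source code
        return x + n
    return adder

function_scope(top_level)       # ('global', '<module>', None, 'top_level', None)
function_scope(make_adder(2))   # ('closure', '<module>', None, 'adder', '<source of adder>')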
78 | 79 | Cascading needs to map taps to a pipeline with string names. This is 80 | inconvenient, but we need to keep track of these strings. We also need 81 | to count references to taps, as sometimes we need to remove pipelines 82 | due to replacement with a cache, and in this case we may also need to 83 | remove a tap. Otherwise Cascading complains about not all 84 | taps/pipelines being connected up to the flow. 85 | """ 86 | self.source_map[pipe_name] = cascading_tap 87 | 88 | def source(self, cascading_tap): 89 | """A generic source using Cascading taps. 90 | 91 | Arguments: 92 | cascading_tap -- the Cascading Scheme object to store data into 93 | """ 94 | # We can create the source tap right away and also use a Pipe to name 95 | # the head of this pipeline 96 | p = Pipe(name=random_pipe_name('source')) 97 | p.hash = hash(cascading_tap) 98 | p.add_context([p.get_assembly().getName()]) 99 | self._connect_source(p.get_assembly().getName(), cascading_tap) 100 | return p 101 | 102 | def meta_source(self, input_path): 103 | """Use data files in a folder and read the scheme from the meta file. 104 | 105 | Defines a source tap using files in input_path, which should be a 106 | (HDFS) folder. Takes care of using the appropriate scheme that was 107 | used to store the data, using meta data in the data folder. 108 | 109 | Arguments: 110 | input_path -- the HDFS folder to store data into 111 | """ 112 | input_path = expand_path_with_home(input_path) 113 | source_scheme = MetaScheme.getSourceScheme(input_path) 114 | return self.source(cascading.tap.Hfs(source_scheme, input_path)) 115 | 116 | def sink(self, cascading_scheme): 117 | """A Cascading sink using a Cascading Scheme. 118 | 119 | Arguments: 120 | cascading_scheme -- the Cascading Scheme used to store the data 121 | """ 122 | return _Sink(self, cascading_scheme) 123 | 124 | def meta_sink(self, cascading_scheme, output_path): 125 | """Store data together with meta information about the scheme used. 126 | 127 | A sink that also stores in a file information about the scheme used to 128 | store data, and human-readable descriptions in the .pycascading_header 129 | and .pycascading_types files with the field names and their types, 130 | respectively. 131 | 132 | Arguments: 133 | cascading_scheme -- the Cascading Scheme used to store data 134 | output_path -- the folder where the output tuples should be stored. 135 | If it exists, it will be erased and replaced! 136 | """ 137 | output_path = expand_path_with_home(output_path) 138 | sink_scheme = MetaScheme.getSinkScheme(cascading_scheme, output_path) 139 | return self.sink(cascading.tap.Hfs(sink_scheme, output_path, 140 | cascading.tap.SinkMode.REPLACE)) 141 | 142 | def tsv_sink(self, output_path, fields=Fields.ALL): 143 | # TODO: in local mode, do not prepend the home folder to the path 144 | """A sink to store the tuples as tab-separated values in text files. 145 | 146 | Arguments: 147 | output_path -- the folder for the output 148 | fields -- the fields to store. Defaults to all fields. 149 | """ 150 | output_path = expand_path_with_home(output_path) 151 | return self.meta_sink(cascading.scheme.TextDelimited(fields, '\t'), 152 | output_path) 153 | 154 | def binary_sink(self, output_path, fields=Fields.ALL): 155 | """A sink to store binary sequence files to store the output. 156 | 157 | This is a sink that uses the efficient Cascading SequenceFile scheme to 158 | store data. 
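# An end-to-end sketch of the tap API above. The input folder, field names, and
# the TextDelimited scheme arguments are assumptions; meta_sink additionally
# records the scheme plus the .pycascading_header/.pycascading_types files.

from pycascading.tap import Flow
from pycascading.operators import retain
from cascading.tuple import Fields
import cascading.scheme
import cascading.tap

flow = Flow()
visits = flow.source(cascading.tap.Hfs(
    cascading.scheme.TextDelimited(Fields(['user', 'url']), '\t'), 'visit_logs'))
visits | retain('user') | flow.tsv_sink('users_only')
flow.run(num_reducers=10)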
This is a serialized version of all tuples and is 159 | recommended when we want to store intermediate results for fast access 160 | later. 161 | 162 | Arguments: 163 | output_path -- the (HDFS) folder to store data into 164 | fields -- the Cascading Fields field selector of which tuple fields to 165 | store. Defaults to Fields.ALL. 166 | """ 167 | output_path = expand_path_with_home(output_path) 168 | return self.meta_sink(cascading.scheme.SequenceFile(fields), 169 | output_path) 170 | 171 | def cache(self, identifier, refresh=False): 172 | """A sink for temporary results. 173 | 174 | This caches results into a temporary folder if the folder does not 175 | exist yet. If we need to run slightly modified versions of the 176 | PyCascading script several times during testing for instance, this is 177 | very useful to store some results that can be reused without having to 178 | go through the part of the flow that generated them again. 179 | 180 | Arguments: 181 | identifier -- the unique identifier for this cache. This is used as 182 | part of the path where the temporary files are stored. 183 | refresh -- if True, we will regenerate the cache data as if it was 184 | the first time creating it 185 | """ 186 | return _Cache(self, identifier, refresh) 187 | 188 | def run(self, num_reducers=50, config=None): 189 | """Start the Cascading job. 190 | 191 | We call this when we are done building the pipeline and explicitly want 192 | to start the flow process. 193 | """ 194 | sources_used = set([]) 195 | for tail in self.tails: 196 | sources_used.update(tail.context) 197 | # Remove unused sources from the source map 198 | source_map = {} 199 | for source in self.source_map.iterkeys(): 200 | if source in sources_used: 201 | source_map[source] = self.source_map[source] 202 | tails = [t.get_assembly() for t in self.tails] 203 | import pycascading.pipe 204 | Util.run(num_reducers, pycascading.pipe.config, source_map, \ 205 | self.sink_map, tails) 206 | 207 | 208 | class _Sink(Chainable): 209 | 210 | """A PyCascading sink that can be used as the tail in a pipeline. 211 | 212 | Used internally. 213 | """ 214 | 215 | def __init__(self, taps, cascading_tap): 216 | Chainable.__init__(self) 217 | self.__cascading_tap = cascading_tap 218 | self.__taps = taps 219 | 220 | def _create_with_parent(self, parent): 221 | # We need to name every tail differently so that Cascading can assign 222 | # a tail map to all sinks. 
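# cache() gives a tap that acts as sink and source at once: the first run writes
# a binary copy of the intermediate stream, and later runs read it back instead
# of recomputing it. Hedged sketch; `expensive` is a hypothetical pipe built
# earlier on this flow:

cached = expensive | flow.cache('join_step')
cached | flow.tsv_sink('final_output')
flow.run()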
223 | # TODO: revise this after I name every pipe part separately 224 | parent = parent | Pipe(name=random_pipe_name('sink')) 225 | self.__taps.sink_map[parent.get_assembly().getName()] = \ 226 | self.__cascading_tap 227 | self.__taps.tails.append(parent) 228 | return None 229 | 230 | 231 | class _Cache: 232 | 233 | """Act as a source or sink to store and retrieve temporary data.""" 234 | 235 | def __init__(self, taps, hdfs_folder, refresh=False): 236 | tmp_folder = 'pycascading.cache/' + hdfs_folder 237 | self.__cache_folder = expand_path_with_home(tmp_folder) 238 | self.__hdfs_folder_exists = \ 239 | self.hdfs_folder_exists(self.__cache_folder) 240 | self.__taps = taps 241 | self.__refresh = refresh 242 | 243 | def hdfs_folder_exists(self, folder): 244 | path = Path(folder) 245 | fs = path.getFileSystem(Configuration()) 246 | try: 247 | status = fs.getFileStatus(path) 248 | # TODO: there could be problems if it exists but is a simple file 249 | return status.isDir() 250 | except: 251 | return False 252 | 253 | def __or__(self, pipe): 254 | if not self.__refresh and self.__hdfs_folder_exists: 255 | # We remove all sources that are replaced by this cache, otherwise 256 | # Cascading complains about unused source taps 257 | return self.__taps.meta_source(self.__cache_folder) 258 | else: 259 | # We split the data into storing and processing pipelines 260 | pipe | Pipe(random_pipe_name('cache')) | \ 261 | self.__taps.binary_sink(self.__cache_folder) 262 | return pipe | Pipe(random_pipe_name('no_cache')) 263 | -------------------------------------------------------------------------------- /remote_deploy.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Copyright 2011 Twitter, Inc. 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # 19 | # This script is used to deploy a PyCascading job remotely to a server 20 | # where Hadoop is installed. The variables below are the defaults. 21 | # 22 | 23 | # This is the default server where the PyCascading script will be submitted 24 | # to Hadoop. We assume we have SSH access to this server. 25 | server=localhost 26 | 27 | # This is the folder on the remote server where a temporary directory is 28 | # going to be created for the submission. $HOME is only expanded on the 29 | # remote server. 30 | server_deploys_dir='$HOME/pycascading/deploys' 31 | 32 | # The folder on the remote server where the PyCascading master jar will be 33 | # placed. This must be given as an absolute path name so that the master 34 | # files can be found from any directory. 35 | server_build_dir='$HOME/pycascading/master' 36 | 37 | # Additional SSH options (see "man ssh"; private key, etc.) 
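# As implemented above, if the cache folder already exists (and refresh is
# False) the '|' swaps in a meta_source over the cached data; otherwise the
# stream is teed into a binary_sink and processing continues downstream.
# Regeneration can be forced per cache:

cached = expensive | flow.cache('join_step', refresh=True)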
38 | ssh_options="" 39 | 40 | # Additional Hadoop options to be put in the run.sh runner 41 | hadoop_options="" 42 | 43 | 44 | # Options over, the script begins here 45 | 46 | usage() 47 | { 48 | cat < [additional_files] 50 | 51 | The main_script gets executed by PyCascading. All additional_files are also 52 | copied to the remote server and submitted together with the job to Hadoop. 53 | 54 | Options: 55 | -h Show this message. 56 | 57 | -m Also deploy the PyCascading master archives before submitting 58 | the job. The master archives must be on the Hadoop server 59 | before a job can be run. 60 | 61 | -f Copy file to the server together with main_script, but 62 | do not bundle it up for submission. This option may be 63 | repeated several times for multiple files. File names 64 | cannot start with a dot. 65 | 66 | -s The name of the remote server where Hadoop is installed, 67 | and the PyCascading scripts should be deployed to. 68 | 69 | -o Additional options for SSH (such as private key, etc.). 70 | ssh_options is one string enclosed by "s or 's, even if 71 | there are several parameters. 72 | 73 | -O Additional Hadoop options to be put in the running script. 74 | 75 | -r Run the job immediately after submission with SSH. The 76 | recommended way to run a script is either using screen 77 | or nohup, so that the job doesn't get interrupted if the 78 | terminal connection goes down. Note that no additional 79 | command line parameters can be passed in this case for 80 | the job. 81 | 82 | EOF 83 | } 84 | 85 | 86 | # Returns the absolute path for the parameter. We cannot use either realpath 87 | # or readlink, as these may not be installed on MacOS. 88 | # Thanks to Simon Radford. 89 | realpath() 90 | { 91 | if echo "$1" | grep '^/' >/dev/null; then 92 | # Path is absolute 93 | echo "$1" 94 | else 95 | # Path is relative to the working directory 96 | echo "$(pwd)/$1" 97 | fi 98 | } 99 | 100 | 101 | # Remove the leading slashes from a path. This is needed when we package the 102 | # Python sources as tar does the same, and on extraction there are no leading 103 | # slashes. 104 | remove_leading_slash() 105 | { 106 | echo "$1" | sed 's/^\/*//' 107 | } 108 | 109 | 110 | # Copy the master jar over first? The -m option. 111 | master_first=no 112 | 113 | # Run job after submission with SSH? 114 | run_immediately='dont_run' 115 | 116 | declare -a files_to_copy 117 | 118 | while getopts ":hmf:s:o:O:r" OPTION; do 119 | case $OPTION in 120 | h) usage 121 | exit 1 122 | ;; 123 | m) master_first=yes 124 | ;; 125 | f) files_to_copy=("${files_to_copy[@]}" "$OPTARG") 126 | ;; 127 | s) server="$OPTARG" 128 | ;; 129 | o) ssh_options="$OPTARG" 130 | ;; 131 | O) hadoop_options="$OPTARG" 132 | ;; 133 | r) run_immediately='do_run' 134 | ;; 135 | esac 136 | done 137 | shift $((OPTIND-1)) 138 | 139 | main_file="$1" 140 | if [ "$main_file" == "" -a $master_first == no ]; then 141 | usage 142 | exit 3 143 | fi 144 | 145 | home_dir=$(realpath $(dirname "$0")) 146 | # This is the version that works both on Linux and MacOS 147 | tmp_dir=$(mktemp -d -t PyCascading-tmp-XXXXXX) 148 | 149 | if [ $master_first == yes ]; then 150 | build_dir="$home_dir/build" 151 | if [ -a "$build_dir/pycascading.jar" -a \ 152 | -a "$build_dir/pycascading.tgz" ]; then 153 | ln -s "$build_dir/pycascading.jar" "$build_dir/pycascading.tgz" \ 154 | "$home_dir/python/pycascading/bootstrap.py" "$tmp_dir" 155 | else 156 | echo 'Build the PyCascading master package first in the "java" folder with ant.' 
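# Hedged example invocations (host and file names are placeholders only):
#
#   ./remote_deploy.sh -m -s hadoop-gateway.example.com my_flow.py
#       First deployment: also ship the PyCascading master archives, then
#       submit my_flow.py.
#
#   ./remote_deploy.sh -s hadoop-gateway.example.com -r -f lookup.txt my_flow.py
#       Later runs: copy lookup.txt alongside the script and start the job
#       immediately over SSH.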
157 | exit 2 158 | fi 159 | fi 160 | 161 | if [ "$main_file" != "" ]; then 162 | tar -c -z -f "$tmp_dir/sources.tgz" "$@" 163 | if [ ${#files_to_copy} -gt 0 ]; then 164 | tar -c -z -f "$tmp_dir/others.tgz" "${files_to_copy[@]}" 165 | fi 166 | fi 167 | 168 | # 169 | # Create a setup file that will be run on the deploy server after everything 170 | # is copied over. 171 | # 172 | cat >"$tmp_dir/setup.sh" <"$tmp_dir/run.sh" <