├── .gitignore ├── LICENSE ├── NOTICE ├── README.md ├── add_jar_to_build.sh ├── add_tgz_to_build.sh ├── examples ├── README.md ├── cache.py ├── callback.py ├── copy_data_to_hdfs.sh ├── joins.py ├── map_types.py ├── merge_streams.py ├── pagerank.py ├── pycascading_data │ ├── graph.txt │ ├── lhs.txt │ ├── repeats.txt │ ├── rhs.txt │ └── town.txt ├── python_fields.py ├── reduce.py ├── subassembly.py ├── total_sort.py ├── udf_contexts.py └── word_count.py ├── java ├── build.xml ├── dependencies.properties └── src │ └── com │ └── twitter │ └── pycascading │ ├── CascadingAggregatorWrapper.java │ ├── CascadingBaseOperationWrapper.java │ ├── CascadingBufferWrapper.java │ ├── CascadingFilterWrapper.java │ ├── CascadingFunctionWrapper.java │ ├── CascadingRecordProducerWrapper.java │ ├── Main.java │ ├── MetaScheme.java │ ├── PythonEnvironment.java │ ├── PythonObjectInputStream.java │ ├── PythonObjectOutputStream.java │ ├── SelectFields.java │ ├── SerializedPythonFunction.java │ ├── TemporaryHdfs.java │ ├── Util.java │ ├── bigintegerserialization │ ├── BigIntegerComparator.java │ ├── BigIntegerDeserializer.java │ ├── BigIntegerSerialization.java │ └── BigIntegerSerializer.java │ └── pythonserialization │ ├── PythonDeserializer.java │ ├── PythonSerialization.java │ └── PythonSerializer.java ├── local_run.sh ├── python └── pycascading │ ├── __init__.py │ ├── bootstrap.py │ ├── cogroup.py │ ├── decorators.py │ ├── each.py │ ├── every.py │ ├── helpers.py │ ├── init_module.py │ ├── native.py │ ├── operators.py │ ├── pipe.py │ ├── serializers.py │ └── tap.py └── remote_deploy.sh /.gitignore: -------------------------------------------------------------------------------- 1 | .gitignore 2 | build/* 3 | *.class 4 | *.jar 5 | *.pyc 6 | *~ 7 | examples/pycascading_data/out*/ 8 | examples/pycascading_data/maps/ 9 | examples/pycascading.cache/ 10 | .settings/ 11 | .classpath 12 | .project 13 | .pydevproject 14 | private/* 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 
30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | pycascading is a python wrapper for cascading. 2 | Copyright 2011 Twitter, Inc. 3 | 4 | This software has the follow third party dependencies: 5 | 6 | Jython 2.5.2 7 | http://www.jython.org/ 8 | Python Software Foundation License 2.0 9 | 10 | Cascading 1.2.4 11 | http://www.cascading.org/ 12 | GPL 2.0 13 | 14 | Hadoop 0.20.2 15 | http://hadoop.apache.org/ 16 | APL 2.0 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | PyCascading is no longer maintained 2 | =================================== 3 | 4 | PyCascading 5 | =========== 6 | 7 | PyCascading is a Python wrapper for Cascading. You can control the 8 | full data processing workflow from Python. 9 | 10 | * Pipelines are built with Python operators 11 | * User-defined functions are written in Python 12 | * Passing arbitrary contexts to user-defined functions 13 | * Caching of interim results in pipes for faster replay 14 | * Uses Jython 2.5.2, easy integration with Java and Python libraries 15 | 16 | 17 | Examples 18 | -------- 19 | 20 | There can't be a MapReduce tutorial without counting words. Here it is: 21 | 22 | def main(): 23 | ... 
24 | 25 | @udf_map(produces=['word']) 26 | def split_words(tuple): 27 | for word in tuple.get('line').split(): 28 | yield [word] 29 | 30 | input | split_words | group_by('word', native.count()) | output 31 | ... 32 | 33 | Above, the user-defined function that reshapes the stream is annotated with 34 | a PyCascading decorator, and the workflow is created by chaining operations 35 | into each other. 36 | 37 | More examples for the different use cases can be found in the examples folder. 38 | See also the docstrings in the sources for a complete documentation of the 39 | arguments. 40 | 41 | To try the examples, first build the Java sources as described below in the 42 | Building section. Then, change to the 'examples' folder, and issue either 43 | 44 | ../local_run.sh example.py 45 | 46 | for a simulated Hadoop local run, or 47 | 48 | ../remote_deploy.sh -m -s hadoop_server example.py 49 | 50 | to deploy automatically on a Hadoop server. hadoop_server is the SSH address 51 | of an account where the master jar and script will be scp'd to. Note that the 52 | '-m' option has to be used only once in the beginning. The '-m' option copies 53 | the master jar to the server, and any subsequent deploys will use this master 54 | jar, and only the actual Python script will be copied over the network. 55 | 56 | 57 | Usage 58 | ----- 59 | 60 | PyCascading may be used in one of two modes: in local Hadoop mode or with 61 | remote Hadoop deployment. Please note that you need to specify the locations 62 | of the dependencies in the java/dependencies.properties file. 63 | 64 | In *local mode*, the script is executed in Hadoop's local mode. All files 65 | reside on the local file system, and creating a bundled deployment jar is not 66 | necessary. 67 | 68 | To run in this mode, use the script *local_run.sh*, with the first parameter 69 | being the PyCascading script. Additional command line parameters may be used 70 | to pass on to the script. 71 | 72 | In *Hadoop mode*, we assume that Hadoop runs on a remote SSH server (or 73 | localhost). First, a master jar is built and copied to the server. This jar 74 | contains all the PyCascading classes and other dependencies (but not Hadoop) 75 | needed to run a job, and may get rather large if there are a few external jars 76 | included. For this reason it is copied to the Hadoop deployment server only 77 | once, and whenever a new PyCascading script is run by the user, only the 78 | Pythn script is copied to the remote server and bundled there for submission 79 | to Hadoop. The first few variables in the remote_deploy.sh script specify 80 | the Hadoop server and the folders where the deployment files should be placed. 81 | 82 | Use the remote_deploy.sh script to deploy a PyCascading script to the remote 83 | Hadoop server. 84 | 85 | 86 | Building 87 | -------- 88 | 89 | Requirements for building: 90 | 91 | * Cascading 1.2.* or 2.0.0 (http://www.concurrentinc.com/downloads/) 92 | * Jython 2.5.2+ (http://www.jython.org/downloads.html) 93 | * Hadoop 0.20.2+, the version preferably matching the Hadoop runtime 94 | (http://www.apache.org/dyn/closer.cgi/hadoop/common/) 95 | * A Java compiler 96 | * Ant (http://ant.apache.org/) 97 | 98 | Requirements for running: 99 | 100 | * Hadoop installed and set up on the target server (http://hadoop.apache.org/) 101 | * SSH access to the remote server 102 | * If testing scripts locally, a reasonable JVM callable by "java" 103 | 104 | PyCascading consists of Java and Python sources. 
Python sources need no 105 | compiling, but the Java part needs to be built with Ant. For this, change to 106 | the 'java' folder, and invoke ant. This should build the sources and create 107 | a master jar for job submission. 108 | 109 | The locations of the Jython, Cascading, and Hadoop folders on the file system 110 | are specified in the java/dependencies.properties file. You need to correctly 111 | specify these before compiling the source. 112 | 113 | Also, check the remote_deploy.sh script and the locations defined in the 114 | beginning of that file on where to put the jar files on the Hadoop server. 115 | 116 | 117 | Bugs 118 | ---- 119 | 120 | Have a bug or feature request? Please create an issue here on GitHub! 121 | 122 | https://github.com/twitter/pycascading/issues 123 | 124 | 125 | Mailing list 126 | ------------ 127 | 128 | Currently we are using the cascading-user mailing list for discussions. Any 129 | questions, please ask there. 130 | 131 | http://groups.google.com/group/cascading-user 132 | 133 | 134 | Authors 135 | ------- 136 | 137 | **Gabor Szabo** 138 | 139 | + http://twitter.com/gaborjszabo 140 | 141 | License 142 | --------------------- 143 | 144 | Copyright 2011 Twitter, Inc. 145 | 146 | Licensed under the Apache License, Version 2.0 147 | 148 | http://www.apache.org/licenses/LICENSE-2.0 149 | -------------------------------------------------------------------------------- /add_jar_to_build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Copyright 2011 Twitter, Inc. 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # 19 | # Extracts a jar file and adds its contents to the PyCascading jar build. 20 | # 21 | # We need to extract the jar's contents as we expect that it may contain 22 | # further jars, which would not be picked up if we didn't extract the 23 | # whole jar. 24 | # 25 | 26 | usage() 27 | { 28 | cat << EOF 29 | Usage: $0 [ ...] 30 | 31 | Adds the jar files to the main PyCascading jar. This is useful if we have our 32 | own or third party libraries that the PyCascading scripts use, and want to 33 | distribute these to the Hadoop server together with the PyCascading master jar. 34 | 35 | The jar files can contain Java classes, further jars, and Python libraries. 36 | The Java classes should be in folders corresponding to their namespaces, as 37 | usual for jar files. The other Java library jars must be in a \'lib\' folder in 38 | the jar, and the Python imports must be in a \'python\' folder. 39 | 40 | The MANIFEST file, if present, will be discarded. 41 | 42 | Obviously, this script must be run after every new build of PyCascading for all 43 | the jars that should be added to the PyCascading build. 
44 | 45 | EOF 46 | } 47 | 48 | if [ $# -eq 0 ]; then 49 | usage 50 | exit 51 | fi 52 | 53 | home_dir=$(pwd) 54 | pycascading_dir=$(dirname "$0") 55 | 56 | for j in "$@"; do 57 | temp=$(mktemp -d -t PyCascading-tmp-XXXXXX) 58 | cat "$j" | (cd "$temp"; jar x) 59 | rm -rf "$temp/META-INF/MANIFEST.MF" 2>/dev/null 60 | jar -uf "$pycascading_dir/build/pycascading.jar" -C "$temp" . 61 | rm -rf "$temp" 62 | done 63 | -------------------------------------------------------------------------------- /add_tgz_to_build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Copyright 2011 Twitter, Inc. 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # 19 | # Merges tgz archives into the PyCascading tgz build. 20 | # 21 | # The contents of each archive are appended to the master pycascading.tgz, 22 | # so that the Python libraries they contain end up on the search path when 23 | # the job is deployed to the Hadoop server. 24 | # 25 | 26 | usage() 27 | { 28 | cat << EOF 29 | Usage: $0 tgz_file1 [tgz_file2 ...] 30 | 31 | Adds the tgz files to the main PyCascading tgz. This is useful if we have our 32 | own or third party Python libraries that the PyCascading scripts use, and want to 33 | distribute these to the Hadoop server together with the PyCascading master tgz. 34 | 35 | The tgz files can contain Python libraries that will be added to the search path. 36 | 37 | Obviously, this script must be run after every new build of PyCascading for all 38 | the tgzs that should be added to the PyCascading build. 39 | 40 | EOF 41 | } 42 | 43 | if [ $# -eq 0 ]; then 44 | usage 45 | exit 46 | fi 47 | 48 | home_dir=$(pwd) 49 | pycascading_dir=$(dirname "$0") 50 | 51 | temp=$(mktemp -d -t PyCascading-tmp-XXXXXX) 52 | gzip -d <"$pycascading_dir/build/pycascading.tgz" >"$temp/pycascading.tar" 53 | for j in "$@"; do 54 | gzip -d <"$j" >"$temp/archive.tar" 55 | tar -A -f "$temp/pycascading.tar" "$temp/archive.tar" 56 | done 57 | gzip -c <"$temp/pycascading.tar" >"$pycascading_dir/build/pycascading.tgz" 58 | rm -rf "$temp" 59 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | PyCascading examples 2 | ==================== 3 | 4 | This folder showcases a number of features offered by Cascading and 5 | PyCascading.
They use input files in the 'pycascading\_data' folder, so 6 | before running the examples, make sure that: 7 | 8 | * in local mode, you cd first to the examples/ directory (or wherever 9 | pycascading\_data/ is found), and use local\_run.sh to run the example, e.g. ../local\_run.sh word\_count.py 10 | * in Hadoop mode, you copy the data folder to HDFS first by running 11 | copy\_data\_to\_hdfs.sh, or 12 | 13 | hadoop fs -put pycascading\_data pycascading\_data 14 | 15 | and then invoke remote\_deploy.sh 16 | -------------------------------------------------------------------------------- /examples/cache.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Example showing how to use caches. 17 | 18 | A cache saves the result of an operation to a temporary folder, and running 19 | the same script again will take the data from the cached files, instead of 20 | executing the original pipe again. Try to run this job several times with 21 | different separators: after the first run, the checkpointed state will be 22 | used for subsequent runs. 23 | 24 | This is useful if we want to repeatedly run the script with modifications 25 | to parts that do not change the cached results. 26 | 27 | For this script, the first run will have two MR jobs, but any subsequent runs 28 | will only have one, as the checkpointed 'line_begins' result is read back from the cache instead of being recomputed. 29 | """ 30 | 31 | import sys 32 | from pycascading.helpers import * 33 | 34 | 35 | @udf_map 36 | def find_lines_with_beginning(tuple, first_char): 37 | try: 38 | if tuple.get(1)[0] == first_char: 39 | return [tuple.get(1)] 40 | except: 41 | pass 42 | 43 | 44 | @udf_buffer 45 | def concat_all(group, tuples, separator): 46 | out = '' 47 | for tuple in tuples: 48 | try: 49 | out = out + tuple.get(0) + separator 50 | except: 51 | pass 52 | return [out] 53 | 54 | 55 | def main(): 56 | if len(sys.argv) < 2: 57 | print 'A character must be given as a command line argument for the ' \ 58 | 'separator character.'
59 | return 60 | 61 | flow = Flow() 62 | input = flow.source(Hfs(TextLine(), 'pycascading_data/town.txt')) 63 | output = flow.tsv_sink('pycascading_data/out') 64 | 65 | # Select the lines beginning with 'A', and save this intermediate result 66 | # in the cache so that we can call the script several times with 67 | # different separator characters 68 | p = input | map_replace(find_lines_with_beginning('A'), 'line') 69 | # Checkpoint the results from 'p' into a cache folder named 'line_begins' 70 | # The caches are in the user's HDFS folder, under pycascading.cache/ 71 | p = flow.cache('line_begins') | p 72 | # Everything goes to one reducer 73 | p | group_by(Fields.VALUES, concat_all(sys.argv[1]), 'result') | output 74 | 75 | flow.run(num_reducers=1) 76 | -------------------------------------------------------------------------------- /examples/callback.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """ 17 | Contrived example showing that you can pass functions as args to a UDF. 18 | Also shows how to use keyword args (just the way it's expected). 19 | 20 | Thanks to ebernhardson. 21 | """ 22 | 23 | from pycascading.helpers import * 24 | 25 | 26 | def word_count_callback(value): 27 | return len(value.split()) 28 | 29 | 30 | @udf_map 31 | def word_count(tuple, inc, second_inc, callback=None): 32 | return [inc + second_inc + callback(tuple.get(1)), tuple.get(1)] 33 | 34 | 35 | def main(): 36 | flow = Flow() 37 | input = flow.source(Hfs(TextLine(), 'pycascading_data/town.txt')) 38 | output = flow.tsv_sink('pycascading_data/out') 39 | 40 | p = input | map_replace( 41 | word_count(100, second_inc=200, callback=word_count_callback), 42 | ['word_count', 'line']) | output 43 | 44 | flow.run(num_reducers=1) 45 | -------------------------------------------------------------------------------- /examples/copy_data_to_hdfs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Run this on the Hadoop server to copy the data files needed 4 | # to run the PyCascading examples to HDFS 5 | hadoop fs -put pycascading_data pycascading_data 6 | -------------------------------------------------------------------------------- /examples/joins.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Example showing the joining and splitting of tuple streams.""" 17 | 18 | 19 | from pycascading.helpers import * 20 | 21 | 22 | @udf_map(produces=['ucase_lhs2', 'rhs2']) 23 | def upper_case(tuple): 24 | """Return the upper case of the 'lhs2' column, and the 'rhs2' column""" 25 | return [tuple.get('lhs2').upper(), tuple.get('rhs2')] 26 | 27 | 28 | def main(): 29 | flow = Flow() 30 | lhs = flow.source(Hfs(TextDelimited(Fields(['col1', 'col2']), ' ', 31 | [Integer, String]), 32 | 'pycascading_data/lhs.txt')) 33 | rhs = flow.source(Hfs(TextDelimited(Fields(['col1', 'col2']), ' ', 34 | [Integer, String]), 35 | 'pycascading_data/rhs.txt')) 36 | output1 = flow.tsv_sink('pycascading_data/out1') 37 | output2 = flow.tsv_sink('pycascading_data/out2') 38 | 39 | # Join on the first columns ('col1' for both) of lhs and rhs inputs 40 | # We need to use declared_fields since the field names 41 | # of the two pipes overlap 42 | p = (lhs & rhs) | inner_join(['col1', 'col1'], 43 | declared_fields=['lhs1', 'lhs2', 'rhs1', 'rhs2']) 44 | 45 | # Save the 2nd and 4th columns of p to output1 46 | p | retain('lhs2', 'rhs2') | output1 47 | 48 | # Join on the upper-cased first column of p and the 2nd column of rhs, 49 | # and save the output to output2 50 | ((p | upper_case) & (rhs | retain('col2'))) | \ 51 | inner_join(['ucase_lhs2', 'col2']) | output2 52 | 53 | flow.run(num_reducers=2) 54 | -------------------------------------------------------------------------------- /examples/map_types.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Example illustrating the different types of map operations. 17 | 18 | In the output folders check the .pycascading_types and .pycascading_header 19 | files to see what the names of the fields were when the pipes were sunk.
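As a rough illustration (field names follow the comments in main() below): with the
TextLine() source producing the fields ('offset', 'line') and a UDF declared to
produce 'word',

    input | map_replace(decorated_udf)   # output fields: ['word']
    input | map_add(decorated_udf)       # output fields: ['offset', 'line', 'word']

map_replace swaps the consumed input fields for the UDF's output, while map_add
appends the produced field(s) to the input tuple.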
20 | """ 21 | 22 | 23 | from pycascading.helpers import * 24 | 25 | 26 | def main(): 27 | flow = Flow() 28 | input = flow.source(Hfs(TextLine(), 'pycascading_data/town.txt')) 29 | 30 | out_folder = 'pycascading_data/maps/' 31 | 32 | @udf(produces='word') 33 | def decorated_udf(tuple): 34 | for word in tuple.get('line').split(): 35 | yield [word] 36 | 37 | def undecorated_udf(tuple): 38 | for word in tuple.get('line').split(): 39 | yield [word] 40 | 41 | # This will create an output with one field called 'word', as the UDF 42 | # was declared with a 'produces' 43 | # In this case the swap swaps out the whole input tuple with the output 44 | input | map_replace(decorated_udf) | \ 45 | flow.tsv_sink(out_folder + 'decorated_udf') 46 | 47 | # This will create an output with one unnamed field, but otherwise the 48 | # same as the previous one 49 | input | map_replace(undecorated_udf) | \ 50 | flow.tsv_sink(out_folder + 'undecorated_udf') 51 | 52 | # This will only replace the first ('line') field with the output of 53 | # the map, but 'offset' will be retained 54 | # Note that once we add an unnamed field, all field names will be lost 55 | input | map_replace(1, undecorated_udf) | \ 56 | flow.tsv_sink(out_folder + 'undecorated_udf_with_input_args') 57 | 58 | # This will create one field only, 'word', just like the first example 59 | input | map_replace(undecorated_udf, 'word') | \ 60 | flow.tsv_sink(out_folder + 'undecorated_udf_with_output_fields') 61 | 62 | # This one will add the new column, 'word', to all lines 63 | input | map_add(decorated_udf) | \ 64 | flow.tsv_sink(out_folder + 'decorated_udf_all') 65 | 66 | # This produces the same output as the previous example 67 | input | map_add(1, undecorated_udf, 'word') | \ 68 | flow.tsv_sink(out_folder + 'undecorated_udf_all') 69 | 70 | flow.run(num_reducers=1) 71 | -------------------------------------------------------------------------------- /examples/merge_streams.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Merge two streams together. 17 | 18 | We are using Cascading GroupBy with multiple input streams to join them into 19 | one. The streams have to have the same field names and types. 20 | 21 | If the column names are different, Cascading won't even build the flow, 22 | however if the column types differ, the flow is run but most likely will fail 23 | due to different types not being comparable when grouping. 
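As a rough, untested sketch of the workaround, assuming hypothetically that the
second source had used the field names 'id' and 'name': rename them to match the
first stream before merging, e.g.

    (stream1 & (stream2 | rename(['id', 'name'], ['col1', 'col2']))) | group_by() | output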
24 | """ 25 | 26 | from pycascading.helpers import * 27 | 28 | 29 | def main(): 30 | flow = Flow() 31 | stream1 = flow.source(Hfs(TextDelimited(Fields(['col1', 'col2']), ' ', 32 | [Integer, String]), 33 | 'pycascading_data/lhs.txt')) 34 | stream2 = flow.source(Hfs(TextDelimited(Fields(['col1', 'col2']), ' ', 35 | [Integer, String]), 36 | 'pycascading_data/rhs.txt')) 37 | output = flow.tsv_sink('pycascading_data/out') 38 | 39 | (stream1 & stream2) | group_by() | output 40 | 41 | flow.run(num_reducers=1) 42 | -------------------------------------------------------------------------------- /examples/pagerank.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Calculates PageRank for a given graph. 17 | 18 | We assume that there are no dangling pages with no outgoing links. 19 | """ 20 | 21 | import os 22 | from pycascading.helpers import * 23 | 24 | 25 | def test(graph_file, d, iterations): 26 | """This is the Python implementation of PageRank.""" 27 | in_links = {} 28 | out_degree = {} 29 | pagerank = {} 30 | file = open(graph_file) 31 | for line in file: 32 | (source, dest) = line.rstrip().split() 33 | try: 34 | in_links[dest].add(source) 35 | except KeyError: 36 | in_links[dest] = set(source) 37 | try: 38 | out_degree[source] += 1 39 | except KeyError: 40 | out_degree[source] = 1 41 | pagerank[source] = 1.0 42 | pagerank[dest] = 1.0 43 | file.close() 44 | old_pr = pagerank 45 | new_pr = {} 46 | for iteration in xrange(0, iterations): 47 | for node in old_pr: 48 | new_pr[node] = (1 - d) 49 | try: 50 | new_pr[node] += \ 51 | d * sum([old_pr[n] / out_degree[n] for n in in_links[node]]) 52 | except KeyError: 53 | pass 54 | tmp = old_pr 55 | old_pr = new_pr 56 | new_pr = tmp 57 | return old_pr 58 | 59 | 60 | def main(): 61 | """The PyCascading job.""" 62 | # The damping factor 63 | d = 0.85 64 | # The number of iterations 65 | iterations = 5 66 | 67 | # The directed, unweighted graph in a space-separated file, in 68 | # format 69 | graph_file = 'pycascading_data/graph.txt' 70 | 71 | graph_source = Hfs(TextDelimited(Fields(['from', 'to']), ' ', 72 | [String, String]), graph_file) 73 | 74 | out_links_file = 'pycascading_data/out/pagerank/out_links' 75 | pr_values_1 = 'pycascading_data/out/pagerank/iter1' 76 | pr_values_2 = 'pycascading_data/out/pagerank/iter2' 77 | 78 | # Some setup here: we'll need the ougoing degree of nodes, and we will 79 | # initialize the pageranks of nodes to 1.0 80 | flow = Flow() 81 | graph = flow.source(graph_source) 82 | 83 | # Count the number of outgoing links for every node that is a source, 84 | # and store it in a field called 'out_degree' 85 | graph | group_by('from') | native.count('out_degree') | \ 86 | flow.binary_sink(out_links_file) 87 | 88 | # Initialize the pageranks of all nodes to 1.0 89 | # This file has fields 'node' and 'pagerank', and is stored to pr_values_1 90 | @udf 91 | def 
constant(tuple, c): 92 | """Just a field with a constant value c.""" 93 | yield [c] 94 | @udf 95 | def both_nodes(tuple): 96 | """For each link returns both endpoints.""" 97 | yield [tuple.get(0)] 98 | yield [tuple.get(1)] 99 | graph | map_replace(both_nodes, 'node') | \ 100 | native.unique(Fields.ALL) | map_add(constant(1.0), 'pagerank') | \ 101 | flow.binary_sink(pr_values_1) 102 | 103 | flow.run(num_reducers=1) 104 | 105 | pr_input = pr_values_1 106 | pr_output = pr_values_2 107 | for iteration in xrange(0, iterations): 108 | flow = Flow() 109 | 110 | graph = flow.source(graph_source) 111 | pageranks = flow.meta_source(pr_input) 112 | out_links = flow.meta_source(out_links_file) 113 | 114 | # Decorate the graph's source nodes with their pageranks and the 115 | # number of their outgoing links 116 | # We could have joined graph & out_links outside of the loop, but 117 | # in order to demonstrate joins with multiple streams, we do it here 118 | p = (graph & pageranks & (out_links | rename('from', 'from_out'))) | \ 119 | inner_join(['from', 'node', 'from_out']) | \ 120 | rename(['pagerank', 'out_degree'], ['from_pagerank', 'from_out_degree']) | \ 121 | retain('from', 'from_pagerank', 'from_out_degree', 'to') 122 | 123 | # Distribute the sources' pageranks to their out-neighbors equally 124 | @udf 125 | def incremental_pagerank(tuple, d): 126 | yield [d * tuple.get('from_pagerank') / tuple.get('from_out_degree')] 127 | p = p | map_replace(['from', 'from_pagerank', 'from_out_degree'], 128 | incremental_pagerank(d), 'incr_pagerank') | \ 129 | rename('to', 'node') | retain('node', 'incr_pagerank') 130 | 131 | # Add the constant jump probability to all the pageranks that come 132 | # from the in-links 133 | p = (p & (pageranks | map_replace('pagerank', constant(1.0 - d), 'incr_pagerank'))) | group_by() 134 | p = p | group_by('node', 'incr_pagerank', native.sum('pagerank')) 135 | 136 | if iteration == iterations - 1: 137 | # Only store the final result in a TSV file 138 | p | flow.tsv_sink(pr_output) 139 | else: 140 | # Store intermediate results in a binary format for faster IO 141 | p | flow.binary_sink(pr_output) 142 | 143 | # Swap the input and output folders for the next iteration 144 | tmp = pr_input 145 | pr_input = pr_output 146 | pr_output = tmp 147 | 148 | flow.run(num_reducers=1) 149 | 150 | print 'Results from PyCascading:', pr_input 151 | os.system('cat %s/.pycascading_header %s/part*' % (pr_input, pr_input)) 152 | 153 | print 'The test values:' 154 | test_pr = test(graph_file, d, iterations) 155 | print 'node\tpagerank' 156 | for n in sorted(test_pr.iterkeys()): 157 | print '%s\t%g' % (n, test_pr[n]) 158 | -------------------------------------------------------------------------------- /examples/pycascading_data/graph.txt: -------------------------------------------------------------------------------- 1 | 1 2 2 | 1 3 3 | 1 4 4 | 2 4 5 | 3 4 6 | 4 2 7 | -------------------------------------------------------------------------------- /examples/pycascading_data/lhs.txt: -------------------------------------------------------------------------------- 1 | 1 a 2 | 1 b 3 | 1 c 4 | 2 b 5 | 2 c 6 | 2 d 7 | 3 c 8 | 4 b 9 | 4 c 10 | 4 d 11 | 5 a 12 | 5 b 13 | 5 e 14 | -------------------------------------------------------------------------------- /examples/pycascading_data/repeats.txt: -------------------------------------------------------------------------------- 1 | a 1 2 | b 2 3 | c 3 4 | b 2 5 | a 1 6 | a 1 7 | c 3 8 | b 2 9 | a 1 10 | 
-------------------------------------------------------------------------------- /examples/pycascading_data/rhs.txt: -------------------------------------------------------------------------------- 1 | 1 A 2 | 1 B 3 | 1 C 4 | 2 B 5 | 2 C 6 | 2 D 7 | 3 C 8 | 4 B 9 | 4 C 10 | 4 D 11 | 5 A 12 | 5 B 13 | 5 E 14 | -------------------------------------------------------------------------------- /examples/pycascading_data/town.txt: -------------------------------------------------------------------------------- 1 | There's many a strong farmer 2 | Whose heart would break in two, 3 | If he could see the townland 4 | That we are riding to; 5 | Boughs have their fruit and blossom 6 | At all times of the year; 7 | Rivers are running over 8 | With red beer and brown beer. 9 | An old man plays the bagpipes 10 | In a golden and silver wood; 11 | Queens, their eyes blue like the ice, 12 | Are dancing in a crowd. 13 | 14 | The little fox he murmured, 15 | 'O what of the world's bane?' 16 | The sun was laughing sweetly, 17 | The moon plucked at my rein; 18 | But the little red fox murmured, 19 | 'O do not pluck at his rein, 20 | He is riding to the townland 21 | That is the world's bane.' 22 | 23 | When their hearts are so high 24 | That they would come to blows, 25 | They unhook their heavy swords 26 | From golden and silver boughs; 27 | But all that are killed in battle 28 | Awaken to life again. 29 | It is lucky that their story 30 | Is not known among men, 31 | For O, the strong farmers 32 | That would let the spade lie, 33 | Their hearts would be like a cup 34 | That somebody had drunk dry. 35 | 36 | The little fox he murmured, 37 | 'O what of the world's bane?' 38 | The sun was laughing sweetly, 39 | The moon plucked at my rein; 40 | But the little red fox murmured, 41 | 'O do not pluck at his rein, 42 | He is riding to the townland 43 | That is the world's bane.' 44 | 45 | Michael will unhook his trumpet 46 | From a bough overhead, 47 | And blow a little noise 48 | When the supper has been spread. 49 | Gabriel will come from the water 50 | With a fish-tail, and talk 51 | Of wonders that have happened 52 | On wet roads where men walk. 53 | And lift up an old horn 54 | Of hammered silver, and drink 55 | Till he has fallen asleep 56 | Upon the starry brink. 57 | 58 | The little fox he murmured, 59 | 'O what of the world's bane?' 60 | The sun was laughing sweetly, 61 | The moon plucked at my rein; 62 | But the little red fox murmured. 63 | 'O do not pluck at his rein, 64 | He is riding to the townland 65 | That is the world's bane.' 66 | -------------------------------------------------------------------------------- /examples/python_fields.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Example demonstrating the use of arbitrary Python (or Java) data in tuples. 17 | 18 | The fields have to implement Serializable. 
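The nested list and dictionary returned by add_python_data() below are one
example of such values (see the note on Java serialization further down).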
19 | 20 | Currently these fields cannot be joined on, since we do not want to 21 | deserialize them for each comparison. We are also doing a join here to test 22 | the serializers. 23 | 24 | Note that the serialization is currently done using the standard Java 25 | serialization framework, and thus is slow and produces large blobs. There are 26 | plans to use more efficient serializers in the future. 27 | """ 28 | 29 | 30 | from pycascading.helpers import * 31 | 32 | 33 | @udf_map(produces=['col1', 'col2', 'info']) 34 | def add_python_data(tuple): 35 | """This function returns a Python data structure as well.""" 36 | return [ tuple.get(0), tuple.get(1), [ 'first', { 'key' : 'value' } ]] 37 | 38 | 39 | def main(): 40 | flow = Flow() 41 | lhs = flow.source(Hfs(TextDelimited(Fields(['col1', 'col2']), ' ', 42 | [Integer, String]), 43 | 'pycascading_data/lhs.txt')) 44 | rhs = flow.source(Hfs(TextDelimited(Fields(['col1', 'col2']), ' ', 45 | [Integer, String]), 46 | 'pycascading_data/rhs.txt')) 47 | 48 | ((lhs | add_python_data()) & rhs) | inner_join(['col1', 'col1'], 49 | declared_fields=['lhs1', 'lhs2', 'info', 'rhs1', 'rhs2']) | \ 50 | flow.tsv_sink('pycascading_data/out') 51 | 52 | flow.run(num_reducers=2) 53 | -------------------------------------------------------------------------------- /examples/reduce.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Example showing how to use filters and buffers. 17 | 18 | A buffer UDF is similar to the built-in Python reduce function. It takes a 19 | group of tuples that have been previously grouped by group_by, and yields an 20 | arbitrary number of new tuples for the group (it is most useful though to do 21 | some aggregation on the group). The tuples are fetched using an iterator. 
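As a minimal sketch (the 'key' and 'value' field names here are only
illustrative), a buffer that sums a numeric column for each group could look
like this:

    @udf_buffer(produces=['total'])
    def sum_values(group, tuples):
        total = 0
        for tuple in tuples:
            total += tuple.get('value')
        yield [total]

    p | group_by('key', sum_values()) | output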
22 | """ 23 | 24 | from pycascading.helpers import * 25 | 26 | 27 | @udf_filter 28 | def starts_with_letter(tuple, letter): 29 | try: 30 | return tuple.get(1)[0].upper() == letter 31 | except: 32 | return False 33 | 34 | 35 | @udf_map 36 | def word_count(tuple): 37 | return [len(tuple.get(1).split()), tuple.get(1)] 38 | 39 | 40 | def main(): 41 | flow = Flow() 42 | input = flow.source(Hfs(TextLine(), 'pycascading_data/town.txt')) 43 | output = flow.tsv_sink('pycascading_data/out') 44 | 45 | p = input | filter_by(starts_with_letter('A')) | \ 46 | map_replace(word_count(), ['word_count', 'line']) 47 | 48 | @udf_buffer(produces=['word_count', 'count', 'first_chars']) 49 | def count(group, tuples): 50 | """Counts the number of tuples in the group, and also emits a string 51 | that is the first character of the 'line' column repeated this many 52 | times.""" 53 | c = 0 54 | first_char = '' 55 | for tuple in tuples: 56 | c += 1 57 | first_char += tuple.get('line')[0] 58 | yield [group.get(0), c, first_char] 59 | 60 | p | group_by('word_count', count()) | output 61 | 62 | flow.run(num_reducers=2) 63 | -------------------------------------------------------------------------------- /examples/subassembly.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Example demonstrating the use of predefined subassemblies. 17 | 18 | Useful aggregators, subassemblies, pipes available in Cascading are imported 19 | into PyCascading by native.py 20 | """ 21 | 22 | from pycascading.helpers import * 23 | 24 | 25 | def main(): 26 | flow = Flow() 27 | repeats = flow.source(Hfs(TextDelimited(Fields(['col1', 'col2']), ' ', 28 | [String, Integer]), 29 | 'pycascading_data/repeats.txt')) 30 | output = flow.tsv_sink('pycascading_data/out') 31 | 32 | # This selects the distinct records considering all fields 33 | repeats | native.unique(Fields.ALL) | output 34 | 35 | flow.run() 36 | -------------------------------------------------------------------------------- /examples/total_sort.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # 15 | 16 | """Simple word count example with reverse sorting of the words by frequency.""" 17 | 18 | from pycascading.helpers import * 19 | 20 | 21 | def main(): 22 | flow = Flow() 23 | input = flow.source(Hfs(TextLine(), 'pycascading_data/town.txt')) 24 | output = flow.tsv_sink('pycascading_data/out') 25 | 26 | @udf_map 27 | def split_words(tuple): 28 | for word in tuple.get(1).split(): 29 | yield [word] 30 | 31 | input | \ 32 | map_replace(split_words, 'word') | \ 33 | group_by('word') | \ 34 | native.count() | \ 35 | group_by(Fields.VALUES, sort_fields=['count'], reverse_order=True) | \ 36 | output 37 | 38 | flow.run(num_reducers=5) 39 | -------------------------------------------------------------------------------- /examples/udf_contexts.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Example showing how to pass in parameters to UDFs. 17 | 18 | The context is serialized and shipped to where the UDFs are executed. A use 19 | case for example is to perform replicated joins on constant data. 20 | """ 21 | 22 | from pycascading.helpers import * 23 | 24 | 25 | def main(): 26 | flow = Flow() 27 | input = flow.source(Hfs(TextLine(), 'pycascading_data/town.txt')) 28 | output = flow.tsv_sink('pycascading_data/out') 29 | 30 | @udf_filter 31 | def starts_with_letters(tuple, field, letters): 32 | """Only let tuples through whose second field starts with a given letter. 33 | 34 | The set of acceptable initial letters is passed in the letters parameter, 35 | and is defined at the time when we build the flow. 36 | """ 37 | try: 38 | return tuple.get(field)[0].upper() in letters 39 | except: 40 | return False 41 | 42 | # Retain only lines that start with an 'A' or 'T' 43 | input | retain('line') | starts_with_letters(0, set(['A', 'T'])) | output 44 | 45 | flow.run(num_reducers=2) 46 | -------------------------------------------------------------------------------- /examples/word_count.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Simple word count example.""" 17 | 18 | from pycascading.helpers import * 19 | 20 | 21 | @udf_map(produces=['word']) 22 | def split_words(tuple): 23 | """The function to split the line and return several new tuples. 
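For example, the input line "An old man plays the bagpipes" from town.txt yields
six one-field tuples: ['An'], ['old'], ['man'], ['plays'], ['the'], ['bagpipes'].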
24 | 25 | The tuple to operate on is passed in as the first parameter. We are 26 | yielding the results in a for loop back. Each word becomes the only field 27 | in a new tuple stream, and the string to be split is the 2nd field of the 28 | input tuple. 29 | """ 30 | for word in tuple.get(1).split(): 31 | yield [word] 32 | 33 | 34 | def main(): 35 | flow = Flow() 36 | # The TextLine() scheme produces tuples where the first field is the 37 | # offset of the line in the file, and the second is the line as a string. 38 | input = flow.source(Hfs(TextLine(), 'pycascading_data/town.txt')) 39 | output = flow.tsv_sink('pycascading_data/out') 40 | 41 | input | split_words | group_by('word', native.count()) | output 42 | 43 | flow.run(num_reducers=2) 44 | -------------------------------------------------------------------------------- /java/build.xml: -------------------------------------------------------------------------------- 1 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 65 | 66 | 67 | 68 | 69 | 70 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /java/dependencies.properties: -------------------------------------------------------------------------------- 1 | # The folder where Cascading was downloaded to 2 | # http://www.concurrentinc.com/downloads/ 3 | cascading=/opt/cascading-1.2.5-hadoop-0.19.2+ 4 | 5 | # At least Jython version 2.5.2 required 6 | # Download from http://www.jython.org/downloads.html 7 | jython=/opt/jython 8 | 9 | # Hadoop's folder 10 | # Download from http://www.apache.org/dyn/closer.cgi/hadoop/common/ 11 | hadoop=/opt/hadoop-0.20.203.0 12 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/CascadingAggregatorWrapper.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2011 Twitter, Inc. 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | package com.twitter.pycascading; 16 | 17 | import java.io.Serializable; 18 | 19 | import cascading.flow.FlowProcess; 20 | import cascading.operation.Aggregator; 21 | import cascading.operation.AggregatorCall; 22 | import cascading.tuple.Fields; 23 | import cascading.tuple.TupleEntry; 24 | import cascading.tuple.TupleEntryCollector; 25 | 26 | /** 27 | * Wrapper for a Cascading Aggregator that calls a Python function. 
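 * Note that the start/aggregate/complete implementations below are stubs that
 * only print debug output and do not call into the Python function yet.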
28 | * TODO: we don't really need this, as Buffers are just as good as Aggregators 29 | * 30 | * @author Gabor Szabo 31 | */ 32 | @SuppressWarnings("rawtypes") 33 | public class CascadingAggregatorWrapper extends CascadingRecordProducerWrapper implements 34 | Aggregator, Serializable { 35 | private static final long serialVersionUID = -5110929817978998473L; 36 | 37 | public CascadingAggregatorWrapper() { 38 | super(); 39 | } 40 | 41 | public CascadingAggregatorWrapper(Fields fieldDeclaration) { 42 | super(fieldDeclaration); 43 | } 44 | 45 | public CascadingAggregatorWrapper(int numArgs) { 46 | super(numArgs); 47 | } 48 | 49 | public CascadingAggregatorWrapper(int numArgs, Fields fieldDeclaration) { 50 | super(numArgs, fieldDeclaration); 51 | } 52 | 53 | @Override 54 | public void start(FlowProcess flowProcess, AggregatorCall aggregatorCall) { 55 | // TODO Auto-generated method stub 56 | System.out.println("Aggregator start called"); 57 | } 58 | 59 | @Override 60 | public void aggregate(FlowProcess flowProcess, AggregatorCall aggregatorCall) { 61 | TupleEntry group = aggregatorCall.getGroup(); 62 | TupleEntryCollector outputCollector = aggregatorCall.getOutputCollector(); 63 | 64 | System.out.println("Aggregator called with group: " + group); 65 | } 66 | 67 | @Override 68 | public void complete(FlowProcess flowProcess, AggregatorCall aggregatorCall) { 69 | // TODO Auto-generated method stub 70 | System.out.println("Aggregator complete called"); 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/CascadingBufferWrapper.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2011 Twitter, Inc. 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | package com.twitter.pycascading; 16 | 17 | import java.io.IOException; 18 | import java.io.ObjectInputStream; 19 | import java.io.Serializable; 20 | import java.util.Iterator; 21 | 22 | import org.python.core.Py; 23 | 24 | import cascading.flow.FlowProcess; 25 | import cascading.operation.Buffer; 26 | import cascading.operation.BufferCall; 27 | import cascading.tuple.Fields; 28 | import cascading.tuple.TupleEntry; 29 | import cascading.tuple.TupleEntryCollector; 30 | 31 | /** 32 | * Wrapper for a Cascading Buffer that calls a Python function. 
33 | * 34 | * @author Gabor Szabo 35 | */ 36 | @SuppressWarnings("rawtypes") 37 | public class CascadingBufferWrapper extends CascadingRecordProducerWrapper implements Buffer, 38 | Serializable { 39 | private static final long serialVersionUID = -3512295576396796360L; 40 | 41 | public CascadingBufferWrapper() { 42 | super(); 43 | } 44 | 45 | public CascadingBufferWrapper(Fields fieldDeclaration) { 46 | super(fieldDeclaration); 47 | } 48 | 49 | public CascadingBufferWrapper(int numArgs) { 50 | super(numArgs); 51 | } 52 | 53 | public CascadingBufferWrapper(int numArgs, Fields fieldDeclaration) { 54 | super(numArgs, fieldDeclaration); 55 | } 56 | 57 | private void readObject(ObjectInputStream stream) throws IOException, ClassNotFoundException { 58 | setupArgs(); 59 | } 60 | 61 | public int getNumParameters() { 62 | return super.getNumParameters() + 1; 63 | } 64 | 65 | @Override 66 | public void operate(FlowProcess flowProcess, BufferCall bufferCall) { 67 | // TODO: if the Python buffer expects Python dicts or lists, then we need to 68 | // convert the Iterator 69 | @SuppressWarnings("unchecked") 70 | Iterator arguments = bufferCall.getArgumentsIterator(); 71 | 72 | // This gets called even when there are no tuples in the group after 73 | // a GroupBy (see the Buffer javadoc). So we need to check if there are any 74 | // valid tuples returned in the group. 75 | if (arguments.hasNext()) { 76 | TupleEntry group = bufferCall.getGroup(); 77 | TupleEntryCollector outputCollector = bufferCall.getOutputCollector(); 78 | 79 | callArgs[0] = Py.java2py(group); 80 | callArgs[1] = Py.java2py(arguments); 81 | if (outputMethod == OutputMethod.COLLECTS) { 82 | callArgs[2] = Py.java2py(outputCollector); 83 | callFunction(); 84 | } else { 85 | Object ret = callFunction(); 86 | collectOutput(outputCollector, ret); 87 | } 88 | } 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/CascadingFilterWrapper.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2011 Twitter, Inc. 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | package com.twitter.pycascading; 16 | 17 | import java.io.ObjectInputStream; 18 | import java.io.Serializable; 19 | 20 | import org.python.core.Py; 21 | import org.python.core.PyObject; 22 | 23 | import cascading.flow.FlowProcess; 24 | import cascading.operation.Filter; 25 | import cascading.operation.FilterCall; 26 | import cascading.tuple.Fields; 27 | 28 | /** 29 | * Wrapper for a Cascading Filter that calls a Python function. 
30 | * 31 | * @author Gabor Szabo 32 | */ 33 | @SuppressWarnings("rawtypes") 34 | public class CascadingFilterWrapper extends CascadingBaseOperationWrapper implements Filter, 35 | Serializable { 36 | private static final long serialVersionUID = -8825679328970045134L; 37 | 38 | public CascadingFilterWrapper() { 39 | super(); 40 | } 41 | 42 | public CascadingFilterWrapper(Fields fieldDeclaration) { 43 | // If we set it to anything other than Fields.ALL, Cascading complains 44 | super(Fields.ALL); 45 | } 46 | 47 | public CascadingFilterWrapper(int numArgs) { 48 | super(numArgs); 49 | } 50 | 51 | public CascadingFilterWrapper(int numArgs, Fields fieldDeclaration) { 52 | super(numArgs, fieldDeclaration); 53 | } 54 | 55 | public int getNumParameters() { 56 | return 1; 57 | } 58 | 59 | private void readObject(ObjectInputStream stream) { 60 | setupArgs(); 61 | } 62 | 63 | @Override 64 | public boolean isRemove(FlowProcess flowProcess, FilterCall filterCall) { 65 | Object tuple = convertInput(filterCall.getArguments()); 66 | callArgs[0] = Py.java2py(tuple); 67 | PyObject ret = callFunction(); 68 | return !Py.py2boolean(ret); 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/CascadingFunctionWrapper.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2011 Twitter, Inc. 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | package com.twitter.pycascading; 16 | 17 | import java.io.ObjectInputStream; 18 | import java.io.Serializable; 19 | 20 | import org.python.core.Py; 21 | 22 | import cascading.flow.FlowProcess; 23 | import cascading.operation.Function; 24 | import cascading.operation.FunctionCall; 25 | import cascading.operation.OperationCall; 26 | import cascading.tuple.Fields; 27 | import cascading.tuple.TupleEntryCollector; 28 | 29 | /** 30 | * Wrapper for a Cascading Function that calls a Python function. 31 | * 32 | * @author Gabor Szabo 33 | */ 34 | @SuppressWarnings("rawtypes") 35 | public class CascadingFunctionWrapper extends CascadingRecordProducerWrapper implements Function, 36 | Serializable { 37 | private static final long serialVersionUID = -3512295576396796360L; 38 | 39 | public CascadingFunctionWrapper() { 40 | super(); 41 | } 42 | 43 | public CascadingFunctionWrapper(Fields fieldDeclaration) { 44 | super(fieldDeclaration); 45 | } 46 | 47 | public CascadingFunctionWrapper(int numArgs) { 48 | super(numArgs); 49 | } 50 | 51 | public CascadingFunctionWrapper(int numArgs, Fields fieldDeclaration) { 52 | super(numArgs, fieldDeclaration); 53 | } 54 | 55 | /** 56 | * We need to call setupArgs() from here, otherwise CascadingFunctionWrapper 57 | * is not initialized yet if we call it from CascadingBaseOperationWrapper. 
58 | */ 59 | private void readObject(ObjectInputStream stream) { 60 | setupArgs(); 61 | } 62 | 63 | @Override 64 | public void prepare(FlowProcess flowProcess, OperationCall operationCall) { 65 | super.prepare(flowProcess, operationCall); 66 | } 67 | 68 | @Override 69 | public void operate(FlowProcess flowProcess, FunctionCall functionCall) { 70 | Object inputTuple = convertInput(functionCall.getArguments()); 71 | TupleEntryCollector outputCollector = functionCall.getOutputCollector(); 72 | 73 | callArgs[0] = Py.java2py(inputTuple); 74 | if (outputMethod == OutputMethod.COLLECTS) { 75 | // The Python function collects the output tuples itself into the output 76 | // collector 77 | callArgs[1] = Py.java2py(outputCollector); 78 | callFunction(); 79 | } else { 80 | // The Python function yields or returns records 81 | Object ret = callFunction(); 82 | collectOutput(outputCollector, ret); 83 | } 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/CascadingRecordProducerWrapper.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2011 Twitter, Inc. 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | package com.twitter.pycascading; 16 | 17 | import java.io.Serializable; 18 | 19 | import org.python.core.PyGenerator; 20 | import org.python.core.PyNone; 21 | import org.python.core.PyObject; 22 | import org.python.core.PySequenceList; 23 | 24 | import cascading.tuple.Fields; 25 | import cascading.tuple.Tuple; 26 | import cascading.tuple.TupleEntry; 27 | import cascading.tuple.TupleEntryCollector; 28 | 29 | /** 30 | * This class is the parent class for Cascading Functions and Buffers. It 31 | * essetially converts records coming from the Python function to tuples. 32 | * 33 | * @author Gabor Szabo 34 | */ 35 | public class CascadingRecordProducerWrapper extends CascadingBaseOperationWrapper implements 36 | Serializable { 37 | private static final long serialVersionUID = -1198203231681047370L; 38 | 39 | // This is how the Python function returns the output tuples. It can add them 40 | // to the output collector right away, provide a generator to yield one or 41 | // more records, or return one record only. YIELDS_OR_RETURNS means that 42 | // PyCascading should determine automatically if it's a generator or a normal 43 | // function. 44 | public enum OutputMethod { 45 | COLLECTS, YIELDS, RETURNS, YIELDS_OR_RETURNS 46 | } 47 | 48 | // This is what the Python function returns: a Python list or a Cascading 49 | // tuple, or PyCascading can also figure it out automatically from the first 50 | // record returned. 51 | // 52 | // AUTO means that the type of the very first object returned from the 53 | // Python @map determines what type we are going to use. 
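  // Editorial note (illustrative sketch, not part of the original source): to see how
  // OutputMethod and OutputType work together, consider the split_words @udf_map from
  // examples/word_count.py earlier in this repository:
  //
  //     @udf_map(produces=['word'])
  //     def split_words(tuple):
  //         for word in tuple.get(1).split():
  //             yield [word]
  //
  // Calling it returns a PyGenerator, so under YIELDS_OR_RETURNS the wrapper settles on
  // YIELDS in collectOutput(); with OutputType.AUTO the first yielded record (a Python
  // list) selects PYTHON_LIST, and each list is then converted to a Cascading Tuple in
  // castPythonObject() below.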
54 | public enum OutputType { 55 | AUTO, PYTHON_LIST, TUPLE, TUPLEENTRY 56 | } 57 | 58 | protected OutputMethod outputMethod; 59 | protected OutputType outputType; 60 | 61 | public CascadingRecordProducerWrapper() { 62 | super(); 63 | } 64 | 65 | public CascadingRecordProducerWrapper(Fields fieldDeclaration) { 66 | super(fieldDeclaration); 67 | } 68 | 69 | public CascadingRecordProducerWrapper(int numArgs) { 70 | super(numArgs); 71 | } 72 | 73 | public CascadingRecordProducerWrapper(int numArgs, Fields fieldDeclaration) { 74 | super(numArgs, fieldDeclaration); 75 | } 76 | 77 | public int getNumParameters() { 78 | return (outputMethod == OutputMethod.COLLECTS ? 2 : 1); 79 | } 80 | 81 | /** 82 | * Cast the returned or yielded array to a Tuple, and add it to the output 83 | * collector. 84 | * 85 | * @param ret 86 | * the object (list) returned from the Python function 87 | * @param outputCollector 88 | * the output collector in which we place the Tuple 89 | * @param simpleCastIfTuple 90 | * if we can simply cast ret to a Tuple, or have to call Jython's 91 | * casting 92 | */ 93 | private void castPythonObject(Object ret, TupleEntryCollector outputCollector, 94 | boolean simpleCastIfTuple) { 95 | if (outputType == OutputType.AUTO) { 96 | // We need to determine the type of the record now 97 | if (PySequenceList.class.isInstance(ret)) 98 | outputType = OutputType.PYTHON_LIST; 99 | else if (Tuple.class.isInstance(ret)) 100 | outputType = OutputType.TUPLE; 101 | else if (TupleEntry.class.isInstance(ret)) 102 | outputType = OutputType.TUPLEENTRY; 103 | else 104 | throw new RuntimeException( 105 | "Python function must return a list, Tuple, or TupleEnty. We got: " 106 | + ret.getClass()); 107 | } 108 | if (outputType == OutputType.PYTHON_LIST) 109 | // Convert the returned Python list to a tuple 110 | // We can return both a Python (immutable) tuple and a list, so we 111 | // need to use their common superclass, PySequenceList. 112 | try { 113 | outputCollector.add(new Tuple(((PySequenceList) ret).toArray())); 114 | } catch (ClassCastException e) { 115 | throw new RuntimeException( 116 | "Python function or generator must return a Python list, we got " + ret.getClass() 117 | + " instead"); 118 | } 119 | else if (outputType == OutputType.TUPLE) { 120 | try { 121 | // For some reason yield doesn't wrap the object in a Jython 122 | // container, but return does 123 | if (simpleCastIfTuple) 124 | outputCollector.add((Tuple) ret); 125 | else 126 | outputCollector.add((Tuple) ((PyObject) ret).__tojava__(Tuple.class)); 127 | } catch (ClassCastException e) { 128 | throw new RuntimeException( 129 | "Python function or generator must return a Cascading Tuple, we got " 130 | + ret.getClass() + " instead"); 131 | } 132 | } else { 133 | try { 134 | outputCollector.add((TupleEntry) ((PyObject) ret).__tojava__(TupleEntry.class)); 135 | } catch (ClassCastException e) { 136 | throw new RuntimeException( 137 | "Python function or generator must return a Cascading TupleEntry, we got " 138 | + ret.getClass() + " instead"); 139 | } 140 | } 141 | } 142 | 143 | protected void collectOutput(TupleEntryCollector outputCollector, Object ret) { 144 | if (ret == null) 145 | return; 146 | if (outputMethod == OutputMethod.YIELDS_OR_RETURNS) { 147 | // Determine automatically whether the function yields or returns 148 | outputMethod = (PyGenerator.class.isInstance(ret) ? 
OutputMethod.YIELDS 149 | : OutputMethod.RETURNS); 150 | } 151 | if (outputMethod == OutputMethod.RETURNS) { 152 | // We're simply returning records 153 | // We can return None to produce no output 154 | if (PyNone.class.isInstance(ret)) 155 | return; 156 | castPythonObject(ret, outputCollector, false); 157 | } else { 158 | // We have a Python generator that yields records 159 | for (Object record : (PyGenerator) ret) { 160 | if (record != null) { 161 | castPythonObject(record, outputCollector, true); 162 | } 163 | } 164 | } 165 | } 166 | 167 | public void setOutputMethod(OutputMethod outputMethod) { 168 | this.outputMethod = outputMethod; 169 | } 170 | 171 | public void setOutputType(OutputType outputType) { 172 | this.outputType = outputType; 173 | } 174 | } 175 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/Main.java: -------------------------------------------------------------------------------- 1 | package com.twitter.pycascading; 2 | 3 | import java.util.Properties; 4 | 5 | import org.python.util.PythonInterpreter; 6 | 7 | public class Main { 8 | 9 | private static PythonInterpreter interpreter = null; 10 | 11 | /** 12 | * This is the main method that gets passed to Hadoop, or executed in local 13 | * mode. 14 | * 15 | * @param args 16 | * the command line arguments 17 | * @throws Exception 18 | */ 19 | public static void main(String[] args) throws Exception { 20 | Properties sysProps = System.getProperties(); 21 | Properties props = new Properties(); 22 | props.put("python.cachedir", sysProps.get("user.home") + "/.jython-cache"); 23 | props.put("python.cachedir.skip", "0"); 24 | PythonInterpreter.initialize(System.getProperties(), props, args); 25 | getInterpreter().execfile(args[0]); 26 | } 27 | 28 | /** 29 | * Create and return the Python interpreter (singleton per JVM). 30 | * 31 | * @return the Python interpreter 32 | */ 33 | public static PythonInterpreter getInterpreter() { 34 | if (interpreter == null) { 35 | interpreter = new PythonInterpreter(); 36 | } 37 | return interpreter; 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/MetaScheme.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2011 Twitter, Inc. 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 
14 | */ 15 | package com.twitter.pycascading; 16 | 17 | import java.io.IOException; 18 | import java.io.ObjectInputStream; 19 | import java.io.ObjectOutputStream; 20 | 21 | import org.apache.hadoop.conf.Configuration; 22 | import org.apache.hadoop.fs.FSDataInputStream; 23 | import org.apache.hadoop.fs.FSDataOutputStream; 24 | import org.apache.hadoop.fs.FileSystem; 25 | import org.apache.hadoop.fs.Path; 26 | import org.apache.hadoop.mapred.JobConf; 27 | import org.apache.hadoop.mapred.OutputCollector; 28 | 29 | import cascading.scheme.Scheme; 30 | import cascading.tap.Tap; 31 | import cascading.tuple.Fields; 32 | import cascading.tuple.Tuple; 33 | import cascading.tuple.TupleEntry; 34 | 35 | /** 36 | * A Cascading Scheme that stores header information for an output dataset. It 37 | * records all formatting information so that later on the tuple field names and 38 | * types can be reloaded without having to specify them explicitly. 39 | * 40 | * It also stores the original scheme object so that at load time we don't have 41 | * to worry about that either. 42 | * 43 | * @author Gabor Szabo 44 | */ 45 | public class MetaScheme extends Scheme { 46 | private static final long serialVersionUID = 8194175541999063797L; 47 | 48 | private static final String schemeFileName = ".pycascading_scheme"; 49 | private static final String headerFileName = ".pycascading_header"; 50 | private static final String typeFileName = ".pycascading_types"; 51 | 52 | private Scheme scheme; 53 | private String outputPath; 54 | private boolean firstLine = true; 55 | private boolean typeFileToWrite = true; 56 | 57 | /** 58 | * Call this to get the original Cascading scheme that the data was written 59 | * in. 60 | * 61 | * @param inputPath 62 | * The path to where the scheme information was stored (normally the 63 | * same as the path to the data) 64 | * @return The Cascading scheme that was used when the data was written. 65 | * @throws IOException 66 | */ 67 | public static Scheme getSourceScheme(String inputPath) throws IOException { 68 | Path path = new Path(inputPath + "/" + schemeFileName); 69 | FileSystem fs = path.getFileSystem(new Configuration()); 70 | try { 71 | FSDataInputStream file = fs.open(path); 72 | ObjectInputStream ois = new ObjectInputStream(file); 73 | Scheme scheme = (Scheme) ois.readObject(); 74 | Fields fields = (Fields) ois.readObject(); 75 | scheme.setSourceFields(fields); 76 | ois.close(); 77 | file.close(); 78 | return scheme; 79 | } catch (ClassNotFoundException e) { 80 | throw new IOException("Could not read PyCascading file header: " + inputPath + "/" 81 | + schemeFileName); 82 | } 83 | } 84 | 85 | /** 86 | * Returns the scheme that will store field information and the scheme in 87 | * outputPath. Additionally, a file called .pycascading_header will be 88 | * generated, which stores the names of the fields in a TAB-delimited format. 
89 | * 90 | * @param scheme 91 | * The Cascading scheme to be used to store the data 92 | * @param outputPath 93 | * Path were the metainformation about the scheme and field names 94 | * should be stored 95 | * @return A scheme that can be used to sink the data into 96 | * @throws IOException 97 | */ 98 | public static Scheme getSinkScheme(Scheme scheme, String outputPath) throws IOException { 99 | return new MetaScheme(scheme, outputPath); 100 | } 101 | 102 | protected MetaScheme(Scheme scheme, String outputPath) throws IOException { 103 | this.scheme = scheme; 104 | this.outputPath = outputPath; 105 | } 106 | 107 | @Override 108 | public void sourceInit(Tap tap, JobConf conf) throws IOException { 109 | // We're returning the original storage scheme, so this should not be called 110 | // ever. 111 | } 112 | 113 | @Override 114 | public Tuple source(Object key, Object value) { 115 | // This should never be called. 116 | return null; 117 | } 118 | 119 | @Override 120 | public void sinkInit(Tap tap, JobConf conf) throws IOException { 121 | scheme.sinkInit(tap, conf); 122 | } 123 | 124 | @Override 125 | public void sink(TupleEntry tupleEntry, OutputCollector outputCollector) throws IOException { 126 | // TODO: do it so such that we don't need to specify /user/gabor if the path 127 | // doesn't start with / 128 | if (firstLine) { 129 | Path path = new Path(outputPath + "/" + headerFileName); 130 | FileSystem fs = path.getFileSystem(new Configuration()); 131 | try { 132 | // We're trying to create the file by just one of the mappers/reducers, 133 | // the one that can do it first 134 | if (fs.createNewFile(path)) { 135 | FSDataOutputStream stream = fs.create(path, true); 136 | boolean firstField = true; 137 | for (Comparable field : tupleEntry.getFields()) { 138 | if (firstField) 139 | firstField = false; 140 | else 141 | stream.writeBytes("\t"); 142 | stream.writeBytes(field.toString()); 143 | } 144 | stream.writeBytes("\n"); 145 | stream.close(); 146 | } 147 | } catch (IOException e) { 148 | } 149 | 150 | path = new Path(outputPath + "/" + schemeFileName); 151 | fs = path.getFileSystem(new Configuration()); 152 | try { 153 | if (fs.createNewFile(path)) { 154 | FSDataOutputStream stream = fs.create(path, true); 155 | ObjectOutputStream ostream = new ObjectOutputStream(stream); 156 | ostream.writeObject(scheme); 157 | ostream.writeObject(tupleEntry.getFields()); 158 | ostream.close(); 159 | stream.close(); 160 | } 161 | } catch (IOException e) { 162 | } 163 | 164 | firstLine = false; 165 | } 166 | 167 | if (typeFileToWrite) { 168 | Path path = new Path(outputPath + "/" + typeFileName); 169 | FileSystem fs = path.getFileSystem(new Configuration()); 170 | try { 171 | if (fs.createNewFile(path)) { 172 | FSDataOutputStream stream = fs.create(path, true); 173 | for (int i = 0; i < tupleEntry.size(); i++) { 174 | Comparable fieldName = null; 175 | if (tupleEntry.getFields().size() < tupleEntry.size()) { 176 | // We don't have names for the fields 177 | fieldName = ""; 178 | } else { 179 | fieldName = tupleEntry.getFields().get(i) + "\t"; 180 | } 181 | Object object = tupleEntry.getObject(i); 182 | Class objectClass = (object == null ? 
Object.class : object.getClass()); 183 | stream.writeBytes(fieldName + objectClass.getName() + "\n"); 184 | } 185 | stream.close(); 186 | } 187 | } catch (IOException e) { 188 | } 189 | typeFileToWrite = false; 190 | } 191 | scheme.sink(tupleEntry, outputCollector); 192 | } 193 | } 194 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/PythonEnvironment.java: -------------------------------------------------------------------------------- 1 | package com.twitter.pycascading; 2 | 3 | import org.python.util.PythonInterpreter; 4 | 5 | /** 6 | * This is the class that holds the Python environment running on a mapper or 7 | * reducer, including the Python interpreter. 8 | * 9 | * @author Gabor Szabo 10 | */ 11 | public class PythonEnvironment { 12 | private PythonInterpreter interpreter; 13 | 14 | /** 15 | * Start a new Jython interpreter if it's not started yet. 16 | * 17 | * @return the interpreter instance 18 | */ 19 | public PythonInterpreter getPythonInterpreter() { 20 | if (interpreter == null) 21 | interpreter = new PythonInterpreter(); 22 | return interpreter; 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/PythonObjectInputStream.java: -------------------------------------------------------------------------------- 1 | package com.twitter.pycascading; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.io.ObjectInputStream; 6 | 7 | import org.python.core.Py; 8 | import org.python.core.PyObject; 9 | import org.python.core.PyTuple; 10 | import org.python.util.PythonInterpreter; 11 | 12 | /** 13 | * When deserializing the job, this class reconstructs the Python functions 14 | * given by their name and/or source. 
15 | * 16 | * @author Gabor Szabo 17 | */ 18 | public class PythonObjectInputStream extends ObjectInputStream { 19 | 20 | private PythonInterpreter interpreter; 21 | 22 | public PythonObjectInputStream(InputStream in, PythonInterpreter interpreter) throws IOException { 23 | super(in); 24 | this.interpreter = interpreter; 25 | enableResolveObject(true); 26 | } 27 | 28 | @Override 29 | protected Object resolveObject(Object obj) throws IOException { 30 | // This method will reconstruct the PyFunction based on its name or its 31 | // source if it was a closure 32 | if (obj instanceof SerializedPythonFunction) { 33 | PyTuple serializedFunction = ((SerializedPythonFunction) obj).getSerializedFunction(); 34 | String functionType = (String) serializedFunction.get(0); 35 | String functionName = (String) serializedFunction.get(3); 36 | PyObject function = null; 37 | if ("global".equals(functionType)) { 38 | function = interpreter.get(functionName); 39 | } else if ("closure".equals(functionType)) { 40 | interpreter.exec((String) serializedFunction.get(4)); 41 | function = interpreter.get(functionName); 42 | } 43 | return function; 44 | } else 45 | return obj; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/PythonObjectOutputStream.java: -------------------------------------------------------------------------------- 1 | package com.twitter.pycascading; 2 | 3 | import java.io.IOException; 4 | import java.io.ObjectOutputStream; 5 | import java.io.OutputStream; 6 | 7 | import org.python.core.Py; 8 | import org.python.core.PyFunction; 9 | import org.python.core.PyNone; 10 | import org.python.core.PyObject; 11 | import org.python.core.PyTuple; 12 | 13 | /** 14 | * This class replaces every function object with a pointer to its name and/or 15 | * source, so that we can reconstruct the function when deserializing. We need 16 | * to do it this way as PyFunctions cannot be serialized (some nested Jython 17 | * objects don't implement Serializable). 18 | * 19 | * @author Gabor Szabo 20 | */ 21 | public class PythonObjectOutputStream extends ObjectOutputStream { 22 | 23 | private PyFunction callBack; 24 | 25 | public PythonObjectOutputStream(OutputStream out, PyFunction callBack) throws IOException { 26 | super(out); 27 | this.callBack = callBack; 28 | enableReplaceObject(true); 29 | } 30 | 31 | @Override 32 | protected Object replaceObject(Object obj) throws IOException { 33 | if (obj instanceof PyFunction) { 34 | PyObject replaced = callBack.__call__((PyObject) obj); 35 | if (!(replaced instanceof PyNone)) { 36 | return new SerializedPythonFunction((PyFunction) obj, (PyTuple) replaced); 37 | } 38 | } 39 | return obj; 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/SelectFields.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2011 Twitter, Inc. 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | package com.twitter.pycascading; 16 | 17 | import java.io.Serializable; 18 | 19 | import cascading.flow.FlowProcess; 20 | import cascading.operation.BaseOperation; 21 | import cascading.operation.Function; 22 | import cascading.operation.FunctionCall; 23 | import cascading.operation.OperationCall; 24 | import cascading.tuple.Fields; 25 | import cascading.tuple.Tuple; 26 | import cascading.tuple.TupleEntry; 27 | import cascading.tuple.TupleEntryCollector; 28 | 29 | /** 30 | * Simple Cascading function that keeps the specified fields only in the tuple 31 | * stream. 32 | * 33 | * @author Gabor Szabo 34 | */ 35 | public class SelectFields extends BaseOperation implements Function, Serializable { 36 | private static final long serialVersionUID = -6859909716154224842L; 37 | 38 | private Fields filteredFields; 39 | 40 | public SelectFields(Fields filteredFields) { 41 | super(filteredFields); 42 | this.filteredFields = filteredFields; 43 | } 44 | 45 | @Override 46 | public void prepare(FlowProcess flowProcess, OperationCall operationCall) { 47 | super.prepare(flowProcess, operationCall); 48 | } 49 | 50 | @Override 51 | public void operate(FlowProcess flowProcess, FunctionCall functionCall) { 52 | TupleEntry inputTuple = functionCall.getArguments(); 53 | TupleEntryCollector outputCollector = functionCall.getOutputCollector(); 54 | Tuple outputTuple = new Tuple(); 55 | 56 | for (Comparable field : filteredFields) { 57 | // We cannot use inputTuple.get(...) here, as that tries to convert 58 | // the field value to a Comparable. In case we have a complex Python 59 | // type as a field, that won't work. 60 | outputTuple.add(inputTuple.getObject(field)); 61 | } 62 | outputCollector.add(outputTuple); 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/SerializedPythonFunction.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2011 Twitter, Inc. 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | 16 | /* 17 | * This is a class the helps in serializing Jython functions. It seems that Jython 18 | * functions cannot be serialized, because on the remote end a Jython interpreter 19 | * has to also be invoked that can interpret the function. 20 | * 21 | * Thus when deserializing, we need to start a Jython interpreter and read the 22 | * source file where the function was defined in the first place. This also means 23 | * that we cannot use lambda functions as these cannot be referred to by name. 24 | * Referring to functions by name is important as it's the function's name and 25 | * source file that is sent through when serializing. 
26 | * 27 | * It only works with Jython >= 2.5.2 because of a previous bug with serializing 28 | * PyCode (http://bugs.jython.org/issue1601). 29 | * Still, I need to use a custom class loader, because there's a field in PyCode 30 | * whose class is called "org.python.pycode._pyx0", but no such class exists. 31 | * 32 | * When invoking a function, the globals are not restored for that function. Thus, 33 | * for instance, imports of Tuples etc. need to be done within the function. I tried 34 | * to serialize the globals together with func_code, but org.python.core.packagecache.SysPackageManager 35 | * in Jython is not serializable, and it apparently appears in the globals. I tried 36 | * to recompile Jython from source, but too many external libraries are missing. 37 | * 38 | * Unfortunately Cascading serializes Function objects, but Jython cannot 39 | * serialize PyFunctions due to bugs. Jython 2.5.2 can, however, serialize 40 | * func_codes, so we work around this by serializing those and keeping the globals 41 | * separately in a static variable. 42 | */ 43 | 44 | package com.twitter.pycascading; 45 | 46 | import java.io.IOException; 47 | import java.io.ObjectInputStream; 48 | import java.io.ObjectOutputStream; 49 | import java.io.Serializable; 50 | 51 | import org.python.core.PyFunction; 52 | import org.python.core.PyObject; 53 | import org.python.core.PyTuple; 54 | 55 | /** 56 | * Class that is primarily responsible for serializing and deserializing a 57 | * Jython function. It does this by storing the name of the function and 58 | * reloading the interpreter and source where the function was defined when it 59 | * becomes necessary to deserialize. 60 | * 61 | * @author Gabor Szabo 62 | */ 63 | public class SerializedPythonFunction implements Serializable { 64 | private static final long serialVersionUID = 4944819638591252128L; 65 | 66 | private PyObject pythonFunction; 67 | private PyTuple serializedFunction; 68 | 69 | /** 70 | * This constructor is necessary for the deserialization.
71 | */ 72 | public SerializedPythonFunction() { 73 | } 74 | 75 | public SerializedPythonFunction(PyFunction function, PyTuple serializedReturn) { 76 | serializedFunction = serializedReturn; 77 | pythonFunction = function; 78 | } 79 | 80 | private void writeObject(ObjectOutputStream stream) throws IOException { 81 | stream.writeObject(serializedFunction); 82 | } 83 | 84 | private void readObject(ObjectInputStream stream) throws IOException, ClassNotFoundException { 85 | serializedFunction = (PyTuple) stream.readObject(); 86 | } 87 | 88 | public PyObject getPythonFunction() { 89 | return pythonFunction; 90 | } 91 | 92 | public PyTuple getSerializedFunction() { 93 | return serializedFunction; 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/TemporaryHdfs.java: -------------------------------------------------------------------------------- 1 | package com.twitter.pycascading; 2 | 3 | import java.io.IOException; 4 | import java.util.Random; 5 | 6 | import org.apache.hadoop.conf.Configuration; 7 | import org.apache.hadoop.fs.FileSystem; 8 | import org.apache.hadoop.fs.Path; 9 | 10 | import cascading.flow.Flow; 11 | import cascading.flow.FlowListener; 12 | 13 | public class TemporaryHdfs implements FlowListener { 14 | private boolean tmpDirCreated = false; 15 | private String tmpDir; 16 | 17 | @Override 18 | public void onStarting(Flow flow) { 19 | } 20 | 21 | @Override 22 | public void onStopping(Flow flow) { 23 | removeTmpDir(); 24 | } 25 | 26 | @Override 27 | public void onCompleted(Flow flow) { 28 | removeTmpDir(); 29 | } 30 | 31 | @Override 32 | public boolean onThrowable(Flow flow, Throwable throwable) { 33 | removeTmpDir(); 34 | throwable.printStackTrace(); 35 | return false; 36 | } 37 | 38 | private String getRandomFileName() { 39 | String name = ""; 40 | Random rnd = new Random(); 41 | for (int i = 0; i < 6; i++) { 42 | name += (char) ((int) 'a' + rnd.nextInt((int) 'z' - (int) 'a')); 43 | } 44 | return name; 45 | } 46 | 47 | /** 48 | * Create a temporary folder on HDFS. The folder will be deleted after 49 | * execution or on an exception. 50 | * 51 | * @param conf 52 | * the jobconf 53 | * @throws IOException 54 | */ 55 | String createTmpFolder(Configuration conf) throws IOException { 56 | // Only fs.default.name and hadoop.tmp.dir are defined at the time of the 57 | // job initialization, we cannot use mapreduce.job.dir, mapred.working.dir, 58 | // or mapred.job.id 59 | // Possibly use Hfs.getTempDir later from Cascading. 60 | // In tmpDir, I cannot put a / in between the two variables, otherwise 61 | // Hadoop will fail to copy the archive to the temporary folder 62 | tmpDir = conf.get("fs.default.name") + conf.get("hadoop.tmp.dir"); 63 | tmpDir = tmpDir + "/" + "pycascading-" + getRandomFileName(); 64 | Path path = new Path(tmpDir); 65 | FileSystem fs = path.getFileSystem(new Configuration()); 66 | fs.mkdirs(path); 67 | tmpDirCreated = true; 68 | return tmpDir; 69 | } 70 | 71 | /** 72 | * Removes the temporary folder we created. 73 | */ 74 | private void removeTmpDir() { 75 | if (tmpDirCreated) { 76 | Path path = new Path(tmpDir); 77 | try { 78 | FileSystem fs = path.getFileSystem(new Configuration()); 79 | fs.delete(path, true); 80 | } catch (IOException e) { 81 | e.printStackTrace(); 82 | } 83 | } 84 | } 85 | 86 | private String getExtension(String path) { 87 | int i = path.lastIndexOf('.'); 88 | return (i >= 0 ? 
path.substring(i, path.length()) : ""); 89 | } 90 | 91 | /** 92 | * Copies a local file to HDFS, which is used as the distributed cache. The 93 | * distributed cache basically just takes this HDFS folder, and copies its 94 | * contents to the local disks for the mappers/reducers. Also, if the file is 95 | * a compressed archive, it will be extracted locally. We generate a random file 96 | * name for the destination, but keep the extension so that zip and tgz 97 | * archives are recognized. 98 | * 99 | * @param source 100 | * the path to the local file to be distributed 101 | * @return the path to the HDFS file 102 | * @throws IOException 103 | * if the copy was unsuccessful 104 | */ 105 | public String copyFromLocalFileToHDFS(String source) throws IOException { 106 | Path src = new Path(source); 107 | String destName = tmpDir + "/" + getRandomFileName() + getExtension(source); 108 | Path dest = new Path(destName); 109 | FileSystem fs = dest.getFileSystem(new Configuration()); 110 | fs.copyFromLocalFile(src, dest); 111 | return destName; 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/Util.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2011 Twitter, Inc. 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | package com.twitter.pycascading; 16 | 17 | import java.io.IOException; 18 | import java.net.URISyntaxException; 19 | import java.util.Map; 20 | import java.util.Properties; 21 | 22 | import org.apache.hadoop.conf.Configuration; 23 | 24 | import cascading.flow.Flow; 25 | import cascading.flow.FlowConnector; 26 | import cascading.flow.FlowListener; 27 | import cascading.pipe.Pipe; 28 | import cascading.tap.Tap; 29 | 30 | /** 31 | * Helper class that sets up the MR environment and runs a Cascading Flow. 32 | * 33 | * @author Gabor Szabo 34 | */ 35 | public class Util { 36 | // http://www.velocityreviews.com/forums/t147526-how-to-get-jar-file-name.html 37 | /** 38 | * Get the temporary folder where the job jar was extracted to by Hadoop. 39 | * 40 | * TODO: This only works if we distribute PyCascading as classes. If I switch 41 | * to using jars, I need to remove the last part of the path which is the jar 42 | * file. 43 | * 44 | * @return the temporary folder with the contents of the job jar 45 | */ 46 | public static String getJarFolder() { 47 | try { 48 | return Util.class.getProtectionDomain().getCodeSource().getLocation().toURI().getPath(); 49 | } catch (URISyntaxException e) { 50 | throw new RuntimeException("Could not get temporary job folder"); 51 | } 52 | } 53 | 54 | /** 55 | * Get the Cascading jar file on the local file system.
56 | * 57 | * @return the file location on the Hadoop worker for the Cascading jar 58 | */ 59 | public static String getCascadingJar() { 60 | try { 61 | return cascading.pipe.Pipe.class.getProtectionDomain().getCodeSource().getLocation().toURI() 62 | .getPath(); 63 | } catch (URISyntaxException e) { 64 | throw new RuntimeException("Could not get the location of the Cascading jar"); 65 | } 66 | } 67 | 68 | /** 69 | * We use the "pycascading.root" Java system property to store the location of 70 | * the Python sources for PyCascading. This is only used in local mode. This 71 | * is needed so that we know where to set the import path when we start up the 72 | * mappers and reducers. 73 | * 74 | * @param root 75 | * the location of the PyCascading sources on the local file system 76 | */ 77 | public static void setPycascadingRoot(String root) { 78 | System.setProperty("pycascading.root", root); 79 | } 80 | 81 | public static void run(int numReducers, Map config, Map sources, 82 | Map sinks, Pipe... tails) throws IOException, URISyntaxException { 83 | // String strClassPath = System.getProperty("java.class.path"); 84 | // System.out.println("Classpath is " + strClassPath); 85 | 86 | Properties properties = new Properties(); 87 | properties.put("mapred.reduce.tasks", numReducers); 88 | // Set this to change the default block size that is routed to one mapper 89 | // It won't help if the files are smaller than this as each file will go to 90 | // one mapper 91 | // properties.put("mapred.min.split.size", 20 * 1024 * 1024 * 1024L); 92 | // properties.put("mapred.map.tasks", 4000); 93 | // So that Thrift classes can be serialized 94 | // We need to add WritableSerialization otherwise sometimes Cascading and 95 | // Hadoop don't pick it up, and BigInteger serializations fail 96 | // See https://github.com/twitter/pycascading/issues/2 97 | // TODO: find the reason for this 98 | properties.put("io.serializations", 99 | "com.twitter.pycascading.bigintegerserialization.BigIntegerSerialization," 100 | + "org.apache.hadoop.io.serializer.WritableSerialization," 101 | + "com.twitter.pycascading.pythonserialization.PythonSerialization"); 102 | properties.put("mapred.jobtracker.completeuserjobs.maximum", 50000); 103 | properties.put("mapred.input.dir.recursive", "true"); 104 | 105 | // Set the running mode in the jobconf so that the mappers/reducers can 106 | // easily check this. 107 | String runningMode = (String) config.get("pycascading.running_mode"); 108 | properties.setProperty("pycascading.running_mode", runningMode); 109 | properties.setProperty("pycascading.main_file", (String) config.get("pycascading.main_file")); 110 | 111 | Configuration conf = new Configuration(); 112 | TemporaryHdfs tempDir = null; 113 | if ("hadoop".equals(runningMode)) { 114 | tempDir = new TemporaryHdfs(); 115 | // We put the files to be distributed into the distributed cache 116 | // The pycascading.distributed_cache.archives variable was set by 117 | // bootstrap.py, based on the command line parameters where we specified 118 | // the PyCascading & source archives 119 | Object archives = config.get("pycascading.distributed_cache.archives"); 120 | if (archives != null) { 121 | tempDir = new TemporaryHdfs(); 122 | String tempDirLocation = tempDir.createTmpFolder(conf); 123 | String dests = null; 124 | for (String archive : (Iterable) archives) { 125 | String dest = tempDir.copyFromLocalFileToHDFS(archive); 126 | dests = (dests == null ? 
dest : dests + "," + dest); 127 | } 128 | // Set the distributed cache to the files we just copied to HDFS 129 | // 130 | // This is an ugly hack, we should use DistributedCache. 131 | // DistributedCache however operates on a JobConf, and since 132 | // Cascading expects a Map, we cannot directly pass 133 | // in the parameters set into a JobConf. 134 | // TODO: see if a later version of Cascading can update its properties 135 | // using a JobConf 136 | properties.setProperty("mapred.cache.archives", dests); 137 | // This creates a symlink for each of the mappers/reducers to the 138 | // localized files, instead of copying them for each one. This way we 139 | // reduce the overhead for copying on one worker machine. 140 | // TODO: see the one just above 141 | properties.setProperty("mapred.create.symlink", "yes"); 142 | } 143 | } 144 | 145 | FlowConnector.setApplicationJarClass(properties, Main.class); 146 | FlowConnector flowConnector = new FlowConnector(properties); 147 | Flow flow = flowConnector.connect(sources, sinks, tails); 148 | if ("hadoop".equals(runningMode)) { 149 | try { 150 | flow.addListener(tempDir); 151 | } catch (Exception e) { 152 | e.printStackTrace(); 153 | } 154 | } else { 155 | try { 156 | flow.addListener(new FlowListener() { 157 | 158 | @Override 159 | public void onStarting(Flow flow) { 160 | } 161 | 162 | @Override 163 | public void onStopping(Flow flow) { 164 | } 165 | 166 | @Override 167 | public void onCompleted(Flow flow) { 168 | } 169 | 170 | @Override 171 | public boolean onThrowable(Flow flow, Throwable throwable) { 172 | throwable.printStackTrace(); 173 | return false; 174 | } 175 | }); 176 | } catch (Exception e) { 177 | e.printStackTrace(); 178 | } 179 | } 180 | flow.complete(); 181 | } 182 | } 183 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/bigintegerserialization/BigIntegerComparator.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2011 Twitter, Inc. 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | package com.twitter.pycascading.bigintegerserialization; 16 | 17 | import java.io.DataInputStream; 18 | import java.io.IOException; 19 | import java.io.Serializable; 20 | import java.math.BigInteger; 21 | import java.util.Comparator; 22 | 23 | import org.apache.hadoop.io.WritableUtils; 24 | 25 | import cascading.tuple.StreamComparator; 26 | import cascading.tuple.hadoop.BufferedInputStream; 27 | 28 | /** 29 | * Cascading in-stream comparator for Java BigIntegers. 
30 | * 31 | * @author Gabor Szabo 32 | */ 33 | public class BigIntegerComparator implements StreamComparator, 34 | Comparator, Serializable { 35 | private static final long serialVersionUID = 3846289449409826723L; 36 | 37 | public BigIntegerComparator(Class type) { 38 | } 39 | 40 | public int compare(BufferedInputStream lhsStream, BufferedInputStream rhsStream) { 41 | try { 42 | DataInputStream inLeft = new DataInputStream(lhsStream); 43 | DataInputStream inRight = new DataInputStream(rhsStream); 44 | 45 | long lhs = WritableUtils.readVLong(inLeft); 46 | long rhs = WritableUtils.readVLong(inRight); 47 | 48 | if (lhs < rhs) 49 | return -1; 50 | else if (lhs > rhs) 51 | return 1; 52 | else 53 | return 0; 54 | } catch (IOException ioe) { 55 | throw new RuntimeException(ioe); 56 | } 57 | } 58 | 59 | public int compare(BigInteger o1, BigInteger o2) { 60 | return o1.compareTo(o2); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/bigintegerserialization/BigIntegerDeserializer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2011 Twitter, Inc. 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | package com.twitter.pycascading.bigintegerserialization; 16 | 17 | import java.io.DataInputStream; 18 | import java.io.IOException; 19 | import java.io.InputStream; 20 | import java.math.BigInteger; 21 | 22 | import org.apache.hadoop.io.WritableUtils; 23 | import org.apache.hadoop.io.serializer.Deserializer; 24 | 25 | /** 26 | * Hadoop Deserializer for Java BigIntegers. 27 | * 28 | * @author Gabor Szabo 29 | */ 30 | public class BigIntegerDeserializer implements Deserializer { 31 | private DataInputStream in; 32 | 33 | public BigIntegerDeserializer(Class c) { 34 | } 35 | 36 | public void open(InputStream inStream) throws IOException { 37 | in = new DataInputStream(inStream); 38 | } 39 | 40 | public BigInteger deserialize(BigInteger i) throws IOException { 41 | return BigInteger.valueOf(WritableUtils.readVLong(in)); 42 | } 43 | 44 | public void close() throws IOException { 45 | if (in != null) { 46 | in.close(); 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/bigintegerserialization/BigIntegerSerialization.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2011 Twitter, Inc. 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | package com.twitter.pycascading.bigintegerserialization; 16 | 17 | import cascading.tuple.Comparison; 18 | 19 | import java.math.BigInteger; 20 | import java.util.Comparator; 21 | import org.apache.hadoop.io.serializer.Deserializer; 22 | import org.apache.hadoop.io.serializer.Serialization; 23 | import org.apache.hadoop.io.serializer.Serializer; 24 | 25 | /** 26 | * Hadoop Serialization class for Java BigIntegers. 27 | * 28 | * @author Gabor Szabo 29 | */ 30 | public class BigIntegerSerialization implements Serialization, Comparison { 31 | 32 | public boolean accept(Class c) { 33 | boolean ret = BigInteger.class.isAssignableFrom(c); 34 | return ret; 35 | } 36 | 37 | public Deserializer getDeserializer(Class c) { 38 | return new BigIntegerDeserializer(c); 39 | } 40 | 41 | public Serializer getSerializer(Class c) { 42 | return new BigIntegerSerializer(); 43 | } 44 | 45 | public Comparator getComparator(Class type) { 46 | return new BigIntegerComparator(type); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/bigintegerserialization/BigIntegerSerializer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2011 Twitter, Inc. 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | package com.twitter.pycascading.bigintegerserialization; 16 | 17 | import java.io.DataOutputStream; 18 | import java.io.IOException; 19 | import java.io.OutputStream; 20 | import java.math.BigInteger; 21 | 22 | import org.apache.hadoop.io.WritableUtils; 23 | import org.apache.hadoop.io.serializer.Serializer; 24 | 25 | /** 26 | * Hadoop Serializer for Java BigIntegers. 27 | * 28 | * @author Gabor Szabo 29 | */ 30 | public class BigIntegerSerializer implements Serializer { 31 | private DataOutputStream out; 32 | 33 | public void open(OutputStream outStream) throws IOException { 34 | out = new DataOutputStream(outStream); 35 | } 36 | 37 | public void serialize(BigInteger i) throws IOException { 38 | WritableUtils.writeVLong(out, i.longValue()); 39 | } 40 | 41 | public void close() throws IOException { 42 | if (out != null) { 43 | out.close(); 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/pythonserialization/PythonDeserializer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2011 Twitter, Inc. 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 
5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | package com.twitter.pycascading.pythonserialization; 16 | 17 | import java.io.DataInputStream; 18 | import java.io.IOException; 19 | import java.io.InputStream; 20 | import java.io.ObjectInputStream; 21 | 22 | import org.apache.hadoop.io.serializer.Deserializer; 23 | import org.python.core.PyObject; 24 | 25 | /** 26 | * Hadoop Deserializer for Python objects. It works, but it's slow so do not use 27 | * in serious production. 28 | * 29 | * @author Gabor Szabo 30 | */ 31 | public class PythonDeserializer implements Deserializer { 32 | private DataInputStream inStream; 33 | 34 | public PythonDeserializer(Class c) { 35 | } 36 | 37 | public void open(InputStream inStream) throws IOException { 38 | if (inStream instanceof DataInputStream) 39 | this.inStream = (DataInputStream) inStream; 40 | else 41 | this.inStream = new DataInputStream(inStream); 42 | } 43 | 44 | public PyObject deserialize(PyObject i) throws IOException { 45 | try { 46 | ObjectInputStream in = new ObjectInputStream(inStream); 47 | PyObject ret = (PyObject) in.readObject(); 48 | return ret; 49 | } catch (ClassNotFoundException e) { 50 | throw new IOException("Jython class not found"); 51 | } 52 | } 53 | 54 | public void close() throws IOException { 55 | if (inStream != null) { 56 | inStream.close(); 57 | inStream = null; 58 | } 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/pythonserialization/PythonSerialization.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2011 Twitter, Inc. 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | package com.twitter.pycascading.pythonserialization; 16 | 17 | import org.apache.hadoop.io.serializer.Deserializer; 18 | import org.apache.hadoop.io.serializer.Serialization; 19 | import org.apache.hadoop.io.serializer.Serializer; 20 | import org.python.core.PyObject; 21 | 22 | /** 23 | * Hadoop Serialization class for Python objects. 
24 | * 25 | * @author Gabor Szabo 26 | */ 27 | public class PythonSerialization implements Serialization { 28 | 29 | public boolean accept(Class c) { 30 | boolean ret = PyObject.class.isAssignableFrom(c); 31 | return ret; 32 | } 33 | 34 | public Deserializer getDeserializer(Class c) { 35 | return new PythonDeserializer(c); 36 | } 37 | 38 | public Serializer getSerializer(Class c) { 39 | return new PythonSerializer(); 40 | } 41 | 42 | } 43 | -------------------------------------------------------------------------------- /java/src/com/twitter/pycascading/pythonserialization/PythonSerializer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2011 Twitter, Inc. 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | */ 15 | package com.twitter.pycascading.pythonserialization; 16 | 17 | import java.io.DataOutputStream; 18 | import java.io.IOException; 19 | import java.io.ObjectOutputStream; 20 | import java.io.OutputStream; 21 | 22 | import org.apache.hadoop.io.serializer.Serializer; 23 | import org.python.core.PyObject; 24 | 25 | /** 26 | * Hadoop Serializer for Python objects. 27 | * 28 | * This is suboptimal, slow, and produces bloated streams, and should not be 29 | * used in production. In other words it just demonstrates the use of serialized 30 | * Python objects. 31 | * 32 | * @author Gabor Szabo 33 | */ 34 | public class PythonSerializer implements Serializer { 35 | private DataOutputStream outStream; 36 | 37 | public void open(OutputStream outStream) throws IOException { 38 | if (outStream instanceof DataOutputStream) 39 | this.outStream = (DataOutputStream) outStream; 40 | else 41 | this.outStream = new DataOutputStream(outStream); 42 | } 43 | 44 | public void serialize(PyObject i) throws IOException { 45 | // We have to create an ObjectOutputStream here. If we do it in open(...), 46 | // the following exception will be thrown on the reducers from 47 | // PythonDeserializer with large jobs: 48 | // java.io.StreamCorruptedException: invalid stream header: 7371007E 49 | // TODO: check if a flush wouldn't be enough 50 | ObjectOutputStream out = new ObjectOutputStream(outStream); 51 | out.writeObject(i); 52 | // We need to flush the stream, otherwise we get corrupted object stream 53 | // header exceptions as above. 54 | // Also do not use close(), as that would close result in 55 | // java.io.IOException: write beyond end of stream exceptions on spilled 56 | // cogroups. 57 | out.flush(); 58 | } 59 | 60 | public void close() throws IOException { 61 | if (outStream != null) { 62 | outStream.close(); 63 | outStream = null; 64 | } 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /local_run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Copyright 2011 Twitter, Inc. 
5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # 19 | # Runs the PyCascading locally without Hadoop 20 | # 21 | 22 | 23 | usage() 24 | { 25 | cat < [parameters] 29 | 30 | Runs the PyCascading script locally, without a Hadoop cluster. 31 | 32 | Options: 33 | -h Show this message 34 | -j Additional jar files and Python import folders to be added 35 | to the classpath. cp is a list of file and folder locations 36 | separated by ":"s 37 | 38 | EOF 39 | } 40 | 41 | 42 | while getopts ":hj:" OPTION; do 43 | case $OPTION in 44 | h) usage 45 | exit 1 46 | ;; 47 | j) additional_jars="$OPTARG" 48 | ;; 49 | esac 50 | done 51 | shift $((OPTIND-1)) 52 | 53 | main_file="$1" 54 | if [ "$main_file" == "" ]; then 55 | usage 56 | exit 1 57 | fi 58 | 59 | home_dir=$(dirname "$0") 60 | source "$home_dir/java/dependencies.properties" 61 | 62 | classpath="$home_dir/build/classes" 63 | 64 | function add2classpath 65 | { 66 | for lib in $1; do 67 | for file in $(ls $2/$lib); do 68 | classpath="$classpath:$file" 69 | done 70 | done 71 | } 72 | 73 | # Jython jars 74 | jython_libs='jython.jar' 75 | add2classpath "$jython_libs" "$jython" 76 | 77 | # Cascading jars 78 | cascading_libs='cascading-[0-9].*.jar lib/jgrapht-*.jar' 79 | add2classpath "$cascading_libs" "$cascading" 80 | 81 | # Hadoop jars 82 | hadoop_libs='hadoop-*core*.jar lib/*.jar' 83 | add2classpath "$hadoop_libs" "$hadoop" 84 | 85 | if [ "$additional_jars" != "" ]; then 86 | classpath="$classpath:$additional_jars" 87 | fi 88 | 89 | # sys.path will be initialized from JYTHONPATH 90 | JYTHONPATH="$home_dir/python" java -classpath "$classpath" \ 91 | com.twitter.pycascading.Main "$home_dir/python/pycascading/bootstrap.py" \ 92 | local "$home_dir" "$@" 93 | -------------------------------------------------------------------------------- /python/pycascading/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """ 17 | PyCascading is a Python frontend to build and execute MapReduce flows 18 | in Cascading. 19 | """ 20 | -------------------------------------------------------------------------------- /python/pycascading/bootstrap.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 
3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Bootstrap the PyCascading script. 17 | 18 | This is the main Python module that gets executed by Hadoop or in local mode. 19 | The first command line argument is either 'local' or 'hadoop'. This determines 20 | whether we're running the script in local mode or with Hadoop. For Hadoop we 21 | need to pack the sources into a jar, which are extracted later to a temporary 22 | directory, so we need to set up the search paths differently in this case. 23 | """ 24 | 25 | __author__ = 'Gabor Szabo' 26 | 27 | 28 | import sys, imp 29 | 30 | 31 | if __name__ == "__main__": 32 | # The first command line parameter must be 'hadoop' or 'local' 33 | # to indicate the running mode 34 | running_mode = sys.argv[1] 35 | 36 | # The second is the location of the PyCascading Python sources in local 37 | # mode, and the PyCascading tarball in Hadoop mode 38 | python_dir = sys.argv[2] 39 | 40 | # Remove the first two arguments so that sys.argv will look like as 41 | # if it was coming from a simple command line execution 42 | # The further parameters are the command line parameters to the script 43 | sys.argv = sys.argv[3:] 44 | 45 | from com.twitter.pycascading import Util 46 | 47 | cascading_jar = Util.getCascadingJar() 48 | # This is the folder where Hadoop extracted the jar file for execution 49 | tmp_dir = Util.getJarFolder() 50 | 51 | Util.setPycascadingRoot(python_dir) 52 | 53 | # The initial value of sys.path is JYTHONPATH plus whatever Jython appends 54 | # to it (normally the Python standard libraries the come with Jython) 55 | sys.path.extend((cascading_jar, '.', tmp_dir, python_dir + '/python', 56 | python_dir + '/python/Lib')) 57 | 58 | # Allow the importing of user-installed Jython packages 59 | import site 60 | site.addsitedir(python_dir + 'python/Lib/site-packages') 61 | 62 | import os 63 | import encodings 64 | import pycascading.pipe, getopt 65 | 66 | # This holds some global configuration parameters 67 | pycascading.pipe.config = dict() 68 | 69 | opts, args = getopt.getopt(sys.argv, 'a:') 70 | pycascading.pipe.config['pycascading.distributed_cache.archives'] = [] 71 | for opt in opts: 72 | if opt[0] == '-a': 73 | pycascading.pipe.config['pycascading.distributed_cache.archives'] \ 74 | .append(opt[1]) 75 | 76 | # This is going to be seen by main() 77 | sys.argv = args 78 | 79 | # It's necessary to put this import here, otherwise simplejson won't work. 80 | # Maybe it's automatically imported in the beginning of a Jython program, 81 | # but since at that point the sys.path is not set yet to Lib, it will fail? 82 | # Instead, we can use Java's JSON decoder... 
83 | # import encodings 84 | 85 | # pycascading.pipe.config is a dict with configuration parameters 86 | pycascading.pipe.config['pycascading.running_mode'] = running_mode 87 | pycascading.pipe.config['pycascading.main_file'] = args[0] 88 | 89 | # Import and run the user's script 90 | _main_module_ = imp.load_source('__main__', \ 91 | pycascading.pipe.config['pycascading.main_file']) 92 | _main_module_.main() 93 | -------------------------------------------------------------------------------- /python/pycascading/cogroup.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Operations related to a CoGroup pipe.""" 17 | 18 | __author__ = 'Gabor Szabo' 19 | 20 | 21 | import cascading.pipe 22 | import cascading.pipe.cogroup 23 | import cascading.operation 24 | 25 | from pycascading.pipe import Operation, coerce_to_fields, _Stackable 26 | 27 | 28 | class CoGroup(Operation): 29 | 30 | """CoGroup two or more streams on common fields. 31 | 32 | This is a PyCascading wrapper around a Cascading CoGroup. 33 | """ 34 | 35 | def __init__(self, *args, **kwargs): 36 | """Create a Cascading CoGroup pipe. 
37 | 38 | Arguments: 39 | args[0] -- the fields on which to join 40 | 41 | Keyword arguments: 42 | group_name -- the groupName parameter for Cascading 43 | group_fields -- the fields on which to group 44 | declared_fields -- the declaredFields parameter for Cascading 45 | result_group_fields -- the resultGroupFields parameter for Cascading 46 | joiner -- the joiner parameter for Cascading 47 | num_self_joins -- the numSelfJoins parameter for Cascading 48 | lhs -- the lhs parameter for Cascading 49 | lhs_group_fields -- the lhsGroupFields parameter for Cascading 50 | rhs -- the rhs parameter for Cascading 51 | rhs_group_fields -- the rhsGroupFields parameter for Cascading 52 | """ 53 | Operation.__init__(self) 54 | self.__args = args 55 | self.__kwargs = kwargs 56 | 57 | def __create_args(self, 58 | group_name=None, 59 | pipes=None, group_fields=None, declared_fields=None, 60 | result_group_fields=None, joiner=None, 61 | pipe=None, num_self_joins=None, 62 | lhs=None, lhs_group_fields=None, 63 | rhs=None, rhs_group_fields=None): 64 | # We can use an unnamed parameter only for group_fields 65 | if self.__args: 66 | group_fields = [coerce_to_fields(f) for f in self.__args[0]] 67 | args = [] 68 | if group_name: 69 | args.append(str(group_name)) 70 | if lhs: 71 | args.append(lhs.get_assembly()) 72 | args.append(coerce_to_fields(lhs_group_fields)) 73 | args.append(rhs.get_assembly()) 74 | args.append(coerce_to_fields(rhs_group_fields)) 75 | if declared_fields: 76 | args.append(coerce_to_fields(declared_fields)) 77 | if result_group_fields: 78 | args.append(coerce_to_fields(result_group_fields)) 79 | if joiner: 80 | args.append(joiner) 81 | elif pipes: 82 | args.append([p.get_assembly() for p in pipes]) 83 | if group_fields: 84 | args.append([coerce_to_fields(f) for f in group_fields]) 85 | if declared_fields: 86 | args.append(coerce_to_fields(declared_fields)) 87 | if result_group_fields: 88 | args.append(coerce_to_fields(result_group_fields)) 89 | else: 90 | args.append(None) 91 | if joiner is None: 92 | joiner = cascading.pipe.cogroup.InnerJoin() 93 | args.append(joiner) 94 | elif pipe: 95 | args.append(pipe.get_assembly()) 96 | args.append(coerce_to_fields(group_fields)) 97 | args.append(int(num_self_joins)) 98 | if declared_fields: 99 | args.append(coerce_to_fields(declared_fields)) 100 | if result_group_fields: 101 | args.append(coerce_to_fields(result_group_fields)) 102 | if joiner: 103 | args.append(joiner) 104 | return args 105 | 106 | def _create_with_parent(self, parent): 107 | if isinstance(parent, _Stackable): 108 | args = self.__create_args(pipes=parent.stack, **self.__kwargs) 109 | else: 110 | args = self.__create_args(pipe=parent, **self.__kwargs) 111 | return cascading.pipe.CoGroup(*args) 112 | 113 | 114 | def inner_join(*args, **kwargs): 115 | """Shortcut for an inner join.""" 116 | kwargs['joiner'] = cascading.pipe.cogroup.InnerJoin() 117 | if not 'declared_fields' in kwargs: 118 | kwargs['declared_fields'] = None 119 | return CoGroup(*args, **kwargs) 120 | 121 | 122 | def outer_join(*args, **kwargs): 123 | """Shortcut for an outer join.""" 124 | kwargs['joiner'] = cascading.pipe.cogroup.OuterJoin() 125 | if not 'declared_fields' in kwargs: 126 | kwargs['declared_fields'] = None 127 | return CoGroup(*args, **kwargs) 128 | 129 | 130 | def left_outer_join(*args, **kwargs): 131 | """Shortcut for a left outer join.""" 132 | # The documentation says a Cascading RightJoin is a right inner join, but 133 | # that's not true, it's really an outer join as it should be. 
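    # A minimal usage sketch (field names are illustrative), assuming 'lhs'
    # and 'rhs' are PyCascading pipes that each carry an 'id' field:
    #   joined = (lhs & rhs) | left_outer_join(
    #       ['id', 'id'],
    #       declared_fields=['id', 'lhs_value', 'rhs_id', 'rhs_value'])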
134 | kwargs['joiner'] = cascading.pipe.cogroup.LeftJoin() 135 | if not 'declared_fields' in kwargs: 136 | kwargs['declared_fields'] = None 137 | return CoGroup(*args, **kwargs) 138 | 139 | 140 | def right_outer_join(*args, **kwargs): 141 | """Shortcut for a right outer join.""" 142 | kwargs['joiner'] = cascading.pipe.cogroup.RightJoin() 143 | if not 'declared_fields' in kwargs: 144 | kwargs['declared_fields'] = None 145 | return CoGroup(*args, **kwargs) 146 | -------------------------------------------------------------------------------- /python/pycascading/decorators.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """ 17 | PyCascading function decorators to be used with user-defined functions. 18 | 19 | A user-defined function is a function that gets applied as a filter or an 20 | Each function for each tuple, or the reduce-side function for tuples in a 21 | grouping in an Every Cascading operation. 22 | 23 | UDFs can emit a new set of tuples (as in a Function after an Each operation), 24 | keep or filter out tuples (a Filter after an Each), or emit aggregate values 25 | (an Aggregator or Buffer for a group after an Every). 26 | 27 | We use globally or locally scoped Python functions to perform these 28 | user-defined operations. When building the data processing pipeline, we can 29 | simply stream data through a Python function with PyCascading if it was 30 | decorated by one of the decorators. 31 | 32 | * A udf_'map' function is executed for each input tuple, and returns no, one, or 33 | several new output tuples. 34 | 35 | * A 'udf_filter' is a boolean-valued function, which should return true if the 36 | input tuple should be kept for the output, and false if not. 37 | 38 | * A 'udf_buffer' is a function that is applied to groups of tuples, and is the 39 | equivalent of a Cascading Buffer. It returns an aggregate after iterating 40 | through the tuples in the group. 41 | 42 | Exports the following: 43 | udf 44 | yields 45 | numargs_expected 46 | python_list_expected 47 | python_dict_expected 48 | collects_output 49 | produces_python_list 50 | produces_tuples 51 | udf_filter 52 | udf_map 53 | udf_buffer 54 | """ 55 | 56 | __author__ = 'Gabor Szabo' 57 | 58 | import inspect 59 | 60 | from pycascading.pipe import DecoratedFunction 61 | from com.twitter.pycascading import CascadingBaseOperationWrapper 62 | from com.twitter.pycascading import CascadingRecordProducerWrapper 63 | 64 | 65 | def _function_decorator(args, kwargs, defaults={}): 66 | """ 67 | A decorator to recursively decorate a function with arbitrary attributes. 
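
    For example (a sketch), @udf(produces=['word']) wraps the target function
    in a DecoratedFunction and records {'produces': ['word']} among its
    decorators; PyCascading reads these attributes later when it builds the
    Cascading operation. Stacking several of these decorators simply merges
    their attribute dictionaries.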
68 | """ 69 | 70 | def fun_decorator(function_or_callabledict): 71 | if isinstance(function_or_callabledict, DecoratedFunction): 72 | # Another decorator is next 73 | dff = function_or_callabledict 74 | else: 75 | # The original function comes next 76 | dff = DecoratedFunction.decorate_function(function_or_callabledict) 77 | # Add the attributes to the decorated function 78 | dff.decorators.update(additional_parameters) 79 | return dff 80 | 81 | additional_parameters = dict(defaults) 82 | additional_parameters.update(kwargs) 83 | if len(args) == 1 and not kwargs and (inspect.isroutine(args[0]) or isinstance(args[0], DecoratedFunction)): 84 | # We used the decorator without ()s, the first argument is the 85 | # function. We cannot use additional parameters in this case. 86 | return fun_decorator(args[0]) 87 | else: 88 | return fun_decorator 89 | 90 | 91 | def udf(*args, **kwargs): 92 | """The function can receive tuples or groups of tuples from Cascading. 93 | 94 | This is the decorator to use when we have a function that we want to use 95 | in a Cascading job after an Each or Every. 96 | """ 97 | return _function_decorator(args, kwargs) 98 | 99 | 100 | def yields(*args, **kwargs): 101 | """The function is a generator that yields output tuples. 102 | 103 | PyCascading considers this function a generator that yields one or more 104 | output tuples before returning. If this decorator is not used, the way the 105 | function emits tuples is determined automatically at runtime the first time 106 | the funtion is called. The alternative to yielding values is to return 107 | one tuple with return. 108 | 109 | We can safely yield Nones or not yield anything at all; no output tuples 110 | will be emitted in this case. 111 | """ 112 | return _function_decorator(args, kwargs, \ 113 | { 'output_method' : CascadingRecordProducerWrapper.OutputMethod.YIELDS }) 114 | 115 | 116 | def numargs_expected(num, *args, **kwargs): 117 | """The function expects a num number of fields in the input tuples. 118 | 119 | Arguments: 120 | num -- the exact number of fields that the input tuples must have 121 | """ 122 | return _function_decorator(args, kwargs, { 'numargs_expected' : num }) 123 | 124 | 125 | def python_list_expected(*args, **kwargs): 126 | """PyCascading will pass in the input tuples as Python lists. 127 | 128 | There is some performance penalty as all the incoming tuples need to be 129 | converted to Python lists. 130 | """ 131 | params = dict(kwargs) 132 | params.update() 133 | return _function_decorator(args, kwargs, { 'input_conversion' : \ 134 | CascadingBaseOperationWrapper.ConvertInputTuples.PYTHON_LIST }) 135 | 136 | 137 | def python_dict_expected(*args, **kwargs): 138 | """The input tuples are converted to Python dicts for this function. 139 | 140 | PyCascading will convert all input tuples to a Python dict for this 141 | function. The keys of the dict are the Cascading field names and the values 142 | are the values read from the tuple. 143 | 144 | There is some performance penalty as all the incoming tuples need to be 145 | converted to Python dicts. 146 | """ 147 | return _function_decorator(args, kwargs, { 'input_conversion' : \ 148 | CascadingBaseOperationWrapper.ConvertInputTuples.PYTHON_DICT }) 149 | 150 | 151 | def collects_output(*args, **kwargs): 152 | """The function expects an output collector where output tuples are added. 153 | 154 | PyCascading will pass in a Cascading TupleEntryCollector to which the 155 | function can add output tuples by calling its 'add' method. 
156 | 157 | Use this if performance is important, as no conversion takes place between 158 | Python objects and Cascading tuples. 159 | """ 160 | return _function_decorator(args, kwargs, { 'output_method' : \ 161 | CascadingRecordProducerWrapper.OutputMethod.COLLECTS }) 162 | 163 | 164 | def produces_python_list(*args, **kwargs): 165 | """The function emits Python lists as tuples. 166 | 167 | These will be converted by PyCascading to Cascading Tuples, so this impacts 168 | performance somewhat. 169 | """ 170 | return _function_decorator(args, kwargs, { 'output_type' : \ 171 | CascadingRecordProducerWrapper.OutputType.PYTHON_LIST }) 172 | 173 | 174 | def produces_tuples(*args, **kwargs): 175 | """The function emits native Cascading Tuples or TupleEntrys. 176 | 177 | No conversion takes place so this is a fast way to add tuples to the 178 | output. 179 | """ 180 | return _function_decorator(args, kwargs, { 'output_type' : \ 181 | CascadingRecordProducerWrapper.OutputType.TUPLE }) 182 | 183 | 184 | def udf_filter(*args, **kwargs): 185 | """This makes the function a filter. 186 | 187 | The function should return 'true' for each input tuple that should stay 188 | in the output stream, and 'false' if it is to be removed. 189 | 190 | IMPORTANT: this behavior is the opposite of what Cascading expects, but 191 | similar to how the Python filter works! 192 | 193 | Note that the same effect can be attained by a map that returns the tuple 194 | itself or None if it should be filtered out. 195 | """ 196 | return _function_decorator(args, kwargs, { 'type' : 'filter' }) 197 | 198 | 199 | def udf_map(*args, **kwargs): 200 | """The function decorated with this emits output tuples for each input one. 201 | 202 | The function is called for all the tuples in the input stream as happens 203 | in a Cascading Each. The function input tuple is passed in to the function 204 | as the first parameter and is a native Cascading TupleEntry unless the 205 | python_list_expected or python_dict_expected decorators are also used. 206 | 207 | If collects_output is used, the 2nd parameter is a Cascading 208 | TupleEntryCollector to which Tuples or TupleEntrys can be added. Otherwise, 209 | the function may return an output tuple or yield any number of tuples if 210 | it is a generator. 211 | 212 | Whether the function yields or returns will be determined automatically if 213 | no decorators used that specify this, and so will be the output tuple type 214 | (it can be Python list or a Cascading Tuple). 215 | 216 | Note that the meaning of 'map' used here is closer to the Python map() 217 | builtin than the 'map' in MapReduce. It essentially means that each input 218 | tuple needs to be transformed (mapped) by a custom function. 219 | 220 | Arguments: 221 | produces -- a list of output field names 222 | """ 223 | return _function_decorator(args, kwargs, { 'type' : 'map' }) 224 | 225 | 226 | def udf_buffer(*args, **kwargs): 227 | """The function decorated with this takes a group and emits aggregates. 228 | 229 | A udf_buffer function must follow a Cascading Every operation, which comes 230 | after a GroupBy. The function will be called for each grouping on a 231 | different reducer. The first parameter passed to the function is the 232 | value of the grouping field for this group, and the second is an iterator 233 | to the tuples belonging to this group. 
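
    A minimal sketch (assuming the stream was grouped on a single field;
    names are illustrative):

        @udf_buffer(produces=['group_value', 'group_size'])
        def group_size(group, tuples):
            size = 0
            for tuple in tuples:
                size += 1
            yield [group, size]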
234 | 235 | Note that the iterator always points to a static variable in Cascading 236 | that holds a copy of the current TupleEntry, thus we cannot cache this for 237 | subsequent operations in the function. Instead, take iterator.getTuple() or 238 | create a new TupleEntry by deep copying the item in the loop. 239 | 240 | Cascading also doesn't automatically add the group field to the output 241 | tuples, so we need to do it manually. In fact a Cascading Buffer is more 242 | powerful than an aggregator, although it can be used as one. It acts more 243 | like a function emitting arbitrary tuples for groups, rather than just a 244 | simple aggregator. 245 | 246 | By default the output tuples will be what the buffer returns or yields, 247 | and the grouping fields won't be included. This is different from the 248 | aggregators' behavior, which add the output fields to the grouping fields. 249 | 250 | Also, only one buffer may follow a GroupBy, in contrast to aggregators, of 251 | which many may be present. 252 | 253 | See http://groups.google.com/group/cascading-user/browse_thread/thread/f5e5f56f6500ed53/f55fdd6bba399dcf?lnk=gst&q=scope#f55fdd6bba399dcf 254 | """ 255 | return _function_decorator(args, kwargs, { 'type' : 'buffer' }) 256 | 257 | 258 | def unwrap(*args, **kwargs): 259 | """Unwraps the tuple into function parameters before calling the function. 260 | 261 | This is not implemented on the Java side yet. 262 | """ 263 | return _function_decorator(args, kwargs, { 'parameters' : 'unwrap' }) 264 | 265 | def tuplein(*args, **kwargs): 266 | return _function_decorator(args, kwargs, { 'parameters' : 'tuple' }) 267 | -------------------------------------------------------------------------------- /python/pycascading/each.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Operations related to an Each pipe. 17 | 18 | * Add fields to the stream: map_add 19 | * Map fields to new fields: map_replace 20 | * Map the whole tuple to the new tuple: map_to 21 | * Filter tuples: filter_by 22 | """ 23 | 24 | __author__ = 'Gabor Szabo' 25 | 26 | 27 | import inspect 28 | 29 | import cascading.pipe 30 | from cascading.tuple import Fields 31 | 32 | from com.twitter.pycascading import CascadingFunctionWrapper, \ 33 | CascadingFilterWrapper 34 | 35 | from pycascading.pipe import Operation, coerce_to_fields, wrap_function, \ 36 | random_pipe_name, DecoratedFunction 37 | from pycascading.decorators import udf 38 | 39 | 40 | class _Each(Operation): 41 | 42 | """The equivalent of Each in Cascading. 43 | 44 | We need to wrap @maps and @filters with different Java classes, but 45 | the constructors for Each are built similarly. This class provides this 46 | functionality. 47 | """ 48 | 49 | def __init__(self, function_type, *args): 50 | """Build the Each constructor for the Python function. 
51 | 52 | Arguments: 53 | function_type -- CascadingFunctionWrapper or CascadingFilterWrapper, 54 | whether we are calling Each with a function or filter 55 | *args -- the arguments passed on to Cascading Each 56 | """ 57 | Operation.__init__(self) 58 | 59 | self.__function = None 60 | # The default argument selector is Fields.ALL (per Cascading sources 61 | # for Operator.java) 62 | self.__argument_selector = None 63 | # The default output selector is Fields.RESULTS (per Cascading sources 64 | # for Operator.java) 65 | self.__output_selector = None 66 | 67 | if len(args) == 1: 68 | self.__function = args[0] 69 | elif len(args) == 2: 70 | (self.__argument_selector, self.__function) = args 71 | elif len(args) == 3: 72 | (self.__argument_selector, self.__function, 73 | self.__output_selector) = args 74 | else: 75 | raise Exception('The number of parameters to Apply/Filter ' \ 76 | 'should be between 1 and 3') 77 | # This is the Cascading Function type 78 | self.__function = wrap_function(self.__function, function_type) 79 | 80 | def _create_with_parent(self, parent): 81 | args = [] 82 | if self.__argument_selector: 83 | args.append(coerce_to_fields(self.__argument_selector)) 84 | args.append(self.__function) 85 | if self.__output_selector: 86 | args.append(coerce_to_fields(self.__output_selector)) 87 | # We need to put another Pipe after the Each since otherwise 88 | # joins may not work as the names of pipes apparently have to be 89 | # different for Cascading. 90 | each = cascading.pipe.Each(parent.get_assembly(), *args) 91 | return cascading.pipe.Pipe(random_pipe_name('each'), each) 92 | 93 | 94 | class Apply(_Each): 95 | """Apply the given user-defined function to each tuple in the stream. 96 | 97 | The corresponding class in Cascading is Each called with a Function. 98 | """ 99 | def __init__(self, *args): 100 | _Each.__init__(self, CascadingFunctionWrapper, *args) 101 | 102 | 103 | class Filter(_Each): 104 | """Filter the tuple stream through the user-defined function. 105 | 106 | The corresponding class in Cascading is Each called with a Filter. 
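
    A usage sketch (the 'count' field is illustrative; scripts normally use
    the filter_by() helper below rather than constructing Filter directly):

        @udf_filter
        def frequent(tuple):
            return tuple.getInteger('count') >= 10

        stream | Filter(frequent)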
107 | """ 108 | def __init__(self, *args): 109 | _Each.__init__(self, CascadingFilterWrapper, *args) 110 | 111 | 112 | def _any_instance(var, classes): 113 | """Check if var is an instance of any class in classes.""" 114 | for cl in classes: 115 | if isinstance(var, cl): 116 | return True 117 | return False 118 | 119 | 120 | def _map(output_selector, *args): 121 | """Maps the given input fields to output fields.""" 122 | if len(args) == 1: 123 | (input_selector, function, output_field) = \ 124 | (Fields.ALL, args[0], Fields.UNKNOWN) 125 | elif len(args) == 2: 126 | if inspect.isfunction(args[0]) or _any_instance(args[0], \ 127 | (DecoratedFunction, cascading.operation.Function, cascading.operation.Filter)): 128 | # The first argument is a function, the second is the output fields 129 | (input_selector, function, output_field) = \ 130 | (Fields.ALL, args[0], args[1]) 131 | else: 132 | # The first argument is the input tuple argument selector, 133 | # the second one is the function 134 | (input_selector, function, output_field) = \ 135 | (args[0], args[1], Fields.UNKNOWN) 136 | elif len(args) == 3: 137 | (input_selector, function, output_field) = args 138 | else: 139 | raise Exception('map_{add,replace} needs to be called with 1 to 3 parameters') 140 | if isinstance(function, DecoratedFunction): 141 | # By default we take everything from the UDF's decorators 142 | df = function 143 | if output_field != Fields.UNKNOWN: 144 | # But if we specified the output fields for the map, use that 145 | df = DecoratedFunction.decorate_function(function.decorators['function']) 146 | df.decorators = dict(function.decorators) 147 | df.decorators['produces'] = output_field 148 | elif inspect.isfunction(function): 149 | df = udf(produces=output_field)(function) 150 | else: 151 | df = function 152 | return Apply(input_selector, df, output_selector) 153 | 154 | 155 | def map_add(*args): 156 | """Map the defined fields (or all fields), and add the results to the tuple. 157 | 158 | Note that the new field names we are adding to the tuple cannot overlap 159 | with existing field names, or Cascading will complain. 160 | """ 161 | return _map(Fields.ALL, *args) 162 | 163 | 164 | def map_replace(*args): 165 | """Map the tuple, remove the mapped fields, and add the new fields. 166 | 167 | This mapping replaces the fields mapped with the new fields that the 168 | mapping operation adds. 169 | 170 | The number of arguments to this function is between 1 and 3: 171 | * One argument: it's the map function. The output fields will be named 172 | after the 'produces' parameter if the map function is decorated, or 173 | will be Fields.UNKNOWN if it's not defined. Note that after UNKNOW field 174 | names are introduced to the tuple, all the other field names are also 175 | lost. 176 | * Two arguments: it's either the input field selector and the map function, 177 | or the map function and the output fields' names. 178 | * Three arguments: they are interpreted as the input field selector, the 179 | map function, and finally the output fields' names. 
180 | """ 181 | return _map(Fields.SWAP, *args) 182 | 183 | 184 | def map_to(*args): 185 | """Map the tuple, and keep only the results returned by the function.""" 186 | return _map(Fields.RESULTS, *args) 187 | 188 | 189 | def filter_by(function): 190 | if isinstance(function, DecoratedFunction): 191 | # We make sure we will treat the function as a filter 192 | # Here we make a copy of the decorators so that we don't overwrite 193 | # the original parameters 194 | if function.decorators['type'] not in ('filter', 'auto'): 195 | raise Exception('Function is not a filter') 196 | df = DecoratedFunction.decorate_function(function.decorators['function']) 197 | df.decorators = dict(function.decorators) 198 | df.decorators['type'] = 'filter' 199 | else: 200 | df = udf(type='filter')(function) 201 | return Filter(df) 202 | -------------------------------------------------------------------------------- /python/pycascading/every.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Operations related to an Every pipe.""" 17 | 18 | __author__ = 'Gabor Szabo' 19 | 20 | 21 | import inspect 22 | 23 | import cascading.pipe 24 | import cascading.operation 25 | from cascading.tuple import Fields 26 | 27 | from com.twitter.pycascading import CascadingAggregatorWrapper, \ 28 | CascadingBufferWrapper 29 | 30 | from pycascading.pipe import Operation, coerce_to_fields, wrap_function, \ 31 | random_pipe_name, DecoratedFunction, _Stackable 32 | 33 | 34 | class Every(Operation): 35 | 36 | """Apply an operation to a group of tuples. 37 | 38 | This operation is similar to Apply, but can only follow a GroupBy or 39 | CoGroup. It runs a Cascading Aggregator or Buffer on every grouping. 40 | Native Java aggregators or buffers may be used, and also PyCascading 41 | @reduces. 42 | 43 | By default the tuples contain only the values in a group, but not the 44 | grouping field. This can be had from the group first parameter. 45 | """ 46 | 47 | def __init__(self, *args, **kwargs): 48 | """Create a Cascading Every pipe. 
49 | 50 | Keyword arguments: 51 | aggregator -- a Cascading aggregator (only either aggregator or buffer 52 | should be used) 53 | buffer -- a Cascading Buffer or a PyCascading @reduce function 54 | output_selector -- the outputSelector parameter for Cascading 55 | argument_selector -- the argumentSelector parameter for Cascading 56 | assertion_level -- the assertionLevel parameter for Cascading 57 | assertion -- the assertion parameter for Cascading 58 | """ 59 | Operation.__init__(self) 60 | self.__args = args 61 | self.__kwargs = kwargs 62 | 63 | def __create_args(self, 64 | pipe=None, 65 | aggregator=None, output_selector=None, 66 | assertion_level=None, assertion=None, 67 | buffer=None, 68 | argument_selector=None): 69 | if self.__args: 70 | # If we pass in an unnamed argument, try to determine its type 71 | if isinstance(self.__args[0], cascading.operation.Aggregator): 72 | aggregator = self.__args[0] 73 | else: 74 | buffer = self.__args[0] 75 | # Set up some defaults 76 | if argument_selector is None: 77 | argument_selector = cascading.tuple.Fields.ALL 78 | if output_selector is None: 79 | if aggregator is not None: 80 | # In the case of aggregators, we want to return both the 81 | # groupings and the results 82 | output_selector = cascading.tuple.Fields.ALL 83 | else: 84 | output_selector = cascading.tuple.Fields.RESULTS 85 | 86 | args = [] 87 | args.append(pipe.get_assembly()) 88 | if argument_selector is not None: 89 | args.append(coerce_to_fields(argument_selector)) 90 | if aggregator is not None: 91 | # for now we assume it's a Cascading aggregator straight 92 | args.append(wrap_function(aggregator, CascadingAggregatorWrapper)) 93 | if output_selector: 94 | args.append(coerce_to_fields(output_selector)) 95 | if assertion_level is not None: 96 | args.append(assertion_level) 97 | args.append(assertion) 98 | if buffer is not None: 99 | args.append(wrap_function(buffer, CascadingBufferWrapper)) 100 | if output_selector: 101 | args.append(coerce_to_fields(output_selector)) 102 | return args 103 | 104 | def _create_with_parent(self, parent): 105 | args = self.__create_args(pipe=parent, **self.__kwargs) 106 | return cascading.pipe.Every(*args) 107 | 108 | 109 | class GroupBy(Operation): 110 | 111 | """GroupBy first merges the given pipes, then groups by the fields given. 112 | 113 | This class does the same as the corresponding Cascading GroupBy. 114 | """ 115 | 116 | def __init__(self, *args, **kwargs): 117 | """Create a Cascading GroupBy pipe. 
118 | 119 | Arguments: 120 | args[0] -- the fields on which to group 121 | 122 | Keyword arguments: 123 | group_name -- the groupName parameter for Cascading 124 | group_fields -- the fields on which to group 125 | sort_fields -- the sortFields parameter for Cascading 126 | reverse_order -- the reverseOrder parameter for Cascading 127 | lhs_pipe -- the lhsPipe parameter for Cascading 128 | rhs_pipe -- the rhsPipe parameter for Cascading 129 | """ 130 | Operation.__init__(self) 131 | self.__args = args 132 | self.__kwargs = kwargs 133 | 134 | def __create_args(self, 135 | group_name=None, 136 | pipes=None, group_fields=None, sort_fields=None, 137 | reverse_order=None, 138 | pipe=None, 139 | lhs_pipe=None, rhs_pipe=None): 140 | # We can use an unnamed parameter only for group_fields 141 | if self.__args: 142 | group_fields = coerce_to_fields(self.__args[0]) 143 | args = [] 144 | if group_name: 145 | args.append(group_name) 146 | if pipes: 147 | args.append([p.get_assembly() for p in pipes]) 148 | if group_fields: 149 | args.append(coerce_to_fields(group_fields)) 150 | if sort_fields: 151 | args.append(coerce_to_fields(sort_fields)) 152 | if reverse_order: 153 | args.append(reverse_order) 154 | elif pipe: 155 | args.append(pipe.get_assembly()) 156 | if group_fields: 157 | args.append(coerce_to_fields(group_fields)) 158 | if sort_fields: 159 | args.append(coerce_to_fields(sort_fields)) 160 | if reverse_order: 161 | args.append(reverse_order) 162 | elif lhs_pipe: 163 | args.append(lhs_pipe.get_assembly()) 164 | args.append(rhs_pipe.get_assembly()) 165 | args.append(coerce_to_fields(group_fields)) 166 | return args 167 | 168 | def _create_with_parent(self, parent): 169 | if isinstance(parent, _Stackable): 170 | # We're chaining with a _Stackable object 171 | args = self.__create_args(pipes=parent.stack, **self.__kwargs) 172 | else: 173 | # We're chaining with a Chainable object 174 | args = self.__create_args(pipe=parent, **self.__kwargs) 175 | return cascading.pipe.GroupBy(*args) 176 | 177 | 178 | class _DelayedInitialization(Operation): 179 | def __init__(self, callback): 180 | Operation.__init__(self) 181 | self.__callback = callback 182 | 183 | def _create_with_parent(self, parent): 184 | return self.__callback(parent).get_assembly() 185 | 186 | 187 | def group_by(*args, **kwargs): 188 | if len(args) == 0: 189 | grouping_fields = None 190 | parameters = () 191 | elif len(args) == 1: 192 | grouping_fields = args[0] 193 | parameters = () 194 | elif len(args) == 2: 195 | grouping_fields = args[0] 196 | parameters = (Fields.ALL, args[1], Fields.UNKNOWN) 197 | elif len(args) == 3: 198 | grouping_fields = args[0] 199 | if inspect.isfunction(args[1]) or isinstance(args[1], \ 200 | (DecoratedFunction, cascading.operation.Aggregator, cascading.operation.Buffer)): 201 | # The first argument is an aggregator/buffer, 202 | # the second is the output fields 203 | parameters = (Fields.ALL, args[1], args[2]) 204 | else: 205 | parameters = (args[1], args[2], Fields.UNKNOWN) 206 | elif len(args) == 4: 207 | grouping_fields = args[0] 208 | parameters = (args[1], args[2], args[3]) 209 | else: 210 | raise Exception('group_by needs to be called with 1 to 4 parameters') 211 | 212 | if parameters: 213 | (input_selector, function, output_field) = parameters 214 | if isinstance(function, DecoratedFunction): 215 | # By default we take everything from the UDF's decorators 216 | df = function 217 | if output_field != Fields.UNKNOWN: 218 | # But if we specified the output fields for the map, use that 219 | df = 
DecoratedFunction.decorate_function(function.decorators['function']) 220 | df.decorators = dict(function.decorators) 221 | df.decorators['produces'] = output_field 222 | elif inspect.isfunction(function): 223 | df = udf(produces=output_field)(function) 224 | else: 225 | df = function 226 | def pipe(parent): 227 | if grouping_fields: 228 | return parent | GroupBy(grouping_fields, **kwargs) | \ 229 | Every(df, argument_selector=input_selector) 230 | else: 231 | return parent | GroupBy(**kwargs) | \ 232 | Every(df, argument_selector=input_selector) 233 | return _DelayedInitialization(pipe) 234 | else: 235 | def pipe(parent): 236 | if grouping_fields: 237 | return parent | GroupBy(grouping_fields, **kwargs) 238 | else: 239 | return parent | GroupBy(**kwargs) 240 | return _DelayedInitialization(pipe) 241 | -------------------------------------------------------------------------------- /python/pycascading/helpers.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Helper functions for a PyCascading script. 17 | 18 | This module imports the PyCascading modules so that we don't have to import 19 | them manually all the time. It also imports the Java classes wrapping the 20 | primitive types (Integer, Long, Float, Double, String), so that casts are made 21 | easy. Furthermore frequently used Cascading classes are also imported, such as 22 | Fields, Tuple, and TupleEntry, and the pre-defined aggregators, filters, 23 | assemblies, and schemes. 24 | """ 25 | 26 | __author__ = 'Gabor Szabo' 27 | 28 | 29 | import time, struct, subprocess 30 | 31 | # Import frequently used Cascading classes 32 | # We import these first so that we can override some global names (like Rename) 33 | from cascading.tuple import Fields, Tuple, TupleEntry 34 | from cascading.operation.aggregator import * 35 | from cascading.operation.filter import * 36 | from cascading.pipe.assembly import * 37 | from cascading.scheme import * 38 | from cascading.tap import * 39 | 40 | # Import all important PyCascading modules so we don't have to in the scripts 41 | from pycascading.decorators import * 42 | from pycascading.tap import * 43 | from pycascading.operators import * 44 | from pycascading.each import * 45 | from pycascading.every import * 46 | from pycascading.cogroup import * 47 | # We don't import * as the name of some functions (sum) collides with Python 48 | import pycascading.native as native 49 | 50 | # Import Java basic types for conversions 51 | from java.lang import Integer, Long, Float, Double, String 52 | 53 | import com.twitter.pycascading.SelectFields 54 | from pycascading.pipe import coerce_to_fields 55 | 56 | 57 | class Getter(): 58 | 59 | """A wrapper for an object with 'get' and 'set' methods. 60 | 61 | If the object has a .get(key) method and a .set(key, value) method, 62 | these can be replaced by referencing the key with []s. 
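
    For example (a sketch; 'job_conf' is an assumed object with get/set
    methods):

        props = Getter(job_conf)
        reducers = props['mapred.reduce.tasks']  # calls job_conf.get(...)
        props['mapred.reduce.tasks'] = '4'       # calls job_conf.set(..., '4')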
63 | """ 64 | 65 | def __init__(self, object): 66 | self.object = object 67 | 68 | def __getitem__(self, key): 69 | return self.object.get(key) 70 | 71 | def __setitem__(self, key, value): 72 | return self.object.set(key, value) 73 | 74 | 75 | def time2epoch(t): 76 | """Converts times in UTC to seconds since the UNIX epoch, 1/1/1970 00:00. 77 | 78 | Arguments: 79 | t -- the time string in 'YYYY-MM-DD hh:mm:ss' format 80 | 81 | Exceptions: 82 | Throws an exception if t is not in the right format. 83 | """ 84 | t = time.strptime(t + ' UTC', '%Y-%m-%d %H:%M:%S.0 %Z') 85 | return int(time.mktime(t)) - time.timezone 86 | 87 | 88 | def bigendian2long(b): 89 | """Converts a series of 4 bytes in big-endian format to a Java Long. 90 | 91 | Arguments: 92 | b -- a string of 4 bytes that represent a word 93 | """ 94 | return Long(struct.unpack('>I', b)[0]) 95 | 96 | 97 | def bigendian2int(b): 98 | """Converts a series of 4 bytes in big-endian format to a Python int. 99 | 100 | Arguments: 101 | b -- a string of 4 bytes that represent a word 102 | """ 103 | return struct.unpack('>i', b)[0] 104 | 105 | 106 | def SelectFields(fields): 107 | """Keeps only some fields in the tuple stream. 108 | 109 | Arguments: 110 | fields -- a list of fields to keep, or a Cascading Fields wildcard 111 | """ 112 | return com.twitter.pycascading.SelectFields(coerce_to_fields(fields)) 113 | 114 | 115 | def read_hdfs_tsv_file(path): 116 | """Read a tab-separated HDFS folder and yield the records. 117 | 118 | The first line of the file should contain the name of the fields. Each 119 | record contains columns separated by tabs. 120 | 121 | Arguments: 122 | path -- path to a tab-separated folder containing the data files 123 | """ 124 | pipe = subprocess.Popen('hdfs -cat "%s/.pycascading_header" "%s/part-*"' \ 125 | % (path, path), shell=True, stdout=subprocess.PIPE).stdout 126 | first_line = True 127 | for line in pipe: 128 | line = line[0 : (len(line) - 1)] 129 | fields = line.split('\t') 130 | if first_line: 131 | field_names = fields 132 | first_line = False 133 | else: 134 | yield dict(zip(field_names, fields)) 135 | -------------------------------------------------------------------------------- /python/pycascading/init_module.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Used internally. PyCascading module to set up the paths for the sources. 17 | 18 | The module that gets loaded first when a Cascading pipeline is deserialized. 19 | PyCascading needs to start a Jython interpreter whenever a mapper or reducer 20 | executes Python code, so we need to start an interpreter, set up the 21 | environment, and load the job's source code. 22 | """ 23 | 24 | __author__ = 'Gabor Szabo' 25 | 26 | 27 | import sys 28 | 29 | 30 | def setup_paths(module_paths): 31 | """Set up sys.path on the mappers and reducers. 
32 | 33 | module_paths is an array of path names where the sources or other 34 | supporting files are found. In particular, module_paths[0] is the location 35 | of the PyCascading Python sources, and modules_paths[1] is the location of 36 | the source file defining the function. 37 | 38 | In Hadoop mode (with remote_deploy.sh), the first two -a options must 39 | specify the archives of the PyCascading sources and the job sources, 40 | respectively. 41 | 42 | Arguments: 43 | module_paths -- the locations of the Python sources 44 | """ 45 | from com.twitter.pycascading import Util 46 | 47 | cascading_jar = Util.getCascadingJar() 48 | jython_dir = module_paths[0] 49 | 50 | sys.path.extend((cascading_jar, jython_dir + '/python', 51 | jython_dir + '/python/Lib')) 52 | sys.path.extend(module_paths[1 : ]) 53 | 54 | # Allow importing of user-installed Jython packages 55 | # Thanks to Simon Radford 56 | import site 57 | site.addsitedir(jython_dir + 'python/Lib/site-packages') 58 | 59 | # Haha... it's necessary to put this here, otherwise simplejson won't work. 60 | # Maybe it's automatically imported in the beginning of a Jython program, 61 | # but since at that point the sys.path is not set yet to Lib, it will fail? 62 | #import encodings 63 | -------------------------------------------------------------------------------- /python/pycascading/native.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Aggregators, filters, functions, and assemblies adapted to PyCascading. 17 | 18 | These useful operations are provided by Cascading. 
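
For example (a sketch; the 'word' field name is illustrative), counting the
size of each group with Cascading's Count aggregator (scripts see this module
as 'native', since pycascading.helpers imports it under that name):

    stream | group_by('word', native.count()) | output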
19 | """ 20 | 21 | __author__ = 'Gabor Szabo' 22 | 23 | 24 | import cascading.operation.aggregator as aggregator 25 | import cascading.operation.filter as filter 26 | import cascading.operation.function as function 27 | import cascading.pipe.assembly as assembly 28 | 29 | from pycascading.pipe import coerce_to_fields, SubAssembly 30 | 31 | 32 | def average(*args): 33 | args = list(args) 34 | if args: 35 | args[0] = coerce_to_fields(args[0]) 36 | return aggregator.Average(*args) 37 | 38 | 39 | def count(*args): 40 | args = list(args) 41 | if args: 42 | args[0] = coerce_to_fields(args[0]) 43 | return aggregator.Count(*args) 44 | 45 | 46 | def first(*args): 47 | args = list(args) 48 | if args: 49 | args[0] = coerce_to_fields(args[0]) 50 | return aggregator.First(*args) 51 | 52 | 53 | def last(*args): 54 | args = list(args) 55 | if args: 56 | args[0] = coerce_to_fields(args[0]) 57 | return aggregator.Last(*args) 58 | 59 | 60 | def max(*args): 61 | args = list(args) 62 | if args: 63 | args[0] = coerce_to_fields(args[0]) 64 | return aggregator.Max(*args) 65 | 66 | 67 | def min(*args): 68 | args = list(args) 69 | if args: 70 | args[0] = coerce_to_fields(args[0]) 71 | return aggregator.Min(*args) 72 | 73 | 74 | def sum(*args): 75 | args = list(args) 76 | if args: 77 | args[0] = coerce_to_fields(args[0]) 78 | return aggregator.Sum(*args) 79 | 80 | 81 | def limit(lim): 82 | return filter.Limit(lim) 83 | 84 | 85 | def sample(*args): 86 | return filter.Sample(lim) 87 | 88 | 89 | def un_group(*args): 90 | args = list(args) 91 | if args: 92 | args[0] = coerce_to_fields(args[0]) 93 | if len(args) > 1: 94 | if isinstance(args[1], (list, tuple)): 95 | new_arg = [] 96 | for f in args[1]: 97 | new_arg.append(coerce_to_fields(f)) 98 | args[1] = new_arg 99 | else: 100 | args[1] = coerce_to_fields(args[1]) 101 | if len(args) > 2: 102 | if isinstance(args[2], (list, tuple)): 103 | new_arg = [] 104 | for f in args[2]: 105 | new_arg.append(coerce_to_fields(f)) 106 | args[2] = new_arg 107 | return function.UnGroup(*args) 108 | 109 | 110 | def average_by(*args): 111 | args = list(args) 112 | if len(args) > 0: 113 | args[0] = coerce_to_fields(args[0]) 114 | if len(args) > 1: 115 | args[1] = coerce_to_fields(args[1]) 116 | if len(args) > 2: 117 | args[2] = coerce_to_fields(args[2]) 118 | return SubAssembly(assembly.AverageBy, *args) 119 | 120 | 121 | def count_by(*args): 122 | args = list(args) 123 | if len(args) > 0: 124 | args[0] = coerce_to_fields(args[0]) 125 | if len(args) > 1: 126 | args[1] = coerce_to_fields(args[1]) 127 | return SubAssembly(assembly.CountBy, *args) 128 | 129 | 130 | def sum_by(*args): 131 | # SumBy has at least 3 parameters 132 | args = list(args) 133 | for i in xrange(0, 3): 134 | args[i] = coerce_to_fields(args[i]) 135 | return SubAssembly(assembly.SumBy, *args) 136 | 137 | 138 | def unique(*args): 139 | args = list(args) 140 | args[0] = coerce_to_fields(args[0]) 141 | return SubAssembly(assembly.Unique, *args) 142 | -------------------------------------------------------------------------------- /python/pycascading/operators.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Various operations acting on the tuples. 17 | 18 | * Select fields from the stream: retain 19 | * Remove fields from the stream: discard (not implemented in Cascading 1.2.*) 20 | * Rename fields: rename 21 | """ 22 | 23 | __author__ = 'Gabor Szabo' 24 | 25 | 26 | import itertools 27 | 28 | from cascading.tuple import Fields 29 | from cascading.operation import Identity 30 | import cascading.pipe.assembly.Rename 31 | 32 | from pycascading.pipe import SubAssembly, coerce_to_fields 33 | from pycascading.each import Apply 34 | 35 | 36 | def retain(*fields_to_keep): 37 | """Retain only the given fields. 38 | 39 | The fields can be given in array or by separate parameters. 40 | """ 41 | if len(fields_to_keep) > 1: 42 | fields_to_keep = list(itertools.chain(fields_to_keep)) 43 | else: 44 | fields_to_keep = fields_to_keep[0] 45 | return Apply(fields_to_keep, Identity(Fields.ARGS), Fields.RESULTS) 46 | 47 | 48 | def _discard(fields_to_discard): 49 | # In 2.0 there's a builtin function this, Discard 50 | # In 1.2 there is nothing for this 51 | raise Exception('Discard only works with Cascading 2.0') 52 | 53 | 54 | def rename(*args): 55 | """Rename the fields to new names. 56 | 57 | If only one argument (a list of names) is given, it is assumed that the 58 | user wants to rename all the fields. If there are two arguments, the first 59 | list is the set of fields to be renamed, and the second is a list of the 60 | new names. 61 | """ 62 | if len(args) == 1: 63 | (fields_from, fields_to) = (Fields.ALL, args[0]) 64 | else: 65 | (fields_from, fields_to) = (args[0], args[1]) 66 | return SubAssembly(cascading.pipe.assembly.Rename, \ 67 | coerce_to_fields(fields_from), \ 68 | coerce_to_fields(fields_to)) 69 | -------------------------------------------------------------------------------- /python/pycascading/pipe.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Build and execute Cascading flows in Python. 17 | 18 | Flows are built from Cascading operations that reshape, join, and split 19 | streams. Some operations make use of user-defined functions, for instance, the 20 | Each operation applies an UDF to each tuple seen in the stream. 
21 | 22 | Exports the following: 23 | Pipe 24 | CoGroup 25 | Join 26 | OuterJoin 27 | LeftOuterJoin 28 | RightOuterJoin 29 | SubAssembly 30 | coerce_to_fields 31 | random_pipe_name 32 | """ 33 | 34 | __author__ = 'Gabor Szabo' 35 | 36 | 37 | import types, inspect, pickle 38 | 39 | import cascading.pipe 40 | import cascading.tuple 41 | import cascading.operation 42 | import cascading.pipe.cogroup 43 | from com.twitter.pycascading import CascadingBaseOperationWrapper, \ 44 | CascadingRecordProducerWrapper 45 | 46 | import serializers 47 | 48 | from java.io import ObjectOutputStream 49 | 50 | 51 | import java.lang.Integer 52 | 53 | 54 | def coerce_to_fields(obj): 55 | """ 56 | Utility function to convert a list or field name to cascading.tuple.Fields. 57 | 58 | Arguments: 59 | obj -- a cascading.tuple.Fields, an integer, or a string, or a list of 60 | integers and/or strings identifying fields 61 | 62 | Return: 63 | obj coerced to a cascading.tuple.Fields object 64 | """ 65 | if isinstance(obj, list): 66 | # For some reason integers will not be cast to Comparables by Jython, 67 | # so we need to do it manually before calling the Fields constructor 68 | for i in xrange(len(obj)): 69 | if isinstance(obj[i], int): 70 | obj[i] = java.lang.Integer(obj[i]) 71 | return cascading.tuple.Fields(obj) 72 | elif isinstance(obj, str) or isinstance(obj, int): 73 | if isinstance(obj, int): 74 | obj = java.lang.Integer(obj) 75 | return cascading.tuple.Fields([obj]) 76 | else: 77 | # obj is assumed to be Fields already 78 | return obj 79 | 80 | 81 | def random_pipe_name(prefix): 82 | """Generate a random string that can be used to name pipes. 83 | 84 | Otherwise Cascading always gets confused. 85 | """ 86 | import random, re, traceback 87 | stack = traceback.extract_stack() 88 | stack.reverse() 89 | file = None 90 | for s in stack: 91 | if not re.match(r'.*/pycascading/[^/]+\.py$', s[0]) and \ 92 | not re.match(r'.*/bootstrap.py$', s[0]): 93 | file = s[0] 94 | line = s[1] 95 | i = file.rfind('/') 96 | if i >= 0: 97 | file = file[i + 1 :] 98 | break 99 | name = prefix 100 | if file: 101 | name = name + '/' + str(line) + ':' + file 102 | name += ' ' 103 | id = '' 104 | for i in xrange(0, 4): 105 | name += chr(random.randint(ord('a'), ord('z'))) 106 | return name 107 | 108 | 109 | def wrap_function(function, casc_function_type): 110 | """Wrap a Python function into a Serializable and callable Java object. 111 | This wrapping is necessary as Cascading serializes the job pipeline before 112 | it sends the job to the workers. We need to in essence reconstruct the 113 | Python function from source on the receiving end when we deserialize the 114 | function, as Python is an interpreted language. 
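# coerce_to_fields in a nutshell -- these calls run under Jython with the
# Cascading jars on the classpath; the field names are illustrative only.

from pycascading.pipe import coerce_to_fields

coerce_to_fields('user')        # a Fields instance selecting the 'user' field
coerce_to_fields(['user', 2])   # mixed names and positions; 2 becomes java.lang.Integer(2)
coerce_to_fields(0)             # positional selector for the first field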
115 | 116 | Arguments: 117 | function -- either a Cascading Operation, a PyCascading-decorated Python 118 | function, or a native Python function 119 | casc_function_type -- the Cascading Operation that this Python function 120 | will be called by in its operate method 121 | """ 122 | if isinstance(function, cascading.operation.Operation): 123 | return function 124 | if isinstance(function, DecoratedFunction): 125 | # Build the arguments for the constructor 126 | args = [] 127 | decorators = function.decorators 128 | if 'numargs_expected' in decorators: 129 | args.append(decorators['numargs_expected']) 130 | if 'produces' in decorators and decorators['produces']: 131 | args.append(coerce_to_fields(decorators['produces'])) 132 | # Create the appropriate type (function or filter) 133 | fw = casc_function_type(*args) 134 | function = decorators['function'] 135 | fw.setConvertInputTuples(decorators['input_conversion']) 136 | if decorators['type'] in set(['map', 'buffer', 'auto']): 137 | fw.setOutputMethod(decorators['output_method']) 138 | fw.setOutputType(decorators['output_type']) 139 | fw.setContextArgs(decorators['args']) 140 | fw.setContextKwArgs(decorators['kwargs']) 141 | else: 142 | # When function is a pure Python function, declared without decorators 143 | fw = casc_function_type() 144 | fw.setFunction(function) 145 | fw.setWriteObjectCallBack(serializers.replace_object) 146 | return fw 147 | 148 | 149 | class _Stackable(object): 150 | 151 | """An object that can be chained with '&' operations.""" 152 | 153 | def __init__(self): 154 | self.stack = [self] 155 | 156 | def __and__(self, other): 157 | result = _Stackable() 158 | result.stack = self.stack + other.stack 159 | return result 160 | 161 | def __or__(self, other): 162 | result = Chainable() 163 | result._assembly = other._create_with_parent(self) 164 | for s in self.stack: 165 | result.add_context(s.context) 166 | return result 167 | 168 | 169 | class Chainable(_Stackable): 170 | 171 | """An object that can be chained with '|' operations.""" 172 | 173 | def __init__(self): 174 | _Stackable.__init__(self) 175 | self._assembly = None 176 | self.context = set() 177 | self.hash = 0 178 | 179 | def add_context(self, ctx): 180 | # TODO: see if context is indeed needed 181 | """ 182 | This is used to keep track of the sources connected to this pipeline 183 | so that a possible cache can remove them for Cascading. 184 | """ 185 | # Cannot use extend because of the strings 186 | self.context.update(ctx) 187 | 188 | def get_assembly(self): 189 | """Return the Cascading Pipe instance that this object represents.""" 190 | if self._assembly == None: 191 | self._assembly = self._create_without_parent() 192 | return self._assembly 193 | 194 | def __or__(self, other): 195 | result = Chainable() 196 | if isinstance(other, cascading.operation.Aggregator): 197 | import every 198 | other = every.Every(aggregator=other) 199 | elif isinstance(other, cascading.operation.Function): 200 | import each 201 | other = each.Apply(other) 202 | elif isinstance(other, cascading.operation.Filter): 203 | import each 204 | other = each.Apply(other) 205 | elif inspect.isroutine(other): 206 | other = DecoratedFunction.decorate_function(other) 207 | if isinstance(other, Chainable): 208 | result._assembly = other._create_with_parent(self) 209 | result.add_context(self.context) 210 | result.hash = self.hash ^ hash(result._assembly) 211 | return result 212 | 213 | def _create_without_parent(self): 214 | """Called when the Chainable is the first member of a chain. 
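# The two operators defined above are the whole chaining API: '|' appends an
# operation to a pipe, and '&' bundles pipes so that a multi-input operation
# (CoGroup, Join, etc. from the export list) can consume them together.
# Minimal sketch:

from pycascading.pipe import Pipe

lhs = Pipe('lhs')
rhs = Pipe('rhs')
renamed = lhs | Pipe('renamed')   # a new Cascading Pipe whose parent is lhs
both = lhs & rhs                  # a _Stackable holding [lhs, rhs], ready to be
                                  # piped into a multi-input operation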
215 | 216 | We want to initialize the chain with this operation as the first 217 | member. 218 | """ 219 | raise Exception('Cannot create without parent') 220 | 221 | def _create_with_parent(self, parent): 222 | """Called when the Chainable is NOT the first member of a chain. 223 | 224 | Takes a PyCascading Pipe object, or a list thereof, and returns a 225 | corresponding Cascading Pipe instance. 226 | 227 | Arguments: 228 | parent -- the PyCascading pipe that we need to append this operation to 229 | """ 230 | raise Exception('Cannot create with parent') 231 | 232 | 233 | class Pipe(Chainable): 234 | 235 | """The basic PyCascading Pipe object. 236 | 237 | This represents an operation on the tuple stream. A Pipe object can have an 238 | upstream parent (unless it is a source), and a downstream child (unless it 239 | is a sink). 240 | """ 241 | 242 | def __init__(self, name=None, *args): 243 | Chainable.__init__(self) 244 | if name: 245 | self.__name = name 246 | else: 247 | self.__name = 'unnamed' 248 | 249 | def _create_without_parent(self): 250 | """ 251 | Create the Cascading operation when this is the first element of a 252 | chain. 253 | """ 254 | return cascading.pipe.Pipe(self.__name) 255 | 256 | def _create_with_parent(self, parent): 257 | """ 258 | Create the Cascading operation when this is not the first element 259 | of a chain. 260 | """ 261 | return cascading.pipe.Pipe(self.__name, parent.get_assembly()) 262 | 263 | 264 | class Operation(Chainable): 265 | 266 | """A common base class for all operations (Functions, Filters, etc.). 267 | 268 | It doesn't do anything, just provides the class. 269 | """ 270 | 271 | def __init__(self): 272 | Chainable.__init__(self) 273 | 274 | 275 | class DecoratedFunction(Operation): 276 | 277 | """Decorates Python functions with arbitrary attributes. 278 | 279 | Additional attributes and the original functions are stored in a dict 280 | self.decorators. 281 | """ 282 | 283 | def __init__(self): 284 | Operation.__init__(self) 285 | self.decorators = {} 286 | 287 | def __call__(self, *args, **kwargs): 288 | """ 289 | When we call the function we don't actually want to execute it, just 290 | to store the parameters passed to it so that we can distribute them 291 | to workers as a shared context. 292 | """ 293 | args, kwargs = self._wrap_argument_functions(args, kwargs) 294 | if args: 295 | self.decorators['args'] = args 296 | if kwargs: 297 | self.decorators['kwargs'] = kwargs 298 | return self 299 | 300 | def _create_with_parent(self, parent): 301 | """ 302 | Use the appropriate operation when the function is used in the pipe.
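# Calling a DecoratedFunction therefore never executes the wrapped Python
# function on the spot; it only records the call's arguments so they can travel
# to the workers as shared context. Hedged sketch -- the @udf_map decorator and
# the UDF calling convention (tuple first, context arguments after) are assumed
# from pycascading.decorators, and `numbers` is a hypothetical upstream pipe:

from pycascading.decorators import udf_map

@udf_map
def scale(tuple, factor):
    yield [tuple.get(0) * factor]   # output may be yielded or returned (YIELDS_OR_RETURNS)

scaled = numbers | scale(10)        # 10 is stored in decorators['args'], not applied yet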
303 | """ 304 | my_type = self.decorators['type'] 305 | if my_type == 'auto': 306 | # Determine the type of function automatically based on the parent 307 | if isinstance(parent, Chainable) and \ 308 | isinstance(parent.get_assembly(), cascading.pipe.GroupBy): 309 | my_type = 'buffer' 310 | else: 311 | raise Exception('Function was not decorated with @udf_map or' \ 312 | ' @udf_filter, and I cannot decide if it is' \ 313 | ' a map or a filter') 314 | if my_type == 'map': 315 | import each 316 | return each.Apply(self)._create_with_parent(parent) 317 | elif my_type == 'filter': 318 | import pycascading.each 319 | return pycascading.each.Filter(self)._create_with_parent(parent) 320 | elif my_type == 'buffer': 321 | import every 322 | return every.Every(buffer=self)._create_with_parent(parent) 323 | else: 324 | raise Exception('Function was not annotated with ' \ 325 | '@udf_map(), @udf_filter(), or @udf_buffer()') 326 | 327 | def _wrap_argument_functions(self, args, kwargs): 328 | """ 329 | Just like the nested function, any arguments that are functions 330 | have to be wrapped. 331 | """ 332 | args_out = [] 333 | for arg in args: 334 | if type(arg) == types.FunctionType: 335 | # args_out.append(_python_function_to_java(arg)) 336 | args_out.append(arg) 337 | else: 338 | args_out.append(arg) 339 | for key in kwargs: 340 | if type(kwargs[key]) == types.FunctionType: 341 | # kwargs[key] = _python_function_to_java(kwargs[key]) 342 | pass 343 | return (tuple(args_out), kwargs) 344 | 345 | @classmethod 346 | def decorate_function(cls, function): 347 | """Return a DecoratedFunction with the default parameters set.""" 348 | dff = DecoratedFunction() 349 | # This is the user-defined Python function 350 | dff.decorators['function'] = function 351 | # If it's used as an Each, Every, or Filter function 352 | dff.decorators['type'] = 'auto' 353 | dff.decorators['input_conversion'] = \ 354 | CascadingBaseOperationWrapper.ConvertInputTuples.NONE 355 | dff.decorators['output_method'] = \ 356 | CascadingRecordProducerWrapper.OutputMethod.YIELDS_OR_RETURNS 357 | dff.decorators['output_type'] = \ 358 | CascadingRecordProducerWrapper.OutputType.AUTO 359 | dff.decorators['args'] = None 360 | dff.decorators['kwargs'] = None 361 | return dff 362 | 363 | 364 | class SubAssembly(Operation): 365 | 366 | """Pipe for a Cascading SubAssembly. 367 | 368 | We can use it in PyCascading to make use of existing subassemblies, 369 | such as Unique. 370 | """ 371 | 372 | def __init__(self, sub_assembly_class, *args): 373 | """Create a pipe for a Cascading SubAssembly. 374 | 375 | This makes use of a cascading.pipe.SubAssembly class. 376 | 377 | Arguments: 378 | sub_assembly_class -- the Cascading SubAssembly class 379 | *args -- parameters passed on to the subassembly's constructor when 380 | it's initialized 381 | """ 382 | self.__sub_assembly_class = sub_assembly_class 383 | self.__args = args 384 | 385 | def _create_with_parent(self, parent): 386 | pipe = self.__sub_assembly_class(parent.get_assembly(), *self.__args) 387 | tails = pipe.getTails() 388 | if len(tails) == 1: 389 | result = tails[0] 390 | else: 391 | result = _Stackable() 392 | result.stack = tails 393 | return result 394 | -------------------------------------------------------------------------------- /python/pycascading/serializers.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 
3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Serialize a Python function. 17 | 18 | This module will serialize a Python function in one of two ways: 19 | * if the function is globally scoped, or a method of a class, it will 20 | serialize it by its name, the module, and class it was defined in. Note that 21 | methods of nested classes cannot be serialized, as nested classes don't hold 22 | references to their nesting class, so they cannot be reloaded from sources. 23 | * if the function is scoped locally (nested), we grab its source so that it 24 | can be reloaded on deserialization. 25 | 26 | Exports the following: 27 | replace_object 28 | """ 29 | 30 | 31 | import inspect, re, types 32 | 33 | import pipe 34 | 35 | 36 | def _remove_indents_from_function(code): 37 | """Remove leading indents from the function's source code. 38 | 39 | Otherwise an exec later when running the function would complain about 40 | the indents. 41 | """ 42 | 43 | def swap_tabs_to_spaces(line): 44 | new_line = '' 45 | for i in xrange(0, len(line)): 46 | if line[i] == ' ': 47 | new_line += line[i] 48 | elif line[i] == '\t': 49 | new_line += ' ' * 8 50 | else: 51 | new_line += line[i : len(line)] 52 | break 53 | return new_line 54 | 55 | lines = code.split('\n') 56 | indent = -1 57 | for line in lines: 58 | m = re.match('^([ \t]*)def\s.*$', line) 59 | if m: 60 | #print line, 'x', m.group(1), 'x' 61 | indent = len(swap_tabs_to_spaces(m.group(1))) 62 | break 63 | if indent < 0: 64 | raise Exception('No def found for function source') 65 | #print 'indent', indent 66 | result = '' 67 | for line in lines: 68 | line = swap_tabs_to_spaces(line) 69 | i = 0 70 | while i < len(line): 71 | if i < indent and line[i] == ' ': 72 | i += 1 73 | else: 74 | break 75 | result += line[i : len(line)] + '\n' 76 | return result 77 | 78 | 79 | def _get_source(func): 80 | """Return the source code for func.""" 81 | return _remove_indents_from_function(inspect.getsource(func)) 82 | 83 | 84 | def function_scope(func): 85 | if (not inspect.isfunction(func)) and (not inspect.ismethod(func)): 86 | raise Exception('Expecting a (non-built-in) function or method') 87 | name = func.func_name 88 | module = inspect.getmodule(func) 89 | module_name = module.__name__ 90 | if module_name == '__main__': 91 | module_name = '' 92 | enclosing_object = None 93 | if inspect.ismethod(func): 94 | if func.im_class == types.ClassType: 95 | # Function is a classmethod 96 | class_name = func.im_self.__name__ 97 | if class_name in dir(module): 98 | # Class is a top-level class in the module 99 | type = 'classmethod' 100 | source = None 101 | else: 102 | raise Exception('Class for @classmethod is nested, and Python ' 103 | 'cannot determine the nesting class, ' 104 | 'thus it\'s not allowed') 105 | else: 106 | # Function is a normal method 107 | class_name = func.im_class.__name__ 108 | enclosing_object = func.im_self 109 | if class_name in dir(module): 110 | # Class is a top-level class in the module 111 | type = 
'method' 112 | source = None 113 | else: 114 | raise Exception('The method\'s class is not top-level') 115 | else: 116 | # The function is a global or nested function, but not a method in a class 117 | class_name = None 118 | if name in dir(module): 119 | # Function is a global function 120 | type = 'global' 121 | source = None 122 | else: 123 | # Function is a closure 124 | type = 'closure' 125 | source = _get_source(func) 126 | return (type, module_name, class_name, name, source) 127 | 128 | 129 | def replace_object(obj): 130 | if inspect.isfunction(obj): 131 | return function_scope(obj) 132 | else: 133 | return None 134 | -------------------------------------------------------------------------------- /python/pycascading/tap.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011 Twitter, Inc. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | """Taps (sources and sinks) in PyCascading. 17 | 18 | All taps need to be registered using this module because Cascading expects 19 | them to be named by strings when running the flow. 20 | 21 | Exports the following: 22 | Flow 23 | read_hdfs_tsv_file 24 | """ 25 | 26 | __author__ = 'Gabor Szabo' 27 | 28 | 29 | from pycascading.pipe import random_pipe_name, Chainable, Pipe 30 | from com.twitter.pycascading import Util, MetaScheme 31 | 32 | import cascading.tap 33 | import cascading.scheme 34 | from cascading.tuple import Fields 35 | 36 | from org.apache.hadoop.fs import Path 37 | from org.apache.hadoop.conf import Configuration 38 | 39 | from pipe import random_pipe_name, Operation 40 | 41 | 42 | def expand_path_with_home(output_folder): 43 | """Prepend the home folder to a relative location on HDFS if necessary. 44 | 45 | Only if we specified a relative path and no scheme, prepend it with the 46 | home folder of the user on HDFS. This behavior is similar to how 47 | "hadoop fs" works. If we are running in local mode, don't do anything. 48 | 49 | Arguments: 50 | output_folder -- the absolute or relative path of the output HDFS folder 51 | """ 52 | import pycascading.pipe 53 | if pycascading.pipe.config['pycascading.running_mode'] == 'hadoop': 54 | if not any(map(lambda scheme: output_folder.startswith(scheme), \ 55 | ['hdfs:', 'file:', 's3:', 's3n:', '/'])): 56 | fs = Path('/').getFileSystem(Configuration()) 57 | home_folder = fs.getHomeDirectory().toString() 58 | return home_folder + '/' + output_folder 59 | return output_folder 60 | 61 | 62 | class Flow(object): 63 | 64 | """Define sources and sinks for the flow. 65 | 66 | This associates all sources and sinks with their head pipe mappings. 67 | The default number of reducers is 100. Set this in the num_reducers 68 | parameter when starting the flow with run(). 69 | """ 70 | 71 | def __init__(self): 72 | self.source_map = {} 73 | self.sink_map = {} 74 | self.tails = [] 75 | 76 | def _connect_source(self, pipe_name, cascading_tap): 77 | """Add a source to the flow. 
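# What the helpers in serializers.py above produce: function_scope classifies a
# function so the Java side can rebuild it on the workers, either by name or
# from source. Sketch (the module name is shown as '<module>' for brevity):

from pycascading.serializers import function_scope

def top_level(x):
    return x + 1

def make_adder(n):
    def adder(x):                   # nested: can only travel as source code
        return x + n
    return adder

function_scope(top_level)       # ('global', '<module>', None, 'top_level', None)
function_scope(make_adder(2))   # ('closure', '<module>', None, 'adder', '<source of adder>')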
78 | 79 | Cascading needs to map taps to a pipeline with string names. This is 80 | inconvenient, but we need to keep track of these strings. We also need 81 | to count references to taps, as sometimes we need to remove pipelines 82 | due to replacement with a cache, and in this case we may also need to 83 | remove a tap. Otherwise Cascading complains about not all 84 | taps/pipelines being connected up to the flow. 85 | """ 86 | self.source_map[pipe_name] = cascading_tap 87 | 88 | def source(self, cascading_tap): 89 | """A generic source using Cascading taps. 90 | 91 | Arguments: 92 | cascading_tap -- the Cascading Scheme object to store data into 93 | """ 94 | # We can create the source tap right away and also use a Pipe to name 95 | # the head of this pipeline 96 | p = Pipe(name=random_pipe_name('source')) 97 | p.hash = hash(cascading_tap) 98 | p.add_context([p.get_assembly().getName()]) 99 | self._connect_source(p.get_assembly().getName(), cascading_tap) 100 | return p 101 | 102 | def meta_source(self, input_path): 103 | """Use data files in a folder and read the scheme from the meta file. 104 | 105 | Defines a source tap using files in input_path, which should be a 106 | (HDFS) folder. Takes care of using the appropriate scheme that was 107 | used to store the data, using meta data in the data folder. 108 | 109 | Arguments: 110 | input_path -- the HDFS folder to store data into 111 | """ 112 | input_path = expand_path_with_home(input_path) 113 | source_scheme = MetaScheme.getSourceScheme(input_path) 114 | return self.source(cascading.tap.Hfs(source_scheme, input_path)) 115 | 116 | def sink(self, cascading_scheme): 117 | """A Cascading sink using a Cascading Scheme. 118 | 119 | Arguments: 120 | cascading_scheme -- the Cascading Scheme used to store the data 121 | """ 122 | return _Sink(self, cascading_scheme) 123 | 124 | def meta_sink(self, cascading_scheme, output_path): 125 | """Store data together with meta information about the scheme used. 126 | 127 | A sink that also stores in a file information about the scheme used to 128 | store data, and human-readable descriptions in the .pycascading_header 129 | and .pycascading_types files with the field names and their types, 130 | respectively. 131 | 132 | Arguments: 133 | cascading_scheme -- the Cascading Scheme used to store data 134 | output_path -- the folder where the output tuples should be stored. 135 | If it exists, it will be erased and replaced! 136 | """ 137 | output_path = expand_path_with_home(output_path) 138 | sink_scheme = MetaScheme.getSinkScheme(cascading_scheme, output_path) 139 | return self.sink(cascading.tap.Hfs(sink_scheme, output_path, 140 | cascading.tap.SinkMode.REPLACE)) 141 | 142 | def tsv_sink(self, output_path, fields=Fields.ALL): 143 | # TODO: in local mode, do not prepend the home folder to the path 144 | """A sink to store the tuples as tab-separated values in text files. 145 | 146 | Arguments: 147 | output_path -- the folder for the output 148 | fields -- the fields to store. Defaults to all fields. 149 | """ 150 | output_path = expand_path_with_home(output_path) 151 | return self.meta_sink(cascading.scheme.TextDelimited(fields, '\t'), 152 | output_path) 153 | 154 | def binary_sink(self, output_path, fields=Fields.ALL): 155 | """A sink to store binary sequence files to store the output. 156 | 157 | This is a sink that uses the efficient Cascading SequenceFile scheme to 158 | store data. 
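# An end-to-end sketch of the tap API above. The input folder, field names, and
# the TextDelimited scheme arguments are assumptions; meta_sink additionally
# records the scheme plus the .pycascading_header/.pycascading_types files.

from pycascading.tap import Flow
from pycascading.operators import retain
from cascading.tuple import Fields
import cascading.scheme
import cascading.tap

flow = Flow()
visits = flow.source(cascading.tap.Hfs(
    cascading.scheme.TextDelimited(Fields(['user', 'url']), '\t'), 'visit_logs'))
visits | retain('user') | flow.tsv_sink('users_only')
flow.run(num_reducers=10)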
This is a serialized version of all tuples and is 159 | recommended when we want to store intermediate results for fast access 160 | later. 161 | 162 | Arguments: 163 | output_path -- the (HDFS) folder to store data into 164 | fields -- the Cascading Fields field selector of which tuple fields to 165 | store. Defaults to Fields.ALL. 166 | """ 167 | output_path = expand_path_with_home(output_path) 168 | return self.meta_sink(cascading.scheme.SequenceFile(fields), 169 | output_path) 170 | 171 | def cache(self, identifier, refresh=False): 172 | """A sink for temporary results. 173 | 174 | This caches results into a temporary folder if the folder does not 175 | exist yet. If we need to run slightly modified versions of the 176 | PyCascading script several times during testing for instance, this is 177 | very useful to store some results that can be reused without having to 178 | go through the part of the flow that generated them again. 179 | 180 | Arguments: 181 | identifier -- the unique identifier for this cache. This is used as 182 | part of the path where the temporary files are stored. 183 | refresh -- if True, we will regenerate the cache data as if it was 184 | the first time creating it 185 | """ 186 | return _Cache(self, identifier, refresh) 187 | 188 | def run(self, num_reducers=50, config=None): 189 | """Start the Cascading job. 190 | 191 | We call this when we are done building the pipeline and explicitly want 192 | to start the flow process. 193 | """ 194 | sources_used = set([]) 195 | for tail in self.tails: 196 | sources_used.update(tail.context) 197 | # Remove unused sources from the source map 198 | source_map = {} 199 | for source in self.source_map.iterkeys(): 200 | if source in sources_used: 201 | source_map[source] = self.source_map[source] 202 | tails = [t.get_assembly() for t in self.tails] 203 | import pycascading.pipe 204 | Util.run(num_reducers, pycascading.pipe.config, source_map, \ 205 | self.sink_map, tails) 206 | 207 | 208 | class _Sink(Chainable): 209 | 210 | """A PyCascading sink that can be used as the tail in a pipeline. 211 | 212 | Used internally. 213 | """ 214 | 215 | def __init__(self, taps, cascading_tap): 216 | Chainable.__init__(self) 217 | self.__cascading_tap = cascading_tap 218 | self.__taps = taps 219 | 220 | def _create_with_parent(self, parent): 221 | # We need to name every tail differently so that Cascading can assign 222 | # a tail map to all sinks. 
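# cache() gives a tap that acts as sink and source at once: the first run writes
# a binary copy of the intermediate stream, and later runs read it back instead
# of recomputing it. Hedged sketch; `expensive` is a hypothetical pipe built
# earlier on this flow:

cached = expensive | flow.cache('join_step')
cached | flow.tsv_sink('final_output')
flow.run()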
223 | # TODO: revise this after I name every pipe part separately 224 | parent = parent | Pipe(name=random_pipe_name('sink')) 225 | self.__taps.sink_map[parent.get_assembly().getName()] = \ 226 | self.__cascading_tap 227 | self.__taps.tails.append(parent) 228 | return None 229 | 230 | 231 | class _Cache: 232 | 233 | """Act as a source or sink to store and retrieve temporary data.""" 234 | 235 | def __init__(self, taps, hdfs_folder, refresh=False): 236 | tmp_folder = 'pycascading.cache/' + hdfs_folder 237 | self.__cache_folder = expand_path_with_home(tmp_folder) 238 | self.__hdfs_folder_exists = \ 239 | self.hdfs_folder_exists(self.__cache_folder) 240 | self.__taps = taps 241 | self.__refresh = refresh 242 | 243 | def hdfs_folder_exists(self, folder): 244 | path = Path(folder) 245 | fs = path.getFileSystem(Configuration()) 246 | try: 247 | status = fs.getFileStatus(path) 248 | # TODO: there could be problems if it exists but is a simple file 249 | return status.isDir() 250 | except: 251 | return False 252 | 253 | def __or__(self, pipe): 254 | if not self.__refresh and self.__hdfs_folder_exists: 255 | # We remove all sources that are replaced by this cache, otherwise 256 | # Cascading complains about unused source taps 257 | return self.__taps.meta_source(self.__cache_folder) 258 | else: 259 | # We split the data into storing and processing pipelines 260 | pipe | Pipe(random_pipe_name('cache')) | \ 261 | self.__taps.binary_sink(self.__cache_folder) 262 | return pipe | Pipe(random_pipe_name('no_cache')) 263 | -------------------------------------------------------------------------------- /remote_deploy.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Copyright 2011 Twitter, Inc. 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # 19 | # This script is used to deploy a PyCascading job remotely to a server 20 | # where Hadoop is installed. The variables below are the defaults. 21 | # 22 | 23 | # This is the default server where the PyCascading script will be submitted 24 | # to Hadoop. We assume we have SSH access to this server. 25 | server=localhost 26 | 27 | # This is the folder on the remote server where a temporary directory is 28 | # going to be created for the submission. $HOME is only expanded on the 29 | # remote server. 30 | server_deploys_dir='$HOME/pycascading/deploys' 31 | 32 | # The folder on the remote server where the PyCascading master jar will be 33 | # placed. This must be given as an absolute path name so that the master 34 | # files can be found from any directory. 35 | server_build_dir='$HOME/pycascading/master' 36 | 37 | # Additional SSH options (see "man ssh"; private key, etc.) 
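# As implemented above, if the cache folder already exists (and refresh is
# False) the '|' swaps in a meta_source over the cached data; otherwise the
# stream is teed into a binary_sink and processing continues downstream.
# Regeneration can be forced per cache:

cached = expensive | flow.cache('join_step', refresh=True)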
38 | ssh_options="" 39 | 40 | # Additional Hadoop options to be put in the run.sh runner 41 | hadoop_options="" 42 | 43 | 44 | # Options over, the script begins here 45 | 46 | usage() 47 | { 48 | cat < [additional_files] 50 | 51 | The main_script gets executed by PyCascading. All additional_files are also 52 | copied to the remote server and submitted together with the job to Hadoop. 53 | 54 | Options: 55 | -h Show this message. 56 | 57 | -m Also deploy the PyCascading master archives before submitting 58 | the job. The master archives must be on the Hadoop server 59 | before a job can be run. 60 | 61 | -f Copy file to the server together with main_script, but 62 | do not bundle it up for submission. This option may be 63 | repeated several times for multiple files. File names 64 | cannot start with a dot. 65 | 66 | -s The name of the remote server where Hadoop is installed, 67 | and the PyCascading scripts should be deployed to. 68 | 69 | -o Additional options for SSH (such as private key, etc.). 70 | ssh_options is one string enclosed by "s or 's, even if 71 | there are several parameters. 72 | 73 | -O Additional Hadoop options to be put in the running script. 74 | 75 | -r Run the job immediately after submission with SSH. The 76 | recommended way to run a script is either using screen 77 | or nohup, so that the job doesn't get interrupted if the 78 | terminal connection goes down. Note that no additional 79 | command line parameters can be passed in this case for 80 | the job. 81 | 82 | EOF 83 | } 84 | 85 | 86 | # Returns the absolute path for the parameter. We cannot use either realpath 87 | # or readlink, as these may not be installed on MacOS. 88 | # Thanks to Simon Radford. 89 | realpath() 90 | { 91 | if echo "$1" | grep '^/' >/dev/null; then 92 | # Path is absolute 93 | echo "$1" 94 | else 95 | # Path is relative to the working directory 96 | echo "$(pwd)/$1" 97 | fi 98 | } 99 | 100 | 101 | # Remove the leading slashes from a path. This is needed when we package the 102 | # Python sources as tar does the same, and on extraction there are no leading 103 | # slashes. 104 | remove_leading_slash() 105 | { 106 | echo "$1" | sed 's/^\/*//' 107 | } 108 | 109 | 110 | # Copy the master jar over first? The -m option. 111 | master_first=no 112 | 113 | # Run job after submission with SSH? 114 | run_immediately='dont_run' 115 | 116 | declare -a files_to_copy 117 | 118 | while getopts ":hmf:s:o:O:r" OPTION; do 119 | case $OPTION in 120 | h) usage 121 | exit 1 122 | ;; 123 | m) master_first=yes 124 | ;; 125 | f) files_to_copy=("${files_to_copy[@]}" "$OPTARG") 126 | ;; 127 | s) server="$OPTARG" 128 | ;; 129 | o) ssh_options="$OPTARG" 130 | ;; 131 | O) hadoop_options="$OPTARG" 132 | ;; 133 | r) run_immediately='do_run' 134 | ;; 135 | esac 136 | done 137 | shift $((OPTIND-1)) 138 | 139 | main_file="$1" 140 | if [ "$main_file" == "" -a $master_first == no ]; then 141 | usage 142 | exit 3 143 | fi 144 | 145 | home_dir=$(realpath $(dirname "$0")) 146 | # This is the version that works both on Linux and MacOS 147 | tmp_dir=$(mktemp -d -t PyCascading-tmp-XXXXXX) 148 | 149 | if [ $master_first == yes ]; then 150 | build_dir="$home_dir/build" 151 | if [ -a "$build_dir/pycascading.jar" -a \ 152 | -a "$build_dir/pycascading.tgz" ]; then 153 | ln -s "$build_dir/pycascading.jar" "$build_dir/pycascading.tgz" \ 154 | "$home_dir/python/pycascading/bootstrap.py" "$tmp_dir" 155 | else 156 | echo 'Build the PyCascading master package first in the "java" folder with ant.' 
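# Hedged example invocations (host and file names are placeholders only):
#
#   ./remote_deploy.sh -m -s hadoop-gateway.example.com my_flow.py
#       First deployment: also ship the PyCascading master archives, then
#       submit my_flow.py.
#
#   ./remote_deploy.sh -s hadoop-gateway.example.com -r -f lookup.txt my_flow.py
#       Later runs: copy lookup.txt alongside the script and start the job
#       immediately over SSH.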
157 | exit 2 158 | fi 159 | fi 160 | 161 | if [ "$main_file" != "" ]; then 162 | tar -c -z -f "$tmp_dir/sources.tgz" "$@" 163 | if [ ${#files_to_copy} -gt 0 ]; then 164 | tar -c -z -f "$tmp_dir/others.tgz" "${files_to_copy[@]}" 165 | fi 166 | fi 167 | 168 | # 169 | # Create a setup file that will be run on the deploy server after everything 170 | # is copied over. 171 | # 172 | cat >"$tmp_dir/setup.sh" <"$tmp_dir/run.sh" <