├── .gitignore
├── LICENSE.txt
├── NOTICE.txt
├── README.md
├── THIRD-PARTY-LICENSES
├── awsglue
│   ├── README.md
│   ├── __init__.py
│   ├── context.py
│   ├── data_sink.py
│   ├── data_source.py
│   ├── dataframe_transforms
│   │   ├── __init__.py
│   │   └── apply_mapping.py
│   ├── dataframereader.py
│   ├── dataframewriter.py
│   ├── devutils.py
│   ├── dynamicframe.py
│   ├── functions.py
│   ├── glue_shell.py
│   ├── gluetypes.py
│   ├── job.py
│   ├── scripts
│   │   ├── __init__.py
│   │   ├── activate_etl_connector.py
│   │   ├── connector_activation_util.py
│   │   ├── crawler_redo_from_backup.py
│   │   ├── crawler_undo.py
│   │   └── scripts_utils.py
│   ├── streaming_data_source.py
│   ├── transforms
│   │   ├── __init__.py
│   │   ├── apply_mapping.py
│   │   ├── coalesce.py
│   │   ├── collection_transforms.py
│   │   ├── drop_nulls.py
│   │   ├── dynamicframe_filter.py
│   │   ├── dynamicframe_map.py
│   │   ├── errors_as_dynamicframe.py
│   │   ├── field_transforms.py
│   │   ├── relationalize.py
│   │   ├── repartition.py
│   │   ├── resolve_choice.py
│   │   ├── transform.py
│   │   ├── unbox.py
│   │   ├── union.py
│   │   └── unnest_frame.py
│   └── utils.py
├── bin
│   ├── glue-setup.sh
│   ├── gluepyspark
│   ├── gluepytest
│   └── gluesparksubmit
├── pom.xml
└── setup.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | PyGlue.zip
3 | conf/
4 | jars/
5 | /jarsv1/
6 | derby.log
7 | metastore_db/
8 | *.DS_Store

--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Amazon Software License 1.0
2 | 
3 | This Amazon Software License ("License") governs your use, reproduction, and
4 | distribution of the accompanying software as specified below.
5 | 
6 | 1. Definitions
7 | 
8 | "Licensor" means any person or entity that distributes its Work.
9 | 
10 | "Software" means the original work of authorship made available under this
11 | License.
12 | 
13 | "Work" means the Software and any additions to or derivative works of the
14 | Software that are made available under this License.
15 | 
16 | The terms "reproduce," "reproduction," "derivative works," and
17 | "distribution" have the meaning as provided under U.S. copyright law;
18 | provided, however, that for the purposes of this License, derivative works
19 | shall not include works that remain separable from, or merely link (or bind
20 | by name) to the interfaces of, the Work.
21 | 
22 | Works, including the Software, are "made available" under this License by
23 | including in or with the Work either (a) a copyright notice referencing the
24 | applicability of this License to the Work, or (b) a copy of this License.
25 | 
26 | 2. License Grants
27 | 
28 | 2.1 Copyright Grant. Subject to the terms and conditions of this License,
29 | each Licensor grants to you a perpetual, worldwide, non-exclusive,
30 | royalty-free, copyright license to reproduce, prepare derivative works of,
31 | publicly display, publicly perform, sublicense and distribute its Work and
32 | any resulting derivative works in any form.
33 | 
34 | 2.2 Patent Grant. Subject to the terms and conditions of this License, each
35 | Licensor grants to you a perpetual, worldwide, non-exclusive, royalty-free
36 | patent license to make, have made, use, sell, offer for sale, import, and
37 | otherwise transfer its Work, in whole or in part. The foregoing license
38 | applies only to the patent claims licensable by Licensor that would be
39 | infringed by Licensor's Work (or portion thereof) individually and
40 | excluding any combinations with any other materials or technology.
41 | 
42 | 3. Limitations
43 | 
44 | 3.1 Redistribution. You may reproduce or distribute the Work only if
45 | (a) you do so under this License, (b) you include a complete copy of this
46 | License with your distribution, and (c) you retain without modification
47 | any copyright, patent, trademark, or attribution notices that are present
48 | in the Work.
49 | 
50 | 3.2 Derivative Works. You may specify that additional or different terms
51 | apply to the use, reproduction, and distribution of your derivative works
52 | of the Work ("Your Terms") only if (a) Your Terms provide that the use
53 | limitation in Section 3.3 applies to your derivative works, and (b) you
54 | identify the specific derivative works that are subject to Your Terms.
55 | Notwithstanding Your Terms, this License (including the redistribution
56 | requirements in Section 3.1) will continue to apply to the Work itself.
57 | 
58 | 3.3 Use Limitation. The Work and any derivative works thereof only may be
59 | used or intended for use with the web services, computing platforms or
60 | applications provided by Amazon.com, Inc. or its affiliates, including
61 | Amazon Web Services, Inc.
62 | 
63 | 3.4 Patent Claims. If you bring or threaten to bring a patent claim against
64 | any Licensor (including any claim, cross-claim or counterclaim in a
65 | lawsuit) to enforce any patents that you allege are infringed by any Work,
66 | then your rights under this License from such Licensor (including the
67 | grants in Sections 2.1 and 2.2) will terminate immediately.
68 | 
69 | 3.5 Trademarks. This License does not grant any rights to use any
70 | Licensor's or its affiliates' names, logos, or trademarks, except as
71 | necessary to reproduce the notices described in this License.
72 | 
73 | 3.6 Termination. If you violate any term of this License, then your rights
74 | under this License (including the grants in Sections 2.1 and 2.2) will
75 | terminate immediately.
76 | 
77 | 4. Disclaimer of Warranty.
78 | 
79 | THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
80 | EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
81 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR
82 | NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER
83 | THIS LICENSE. SOME STATES' CONSUMER LAWS DO NOT ALLOW EXCLUSION OF AN
84 | IMPLIED WARRANTY, SO THIS DISCLAIMER MAY NOT APPLY TO YOU.
85 | 
86 | 5. Limitation of Liability.
87 | 
88 | EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
89 | THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
90 | SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
91 | INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF OR
92 | RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK (INCLUDING
93 | BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, LOST PROFITS
94 | OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER COMMERCIAL DAMAGES
95 | OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF
96 | SUCH DAMAGES.

--------------------------------------------------------------------------------
/NOTICE.txt:
--------------------------------------------------------------------------------
1 | aws-glue-libs
2 | Copyright 2016-2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | 
4 | 

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # aws-glue-libs
2 | 
3 | This repository contains Python libraries for local development of AWS Glue PySpark batch jobs. Glue streaming is supported in the separate repository [aws-glue-streaming-libs](https://github.com/awslabs/aws-glue-streaming-libs).
4 | 
5 | ## Contents
6 | This repository contains:
7 | * `awsglue` - the Python library you can use to author [AWS Glue](https://aws.amazon.com/glue) ETL jobs. This library extends [Apache Spark](https://spark.apache.org/) with additional data types and operations for ETL workflows. It is the Python interface to the Glue ETL library.
8 | * `bin` - this directory hosts several executables that allow you to run the Python library locally or open up a PySpark shell to run Glue Spark code interactively.
9 | 
10 | ## Python versions by Glue Version
11 | 
12 | Different Glue versions support different Python versions. The following table lists the supported Python version and the associated branch of this repository for each Glue version.
13 | 
14 | | Glue Version | Python 3 Version | aws-glue-libs branch |
15 | |---|---|----------------------|
16 | | 2.0 | 3.7 | glue-2.0 |
17 | | 3.0 | 3.7 | glue-3.0 |
18 | | 4.0 | 3.10 | master |
19 | 
20 | You may refer to AWS Glue's official [release notes](https://docs.aws.amazon.com/glue/latest/dg/release-notes.html) for more information.
21 | 
22 | ## Setup guide
23 | 
24 | If you haven't already, please refer to the [official AWS Glue Python local development documentation](https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-libraries.html#develop-local-python) for the full setup instructions. The following is a summary of the AWS documentation:
25 | 
26 | The `awsglue` library provides only the Python interface to the Glue Spark runtime; you also need the Glue ETL jar to run it locally. The jar is available via Apache Maven from an S3-backed Maven repository. Here are the steps to set up your dev environment locally.
27 | 
28 | 1. install Apache Maven from the following location: https://aws-glue-etl-artifacts.s3.amazonaws.com/glue-common/apache-maven-3.6.0-bin.tar.gz
29 | 1. use the `copy-dependencies` target in Apache Maven to download the jar from S3 to your local dev environment.
30 | 1. download and extract the Apache Spark distribution based on the Glue version you're using:
31 |    * Glue version 2.0: `https://aws-glue-etl-artifacts.s3.amazonaws.com/glue-2.0/spark-2.4.3-bin-hadoop2.8.tgz`
32 |    * Glue version 3.0: `https://aws-glue-etl-artifacts.s3.amazonaws.com/glue-3.0/spark-3.1.1-amzn-0-bin-3.2.1-amzn-3.tgz`
33 |    * Glue version 4.0: `https://aws-glue-etl-artifacts.s3.amazonaws.com/glue-4.0/spark-3.3.0-amzn-1-bin-3.3.3-amzn-0.tgz`
34 | 1. export the `SPARK_HOME` environment variable to the extracted location of the above Spark distribution. For example:
35 | ```
36 | Glue version 2.0: export SPARK_HOME=/home/$USER/spark-2.4.3-bin-hadoop2.8
37 | Glue version 3.0: export SPARK_HOME=/home/$USER/spark-3.1.1-amzn-0-bin-3.2.1-amzn-3
38 | Glue version 4.0: export SPARK_HOME=/home/$USER/spark-3.3.0-amzn-1-bin-3.3.3-amzn-0
39 | ```
40 | 1. now you can run the executables in the `bin` directory to start a Glue Shell or submit a Glue Spark application.
41 | ```
42 | Glue shell: ./bin/gluepyspark
43 | Glue submit: ./bin/gluesparksubmit
44 | pytest: ./bin/gluepytest
45 | ```
46 | (The `gluepytest` script assumes that the pytest module is installed and available via the `PATH` environment variable.)
47 | 
48 | ## Licensing
49 | 
50 | The libraries in this repository are licensed under the [Amazon Software License](http://aws.amazon.com/asl/) (the "License"). They may not be used except in compliance with the License, a copy of which is included here in the LICENSE file.
51 | 
52 | ---
53 | 
54 | # Release Notes
55 | 
56 | ## July 26 2023
57 | * In accordance with the [AWS Glue version support policy](https://docs.aws.amazon.com/glue/latest/dg/glue-version-support-policy.html), the branches for Glue 0.9 and 1.0 have been removed, as those versions are deprecated.
58 | 
59 | 
60 | ## August 27 2021
61 | * The master branch has been changed to represent Glue 3.0 instead of Glue 0.9; we have also created a glue-0.9 branch to reflect the former state of the master branch. To rename your local clone of the older master branch and point it to the glue-0.9 branch, you may use the following commands:
62 | ```
63 | git branch -m master glue-0.9
64 | git fetch origin
65 | git branch -u origin/glue-0.9 glue-0.9
66 | git remote set-head origin -a
67 | ```
68 | 

--------------------------------------------------------------------------------
/awsglue/README.md:
--------------------------------------------------------------------------------
1 | # awsglue
2 | 
3 | The awsglue Python package contains the Python portion of the [AWS Glue](https://aws.amazon.com/glue) library. This library extends [PySpark](http://spark.apache.org/docs/2.1.0/api/python/pyspark.html) to support serverless ETL on AWS.
4 | 
5 | Note that this package must be used in conjunction with the AWS Glue service and is not executable independently. Many of the classes and methods use the Py4J library to interface with code that is available on the Glue platform. This repository can be used as a reference and aid for writing Glue scripts.
6 | 
7 | While scripts using this library can only be run on the AWS Glue service, it is possible to import this library locally. This may be helpful to provide auto-completion in an IDE, for instance. To import the library successfully you will need to install PySpark, which can be done using pip:
8 | 
9 |     pip install pyspark
10 | 
11 | ## Content
12 | 
13 | This package contains Python interfaces to the key data structures and methods used in AWS Glue. The following are some important modules. More information can be found in the public documentation.
14 | 
15 | 
16 | #### GlueContext
17 | The file [context.py](context.py) contains the GlueContext class. GlueContext extends PySpark's [SQLContext](https://github.com/apache/spark/blob/master/python/pyspark/sql/context.py) class to provide Glue-specific operations. Most Glue programs will start by instantiating a GlueContext and using it to construct a DynamicFrame.
18 | 
19 | 
20 | #### DynamicFrame
21 | The DynamicFrame, defined in [dynamicframe.py](dynamicframe.py), is the core data structure used in Glue scripts. DynamicFrames are similar to Spark SQL's [DataFrames](https://github.com/apache/spark/blob/master/python/pyspark/sql/dataframe.py) in that they represent distributed collections of data records, but DynamicFrames provide more flexible handling of data sets with inconsistent schemas. By representing records in a self-describing way, they can be used without specifying a schema up front or requiring a costly schema inference step.
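For example, the following sketch (runnable only with the AWS Glue service runtime; the database and table names are placeholders) constructs a DynamicFrame from a catalog table and prints its schema without declaring one up front:

    from pyspark.context import SparkContext
    from awsglue.context import GlueContext

    glueContext = GlueContext(SparkContext.getOrCreate())
    dyf = glueContext.create_dynamic_frame.from_catalog(
        database="mydb", table_name="mytable")
    dyf.printSchema()  # schema is derived from the self-describing records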
22 | 
23 | DynamicFrames support many operations, but it is also possible to convert them to DataFrames using the `toDF` method to make use of existing Spark SQL operations.
24 | 
25 | 
26 | #### Transforms
27 | 
28 | The [transforms](transforms/) directory contains a variety of operations that can be performed on DynamicFrames. These include simple operations, such as `DropFields`, as well as more complex transformations like `Relationalize`, which flattens a nested data set into a collection of tables that can be loaded into a relational database. Once imported, transforms can be invoked using the following syntax:
29 | 
30 |     TransformClass.apply(args...)
31 | 
32 | ## Additional Resources
33 | 
34 | - The [aws-glue-samples](https://github.com/awslabs/aws-glue-samples) repository contains sample scripts that make use of the awsglue library and can be submitted directly to the AWS Glue service.
35 | 
36 | - The public [Glue Documentation](http://docs.aws.amazon.com/glue/latest/dg/index.html) contains information about the AWS Glue service as well as additional information about the Python library.
37 | 

--------------------------------------------------------------------------------
/awsglue/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # Licensed under the Amazon Software License (the "License"). You may not use
3 | # this file except in compliance with the License. A copy of the License is
4 | # located at
5 | # 
6 | # http://aws.amazon.com/asl/
7 | # 
8 | # or in the "license" file accompanying this file. This file is distributed
9 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
10 | # or implied. See the License for the specific language governing
11 | # permissions and limitations under the License.
12 | 
13 | from .dynamicframe import DynamicFrame
14 | 
15 | __all__ = ['DynamicFrame']
16 | 

--------------------------------------------------------------------------------
/awsglue/data_sink.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # Licensed under the Amazon Software License (the "License"). You may not use
3 | # this file except in compliance with the License. A copy of the License is
4 | # located at
5 | # 
6 | # http://aws.amazon.com/asl/
7 | # 
8 | # or in the "license" file accompanying this file. This file is distributed
9 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
10 | # or implied. See the License for the specific language governing
11 | # permissions and limitations under the License.
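# Illustrative use of the DataSink class below (a sketch only: it requires the
# AWS Glue runtime, and the S3 path shown is a placeholder). A sink is
# typically obtained from GlueContext.getSink rather than constructed directly:
#
#     sink = glue_context.getSink(connection_type="s3", path="s3://my-bucket/out/")
#     sink.setFormat("json")
#     errors = sink.writeFrame(dynamic_frame)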
12 | 
13 | from awsglue.dynamicframe import DynamicFrame, DynamicFrameCollection
14 | from awsglue.utils import makeOptions, callsite
15 | from pyspark.sql import DataFrame
16 | 
17 | class DataSink(object):
18 |     def __init__(self, j_sink, sql_ctx):
19 |         self._jsink = j_sink
20 |         self._sql_ctx = sql_ctx
21 | 
22 |     def setFormat(self, format, **options):
23 |         self._jsink.setFormat(format, makeOptions(self._sql_ctx._sc, options))
24 | 
25 |     def setAccumulableSize(self, size):
26 |         self._jsink.setAccumulableSize(size)
27 | 
28 |     def setCatalogInfo(self, catalogDatabase, catalogTableName, catalogId = ""):
29 |         self._jsink.setCatalogInfo(catalogDatabase, catalogTableName, catalogId)
30 | 
31 |     def writeFrame(self, dynamic_frame, info = ""):
32 |         return DynamicFrame(self._jsink.pyWriteDynamicFrame(dynamic_frame._jdf, callsite(), info), dynamic_frame.glue_ctx, dynamic_frame.name + "_errors")
33 | 
34 |     def writeDataFrame(self, data_frame, glue_context, info = ""):
35 |         return DataFrame(self._jsink.pyWriteDataFrame(data_frame._jdf, glue_context._glue_scala_context, callsite(), info), self._sql_ctx)
36 | 
37 |     def write(self, dynamic_frame_or_dfc, info = ""):
38 |         if isinstance(dynamic_frame_or_dfc, DynamicFrame):
39 |             return self.writeFrame(dynamic_frame_or_dfc, info)
40 | 
41 |         elif isinstance(dynamic_frame_or_dfc, DynamicFrameCollection):
42 |             res_frames = [self.writeFrame(frame)
43 |                           for frame in dynamic_frame_or_dfc.values()]
44 |             return DynamicFrameCollection(res_frames, self._sql_ctx)
45 | 
46 |         else:
47 |             raise TypeError("dynamic_frame_or_dfc must be an instance of "
48 |                             "DynamicFrame or DynamicFrameCollection. Got "
49 |                             + str(type(dynamic_frame_or_dfc)))

--------------------------------------------------------------------------------
/awsglue/data_source.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # Licensed under the Amazon Software License (the "License"). You may not use
3 | # this file except in compliance with the License. A copy of the License is
4 | # located at
5 | # 
6 | # http://aws.amazon.com/asl/
7 | # 
8 | # or in the "license" file accompanying this file. This file is distributed
9 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
10 | # or implied. See the License for the specific language governing
11 | # permissions and limitations under the License.
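# Illustrative use of the DataSource class below (a sketch only: it requires
# the AWS Glue runtime, and the S3 path shown is a placeholder). A source is
# typically obtained from GlueContext.getSource rather than constructed
# directly:
#
#     source = glue_context.getSource("s3", paths=["s3://my-bucket/in/"])
#     source.setFormat("json")
#     frame = source.getFrame()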
12 | 13 | from awsglue.dynamicframe import DynamicFrame 14 | from awsglue.utils import makeOptions, callsite 15 | 16 | class DataSource(object): 17 | def __init__(self, j_source, sql_ctx, name): 18 | self._jsource = j_source 19 | self._sql_ctx = sql_ctx 20 | self.name = name 21 | 22 | def setFormat(self, format, **options): 23 | options["callSite"] = callsite() 24 | self._jsource.setFormat(format, makeOptions(self._sql_ctx._sc, options)) 25 | 26 | def getFrame(self, **options): 27 | minPartitions = targetPartitions = None 28 | 29 | if 'minPartitions' in options: 30 | minPartitions = options['minPartitions'] 31 | targetPartitions = options.get('targetPartitions', minPartitions) 32 | elif 'targetPartitions' in options: 33 | minPartitions = targetPartitions = options['targetPartitions'] 34 | 35 | if minPartitions is None: 36 | jframe = self._jsource.getDynamicFrame() 37 | else: 38 | jframe = self._jsource.getDynamicFrame(minPartitions, targetPartitions) 39 | 40 | return DynamicFrame(jframe, self._sql_ctx, self.name) 41 | 42 | def getSampleFrame(self, num, **options): 43 | jframe = self._jsource.getSampleDynamicFrame(num, makeOptions(self._sql_ctx._sc, options)) 44 | return DynamicFrame(jframe, self._sql_ctx, self.name) -------------------------------------------------------------------------------- /awsglue/dataframe_transforms/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # Licensed under the Amazon Software License (the "License"). You may not use 3 | # this file except in compliance with the License. A copy of the License is 4 | # located at 5 | # 6 | # http://aws.amazon.com/asl/ 7 | # 8 | # or in the "license" file accompanying this file. This file is distributed 9 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express 10 | # or implied. See the License for the specific language governing 11 | # permissions and limitations under the License. 12 | 13 | from .apply_mapping import ApplyMapping 14 | 15 | ALL_TRANSFORMS = {ApplyMapping} 16 | 17 | __all__ = [transform.__name__ for transform in ALL_TRANSFORMS] 18 | -------------------------------------------------------------------------------- /awsglue/dataframe_transforms/apply_mapping.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # Licensed under the Amazon Software License (the "License"). You may not use 3 | # this file except in compliance with the License. A copy of the License is 4 | # located at 5 | # 6 | # http://aws.amazon.com/asl/ 7 | # 8 | # or in the "license" file accompanying this file. This file is distributed 9 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express 10 | # or implied. See the License for the specific language governing 11 | # permissions and limitations under the License. 12 | 13 | from py4j.java_gateway import java_import 14 | from pyspark.sql.dataframe import DataFrame 15 | 16 | class ApplyMapping(): 17 | @staticmethod 18 | def apply(frame, mappings): 19 | jvm = frame.sql_ctx._jvm 20 | 21 | def _to_java_mapping(mapping_tup): 22 | if not isinstance(mapping_tup, tuple): 23 | raise TypeError("Mapping must be specified as a tuple. 
Got " +
24 |                                 str(mapping_tup))
25 | 
26 |             tup2 = jvm.scala.Tuple2
27 |             tup3 = jvm.scala.Tuple3
28 |             tup4 = jvm.scala.Tuple4
29 | 
30 |             if len(mapping_tup) == 2:
31 |                 return tup2.apply(mapping_tup[0], mapping_tup[1])
32 |             elif len(mapping_tup) == 3:
33 |                 return tup3.apply(mapping_tup[0], mapping_tup[1], mapping_tup[2])
34 |             elif len(mapping_tup) == 4:
35 |                 return tup4.apply(mapping_tup[0], mapping_tup[1], mapping_tup[2], mapping_tup[3])
36 |             else:
37 |                 raise ValueError("Mapping tuple must be of length 2, 3, or 4. "
38 |                                  "Got tuple of length " + str(len(mapping_tup)))
39 | 
40 |         if isinstance(mappings, tuple):
41 |             mappings = [mappings]
42 | 
43 |         mappings_seq = jvm.PythonUtils.toSeq([_to_java_mapping(m) for m in mappings])
44 | 
45 |         java_import(jvm, "com.amazonaws.services.glue.dataframeTransforms.ApplyMapping")
46 | 
47 |         return DataFrame(jvm.ApplyMapping.apply(frame._jdf, mappings_seq), frame.sql_ctx)
48 | 
49 |     @classmethod
50 |     def describeArgs(cls):
51 |         arg1 = {"name": "frame",
52 |                 "type": "DataFrame",
53 |                 "description": "DataFrame to transform",
54 |                 "optional": False,
55 |                 "defaultValue": None}
56 |         arg2 = {"name": "mappings",
57 |                 "type": "List",
58 |                 "description": "List of mapping tuples (source col, source type, target col, target type)",
59 |                 "optional": False,
60 |                 "defaultValue": None}
61 | 
62 |         return [arg1, arg2]
63 | 
64 |     @classmethod
65 |     def describeTransform(cls):
66 |         return "Apply a declarative mapping to this DataFrame."
67 | 
68 |     @classmethod
69 |     def describeErrors(cls):
70 |         return []
71 | 
72 |     @classmethod
73 |     def describeReturn(cls):
74 |         return {"type": "DataFrame",
75 |                 "description": "DataFrame after applying mappings."}
76 | 
77 | 

--------------------------------------------------------------------------------
/awsglue/dataframereader.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # Licensed under the Amazon Software License (the "License"). You may not use
3 | # this file except in compliance with the License. A copy of the License is
4 | # located at
5 | # 
6 | # http://aws.amazon.com/asl/
7 | # 
8 | # or in the "license" file accompanying this file. This file is distributed
9 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
10 | # or implied. See the License for the specific language governing
11 | # permissions and limitations under the License.
12 | 
13 | class DataFrameReader(object):
14 |     def __init__(self, glue_context):
15 |         self._glue_context = glue_context
16 | 
17 |     def from_catalog(self, database = None, table_name = None, redshift_tmp_dir = "", transformation_ctx = "", push_down_predicate = "", additional_options = {}, catalog_id = None, **kwargs):
18 |         """Creates a DataFrame with the specified catalog namespace and table name.
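        Illustrative use (a sketch; the database and table names are placeholders):

            reader = DataFrameReader(glue_context)
            df = reader.from_catalog(database="mydb", table_name="mytable")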
19 | """ 20 | if database is not None and "name_space" in kwargs: 21 | raise Exception("Parameter name_space and database are both specified, choose one.") 22 | elif database is None and "name_space" not in kwargs: 23 | raise Exception("Parameter name_space or database is missing.") 24 | elif "name_space" in kwargs: 25 | db = kwargs.pop("name_space") 26 | else: 27 | db = database 28 | 29 | if table_name is None: 30 | raise Exception("Parameter table_name is missing.") 31 | 32 | return self._glue_context.create_data_frame_from_catalog(db, table_name, redshift_tmp_dir, transformation_ctx, push_down_predicate, additional_options, catalog_id, **kwargs) 33 | 34 | def from_options(self, connection_type, connection_options={}, 35 | format=None, format_options={}, transformation_ctx="", push_down_predicate = "", **kwargs): 36 | """Creates a DataFrame with the specified connection and format. 37 | """ 38 | return self._glue_context.create_data_frame_from_options(connection_type, 39 | connection_options, 40 | format, 41 | format_options, transformation_ctx, push_down_predicate, **kwargs) 42 | -------------------------------------------------------------------------------- /awsglue/dataframewriter.py: -------------------------------------------------------------------------------- 1 | class DataFrameWriter(object): 2 | def __init__(self, glue_context): 3 | self._glue_context = glue_context 4 | def from_catalog(self, frame, database=None, table_name=None, redshift_tmp_dir="", transformation_ctx="", 5 | additional_options={}, catalog_id=None, **kwargs): 6 | """Writes a DataFrame with the specified catalog name space and table name. 7 | """ 8 | if database is not None and "name_space" in kwargs: 9 | raise Exception("Parameter name_space and database are both specified, choose one.") 10 | elif database is None and "name_space" not in kwargs: 11 | raise Exception("Parameter name_space or database is missing.") 12 | elif "name_space" in kwargs: 13 | db = kwargs.pop("name_space") 14 | else: 15 | db = database 16 | 17 | if table_name is None: 18 | raise Exception("Parameter table_name is missing.") 19 | 20 | return self._glue_context.write_data_frame_from_catalog(frame, db, table_name, redshift_tmp_dir, 21 | transformation_ctx, additional_options, catalog_id) 22 | -------------------------------------------------------------------------------- /awsglue/devutils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # Licensed under the Amazon Software License (the "License"). You may not use 3 | # this file except in compliance with the License. A copy of the License is 4 | # located at 5 | # 6 | # http://aws.amazon.com/asl/ 7 | # 8 | # or in the "license" file accompanying this file. This file is distributed 9 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express 10 | # or implied. See the License for the specific language governing 11 | # permissions and limitations under the License. 
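# Illustrative use of the developer utilities below (a sketch only: it requires
# a Glue development endpoint where AWSConnectionUtils is available on the JVM):
#
#     utils = GlueJobUtils(glue_context)
#     jobs_response = utils.get_jobs()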
12 | 13 | from __future__ import print_function 14 | import boto3 15 | import os 16 | import logging 17 | import copy 18 | from datetime import datetime 19 | 20 | 21 | class ExecutionProperty: 22 | def __init__(self, maxConcurrentRuns=1): 23 | self.maxConcurrentRuns = maxConcurrentRuns 24 | 25 | def __repr__(self): 26 | return "{'maxConcurrentRuns': "+ str(self.maxConcurrentRuns)+ "}" 27 | 28 | def as_dict(self): 29 | return {'maxConcurrentRuns': self.maxConcurrentRuns} 30 | 31 | 32 | class Command: 33 | def __init__(self, name, scriptLocation): 34 | self.name=name 35 | self.scriptLocation=scriptLocation 36 | 37 | def __repr__(self): 38 | return "{'name': '"+ str(self.name)+",' 'scriptLocation': '"+ str(self.scriptLocation)+"'}" 39 | 40 | def as_dict(self): 41 | return {'name': self.name, 'scriptLocation': self.scriptLocation} 42 | 43 | 44 | class Connections: 45 | def __init__(self, connections=[]): 46 | self.connections=connections 47 | 48 | def __repr__(self): 49 | return "{'connections': "+str(self.connections) + "}" 50 | 51 | def as_dict(self): 52 | return {'connections': self.connections} 53 | 54 | 55 | class Job: 56 | def __init__(self): 57 | self.name = '' 58 | self.description = '' 59 | self.logUri = '' 60 | self.role = '' 61 | self.executionProperty = ExecutionProperty() 62 | self.command = Command("glueetl", "UNKNOWN") 63 | self.defaultArguments = {} 64 | self.connections = Connections() 65 | self.maxRetries = 1 66 | self.allocatedCapacity = 1 67 | self.createdOn = datetime.now() 68 | self.lastModifiedOn = datetime.now() 69 | 70 | def __repr__(self): 71 | return "{'command': "+str(self.command) + ",\n" + \ 72 | "'connections': "+str(self.connections) + ",\n" + \ 73 | "'createdOn': "+str(self.createdOn) + ",\n" + \ 74 | "'description': '"+str(self.description) + "',\n" + \ 75 | "'defaultArguments': "+str(self.defaultArguments) + ",\n" + \ 76 | "'executionProperty': "+str(self.executionProperty) + ",\n" + \ 77 | "'lastModifiedOn': "+str(self.lastModifiedOn) + ",\n" + \ 78 | "'logUri': '"+str(self.logUri) + "',\n" + \ 79 | "'maxRetries': "+str(self.maxRetries) + ",\n" + \ 80 | "'name': '"+str(self.name) + "',\n" + \ 81 | "'role': '"+str(self.role) + "',\n" + \ 82 | "}" 83 | 84 | def as_dict(self): 85 | job_dict = {} 86 | job_dict['command'] = self.command.as_dict() 87 | if len(self.connections.connections) > 0: 88 | job_dict['connections'] = self.connections.as_dict() 89 | job_dict['createdOn'] = self.createdOn 90 | if len(self.description) > 0: 91 | job_dict['description'] = self.description 92 | job_dict['defaultArguments'] = self.defaultArguments 93 | job_dict['executionProperty'] = self.executionProperty.as_dict() 94 | job_dict['lastModifiedOn'] = self.lastModifiedOn 95 | job_dict['logUri'] = self.logUri 96 | job_dict['maxRetries'] = self.maxRetries 97 | job_dict['name'] = self.name 98 | job_dict['role'] = self.role 99 | return job_dict 100 | 101 | def as_job_create_dict(self): 102 | job_dict = copy.deepcopy(self.as_dict()) 103 | del job_dict['createdOn'] 104 | del job_dict['lastModifiedOn'] 105 | return job_dict 106 | 107 | def as_job_update_dict(self): 108 | job_dict = copy.deepcopy(self.as_dict()) 109 | del job_dict['name'] 110 | del job_dict['createdOn'] 111 | del job_dict['lastModifiedOn'] 112 | return job_dict 113 | 114 | 115 | class GlueJobUtils: 116 | def __init__(self, glue_context): 117 | proxy_url = glue_context._jvm.AWSConnectionUtils.getGlueProxyUrl() 118 | glue_endpoint = glue_context._jvm.AWSConnectionUtils.getGlueEndpoint() 119 | region = 
glue_context._jvm.AWSConnectionUtils.getRegion()
120 |         # s3 service calls are not allowed through the proxy for the moment, so we use the s3 vpc endpoint instead
121 |         self.s3 = boto3.resource('s3')
122 |         # Boto does not have an API to set proxy information. It uses environment variables to look up proxy information
123 |         if not proxy_url[8:].startswith('null'):
124 |             os.environ['https_proxy'] = proxy_url
125 |         self.glue = boto3.client('glue', endpoint_url=glue_endpoint, region_name=region)
126 | 
127 | 
128 |     def _glue_job_response_to_job(self, response_job):
129 |         job = Job()
130 |         job.name = response_job['name']
131 | 
132 |         try:
133 |             job.description = response_job['description']
134 |         except KeyError:
135 |             logging.warning('description is missing in job response for job %s' % job.name)
136 | 
137 |         try:
138 |             job.defaultArguments = response_job['defaultArguments']
139 |         except KeyError:
140 |             logging.warning('defaultArguments is missing in job response for job %s' % job.name)
141 | 
142 |         try:
143 |             job.logUri = response_job['logUri']
144 |         except KeyError:
145 |             logging.warning('logUri is missing in job response for job %s' % job.name)
146 | 
147 |         try:
148 |             job.role = response_job['role']
149 |         except KeyError:
150 |             logging.warning('role is missing in job response for job %s' % job.name)
151 | 
152 |         try:
153 |             execution_property_dict = response_job['executionProperty']
154 |             job.executionProperty = ExecutionProperty(execution_property_dict['maxConcurrentRuns'])
155 |         except KeyError:
156 |             logging.warning('executionProperty is missing in job response for job %s' % job.name)
157 | 
158 |         try:
159 |             command_dict = response_job['command']
160 |             job.command = Command(command_dict['name'], command_dict['scriptLocation'])
161 |         except KeyError:
162 |             logging.warning('command is missing in job response for job %s' % job.name)
163 | 
164 |         try:
165 |             connections_dict = response_job['connections']
166 |             job.connections = Connections(connections_dict['connections'])
167 |         except KeyError:
168 |             logging.warning('connections is missing in job response for job %s' % job.name)
169 | 
170 |         try:
171 |             job.maxRetries = response_job['maxRetries']
172 |         except KeyError:
173 |             logging.warning('maxRetries is missing in job response for job %s' % job.name)
174 | 
175 |         try:
176 |             job.createdOn = response_job['createdOn']
177 |         except KeyError:
178 |             logging.warning('createdOn is missing in job response for job %s' % job.name)
179 | 
180 |         try:
181 |             job.lastModifiedOn = response_job['lastModifiedOn']
182 |         except KeyError:
183 |             logging.warning('lastModifiedOn is missing in job response for job %s' % job.name)
184 | 
185 |         return job
186 | 
187 |     def get_jobs(self, nextToken=''):
188 |         response = self.glue.get_jobs(nextToken=nextToken)
189 |         list_jobs_response = {}
190 |         try:
191 |             list_jobs_response['NextToken'] = response['NextToken']
192 |         except KeyError:
193 |             logging.info('NextToken is not present in get_jobs response')
194 |         list_jobs_response['jobs'] = [self._glue_job_response_to_job(j) for j in response['jobs']]
195 |         return list_jobs_response
196 | 
197 |     def get_job(self, jobName):
198 |         response = self.glue.get_job(jobName=jobName)
199 |         return self._glue_job_response_to_job(response['job'])
200 | 
201 |     def _get_bucket_prefix_from_s3_url(self, s3_url):
202 |         if not s3_url.startswith('s3://'):
203 |             raise Exception('s3 url for scriptLocation should start with s3:// but given %s' % s3_url)
204 |         url_parts = s3_url[5:].split('/', 1)
205 |         if not len(url_parts) == 2:
206 |             raise Exception('s3 url for scriptLocation does not include a prefix: %s' % s3_url)
207 |         if url_parts[1].endswith('/'):
208 |             raise Exception('s3 url for scriptLocation should not end with "/": %s' % s3_url)
209 |         return {'bucket': url_parts[0], 'prefix': url_parts[1]}
210 | 
211 |     def _upload_file_to_s3(self, s3_url, file):
212 |         if len(file) == 0:
213 |             logging.warning('script file is not specified, skipping upload of script to s3')
214 |         else:
215 |             s3_parts = self._get_bucket_prefix_from_s3_url(s3_url)
216 |             self.s3.Object(s3_parts['bucket'], s3_parts['prefix']).put(Body=open(file, 'rb'))
217 | 
218 |     def create_job(self, job, file=''):
219 |         try:
220 |             self._upload_file_to_s3(job.command.scriptLocation, file)
221 |             return self.glue.create_job(**job.as_job_create_dict())
222 |         except Exception as inst:
223 |             print(inst)
224 |             logging.error('Failed to create job')
225 | 
226 |     def update_job(self, job, file=''):
227 |         try:
228 |             self._upload_file_to_s3(job.command.scriptLocation, file)
229 |             return self.glue.update_job(jobName=job.name, jobUpdate=job.as_job_update_dict())
230 |         except Exception as inst:
231 |             print(inst)
232 |             logging.error('Failed to update job')
233 | 
234 |     def delete_job(self, jobName):
235 |         return self.glue.delete_job(jobName=jobName)
236 | 

--------------------------------------------------------------------------------
/awsglue/dynamicframe.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # Licensed under the Amazon Software License (the "License"). You may not use
3 | # this file except in compliance with the License. A copy of the License is
4 | # located at
5 | # 
6 | # http://aws.amazon.com/asl/
7 | # 
8 | # or in the "license" file accompanying this file. This file is distributed
9 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
10 | # or implied. See the License for the specific language governing
11 | # permissions and limitations under the License.
12 | 
13 | from __future__ import print_function
14 | import json
15 | import sys
16 | from awsglue.utils import makeOptions, callsite
17 | from awsglue.gluetypes import _deserialize_json_string, _create_dynamic_record, _revert_to_dict, _serialize_schema
18 | from awsglue.utils import _call_site, _as_java_list, _as_scala_option, _as_resolve_choiceOption, iteritems, itervalues
19 | from pyspark.rdd import RDD, PipelinedRDD
20 | from pyspark.sql.dataframe import DataFrame
21 | from pyspark.serializers import PickleSerializer, BatchedSerializer
22 | 
23 | if sys.version >= "3":
24 |     long = int
25 |     basestring = unicode = str
26 |     imap=map
27 |     ifilter=filter
28 | else:
29 |     from itertools import imap, ifilter
30 | 
31 | class ResolveOption(object):
32 |     """
33 |     ResolveOption is used to resolve ChoiceType while converting a DynamicFrame to a DataFrame.
34 |     option.action includes "Project", "KeepAsStruct" and "Cast".
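    Example (mirroring the usage shown in DynamicFrame.toDF below; DoubleType
    comes from pyspark.sql.types):
        ResolveOption("a.b.c", "Project", DoubleType())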
35 | """ 36 | def __init__(self, path, action, target=None): 37 | """ 38 | :param path: string, path name to ChoiceType 39 | :param action: string, 40 | :param target: spark sql Datatype 41 | """ 42 | self.path = path 43 | self.action = action 44 | self.target = target 45 | 46 | 47 | class DynamicFrame(object): 48 | 49 | def __init__(self, jdf, glue_ctx, name=""): 50 | self._jdf = jdf 51 | self.glue_ctx = glue_ctx 52 | self._ssql_ctx = glue_ctx._ssql_ctx 53 | self._sc = glue_ctx and glue_ctx._sc 54 | self._schema = None 55 | self._lazy_rdd = None 56 | self.name = name 57 | 58 | @property 59 | def _rdd(self): 60 | if self._lazy_rdd is None: 61 | jrdd = self._jdf.javaToPython() 62 | self._lazy_rdd = RDD(jrdd, self._sc, BatchedSerializer(PickleSerializer())) 63 | return self._lazy_rdd 64 | 65 | def with_frame_schema(self, schema): 66 | """ Specify schema so we don't have to compute it """ 67 | return DynamicFrame(self._jdf.pyWithFrameSchema(_serialize_schema(schema)), self.glue_ctx, self.name) 68 | 69 | def schema(self): 70 | if self._schema is None: 71 | try: 72 | self._schema = _deserialize_json_string(self._jdf.schema().toString()) 73 | except AttributeError as e: 74 | raise Exception("Unable to parse datatype from schema. %s" % e) 75 | return self._schema 76 | 77 | def show(self, num_rows=20): 78 | print(self._jdf.showString(num_rows)) 79 | 80 | def filter(self, f, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0): 81 | def wrap_dict_with_dynamic_records(x): 82 | rec = _create_dynamic_record(x["record"]) 83 | try: 84 | return f(rec) 85 | except Exception as E: 86 | if isinstance(E, KeyError) or isinstance(E, ValueError) or isinstance(E, TypeError): 87 | return False 88 | x['isError'] = True 89 | x['errorMessage'] = str(E) 90 | return True 91 | 92 | def func(iterator): 93 | return ifilter(wrap_dict_with_dynamic_records, iterator) 94 | return self.mapPartitions(func, True, transformation_ctx, info, stageThreshold, totalThreshold) 95 | 96 | def mapPartitions(self, f, preservesPartitioning=True, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0): 97 | def func(s, iterator): 98 | return f(iterator) 99 | return self.mapPartitionsWithIndex(func, preservesPartitioning, transformation_ctx, info, stageThreshold, totalThreshold) 100 | 101 | def map(self, f, preservesPartitioning=False,transformation_ctx = "", info="", stageThreshold=0, totalThreshold=0): 102 | def wrap_dict_with_dynamic_records(x): 103 | rec = _create_dynamic_record(x["record"]) 104 | try: 105 | result_record = _revert_to_dict(f(rec)) 106 | if result_record: 107 | x["record"] = result_record 108 | else: 109 | x['isError'] = True 110 | x['errorMessage'] = "User-specified function returned None instead of DynamicRecord" 111 | return x 112 | except Exception as E: 113 | x['isError'] = True 114 | x['errorMessage'] = str(E) 115 | return x 116 | def func(_, iterator): 117 | return imap(wrap_dict_with_dynamic_records, iterator) 118 | return self.mapPartitionsWithIndex(func, preservesPartitioning, transformation_ctx, info, stageThreshold, totalThreshold) 119 | 120 | def mapPartitionsWithIndex(self, f, preservesPartitioning=False, transformation_ctx = "", info = "", stageThreshold = 0,totalThreshold = 0): 121 | return DynamicFrame(self.glue_ctx._jvm.DynamicFrame.fromPythonRDD(self._jdf, 122 | PipelinedRDD(self._rdd, f, preservesPartitioning)._jrdd, self.glue_ctx._ssql_ctx, transformation_ctx, self.name, 123 | _call_site(self._sc, callsite(), info), long(stageThreshold), 124 | long(totalThreshold)), 
self.glue_ctx, self.name)
125 | 
126 |     def printSchema(self):
127 |         print(self._jdf.schema().treeString())
128 | 
129 |     def toDF(self, options = None):
130 |         """
131 |         Please also specify the target type if you choose the Project or Cast action.
132 | 
133 |         :param options: Must be a list of options
134 | 
135 |         >>>toDF([ResolveOption("a.b.c", "KeepAsStruct")])
136 |         >>>toDF([ResolveOption("a.b.c", "Project", DoubleType())])
137 |         """
138 |         if options is None: options = []
139 |         scala_options = []
140 | 
141 |         for option in options:
142 |             if option.action != "KeepAsStruct" and option.target is None:
143 |                 raise Exception("Missing target type for resolve action %s." % option.action)
144 | 
145 |             scala_options.append(self.glue_ctx.convert_resolve_option(option.path, option.action, option.target))
146 | 
147 |         return DataFrame(self._jdf.toDF(self.glue_ctx._jvm.PythonUtils.toSeq(scala_options)), self.glue_ctx)
148 | 
149 |     @classmethod
150 |     def fromDF(cls, dataframe, glue_ctx, name):
151 |         """
152 |         Convert a DataFrame to a DynamicFrame by converting Rows to DynamicRecords.
153 |         :param dataframe: A spark sql DataFrame
154 |         :param glue_ctx: the GlueContext object
155 |         :param name: name of the result DynamicFrame
156 |         :return: DynamicFrame
157 |         """
158 |         return DynamicFrame(glue_ctx._jvm.DynamicFrame.apply(dataframe._jdf, glue_ctx._ssql_ctx),
159 |                             glue_ctx, name)
160 | 
161 | 
162 |     def unbox(self, path, format, transformation_ctx="", info = "", stageThreshold = 0, totalThreshold = 0, **options):
163 |         """
164 |         unbox a string field
165 | 
166 |         :param path: full path to the StringNode you want to unbox
167 |         :param format: "avro" or "json"
168 |         :param info: String, any string to be associated with errors in this transformation.
169 |         :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out.
170 |         :param totalThreshold: Long, total number of errors up to and including this transformation
171 |                for which the processing needs to error out.
172 |         :param options:
173 |             separator: String,
174 |             escaper: String,
175 |             skipFirst: Boolean,
176 |             withSchema: String, the schema string should be generated using StructType.json()
177 |             withHeader: Boolean
178 |         :return: a new DynamicFrame with unboxed DynamicRecords
179 | 
180 |         >>>unbox("a.b.c", "csv", separator="|")
181 |         """
182 |         return DynamicFrame(self._jdf.unbox(path, format, json.dumps(options), transformation_ctx,
183 |                                             _call_site(self._sc, callsite(), info), long(stageThreshold),
184 |                                             long(totalThreshold)),
185 |                             self.glue_ctx, self.name)
186 | 
187 |     def drop_fields(self, paths, transformation_ctx = "", info = "", stageThreshold = 0, totalThreshold = 0):
188 |         """
189 |         :param paths: List of strings, each the full path to a node you want to drop
190 |         :param info: String, any string to be associated with errors in this transformation.
191 |         :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out.
192 |         :param totalThreshold: Long, total number of errors up to and including this transformation
193 |                for which the processing needs to error out.
194 |         :return: DynamicFrame
195 |         """
196 |         if isinstance(paths, basestring):
197 |             paths = [paths]
198 | 
199 |         return DynamicFrame(self._jdf.dropFields(self.glue_ctx._jvm.PythonUtils.toSeq(paths), transformation_ctx,
200 |                                                  _call_site(self._sc, callsite(), info), long(stageThreshold), long(totalThreshold)),
201 |                             self.glue_ctx, self.name)
202 | 
203 |     def select_fields(self, paths, transformation_ctx = "", info = "", stageThreshold = 0, totalThreshold = 0):
204 |         """
205 |         :param paths: List of strings, each the full path to a node you want to get
206 |         :param info: String, any string to be associated with errors in this transformation.
207 |         :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out.
208 |         :param totalThreshold: Long, total number of errors up to and including this transformation
209 |                for which the processing needs to error out.
210 |         :return: DynamicFrame
211 |         """
212 |         if isinstance(paths, basestring):
213 |             paths = [paths]
214 | 
215 |         return DynamicFrame(self._jdf.selectFields(self.glue_ctx._jvm.PythonUtils.toSeq(paths), transformation_ctx,
216 |                                                    _call_site(self._sc, callsite(), info), long(stageThreshold), long(totalThreshold)),
217 |                             self.glue_ctx, self.name)
218 | 
219 |     def split_fields(self, paths, name1, name2, transformation_ctx = "", info = "", stageThreshold = 0, totalThreshold = 0):
220 |         """
221 |         :param paths: List of strings, each the full path to a node you want to split into a new DynamicFrame
222 |         :param name1: name for the dynamic frame to be split off
223 |         :param name2: name for the dynamic frame that remains in the original
224 |         :param info: String, any string to be associated with errors in this transformation.
225 |         :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out.
226 |         :param totalThreshold: Long, total number of errors up to and including this transformation
227 |                for which the processing needs to error out.
228 |         :return: DynamicFrameCollection with two DynamicFrames, the first containing all the nodes that you have split off,
229 |                  the second containing the nodes remaining in the original.
230 |         """
231 |         if isinstance(paths, basestring):
232 |             paths = [paths]
233 | 
234 |         jdfs = _as_java_list(self._sc, self._jdf.splitFields(self.glue_ctx._jvm.PythonUtils.toSeq(paths), transformation_ctx,
235 |                                                              _call_site(self._sc, callsite(), info), long(stageThreshold), long(totalThreshold)))
236 |         return DynamicFrameCollection({name1 : DynamicFrame(jdfs[0], self.glue_ctx, name1), name2 : DynamicFrame(jdfs[1], self.glue_ctx, name2)}, self.glue_ctx)
237 | 
238 |     def split_rows(self, comparison_dict, name1, name2, transformation_ctx = "", info= "", stageThreshold = 0, totalThreshold = 0):
239 |         """
240 |         :param comparison_dict: a dictionary where the key is the path to a column and the value is another
241 |                dictionary mapping comparators to the value to which the column will be compared,
242 |                e.g. {"age": {">": 10, "<": 20}} will give back rows where age is between 10 and 20 (exclusive), split from those
243 |                that do not meet this criterion.
244 |         :param name1: name for the dynamic frame to be split off
245 |         :param name2: name for the dynamic frame that remains in the original
246 |         :param info: String, any string to be associated with errors in this transformation.
247 |         :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out.
248 |         :param totalThreshold: Long, total number of errors up to and including this transformation
249 |                for which the processing needs to error out.
250 |         :return: DynamicFrameCollection with two DynamicFrames, the first containing all the nodes that you have split off,
251 |                  the second containing the nodes remaining in the original.
252 |         """
253 |         paths, values, operators = [], [], []
254 | 
255 |         for key, value in comparison_dict.items():
256 |             paths.extend([key] * len(value))
257 |             for k, v in value.items():
258 |                 operators.append(k)
259 |                 if isinstance(v, int):
260 |                     values.append(long(v))
261 |                 else:
262 |                     values.append(v)
263 | 
264 |         jdfs = _as_java_list(self._sc, self._jdf.splitRows(self.glue_ctx._jvm.PythonUtils.toSeq(paths),
265 |                                                            self.glue_ctx._jvm.PythonUtils.toSeq(values),
266 |                                                            self.glue_ctx._jvm.PythonUtils.toSeq(operators),
267 |                                                            transformation_ctx, _call_site(self._sc, callsite(), info),
268 |                                                            long(stageThreshold), long(totalThreshold)))
269 |         return DynamicFrameCollection({name1 : DynamicFrame(jdfs[0], self.glue_ctx, name1), name2 : DynamicFrame(jdfs[1], self.glue_ctx, name2)}, self.glue_ctx)
270 | 
271 |     def rename_field(self, oldName, newName, transformation_ctx = "", info = "", stageThreshold = 0, totalThreshold = 0):
272 |         """
273 |         :param oldName: String, full path to the node you want to rename
274 |         :param newName: String, new name including full path
275 |         :param info: String, any string to be associated with errors in this transformation.
276 |         :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out.
277 |         :param totalThreshold: Long, total number of errors up to and including this transformation
278 |                for which the processing needs to error out.
279 |         :return: DynamicFrame
280 |         """
281 |         return DynamicFrame(self._jdf.renameField(oldName, newName, transformation_ctx, _call_site(self._sc, callsite(), info),
282 |                                                   long(stageThreshold), long(totalThreshold)), self.glue_ctx, self.name)
283 | 
284 |     def write(self, connection_type, connection_options={},
285 |               format=None, format_options={}, accumulator_size = 0):
286 |         return self.glue_ctx.write_from_options(frame_or_dfc=self,
287 |                                                 connection_type=connection_type,
288 |                                                 connection_options=connection_options,
289 |                                                 format=format,
290 |                                                 format_options=format_options,
291 |                                                 accumulator_size=accumulator_size)
292 | 
293 |     def count(self):
294 |         return self._jdf.count()
295 | 
296 |     def spigot(self, path, options={}, transformation_ctx = "", info = "", stageThreshold = 0, totalThreshold = 0):
297 |         return DynamicFrame(self._jdf.spigot(path, makeOptions(self._sc, options), transformation_ctx,
298 |                                              _call_site(self._sc, callsite(), info), long(stageThreshold),
299 |                                              long(totalThreshold)),
300 |                             self.glue_ctx, self.name)
301 | 
302 |     def join(self, paths1, paths2, frame2, transformation_ctx = "", info = "", stageThreshold = 0, totalThreshold = 0):
303 |         if isinstance(paths1, basestring):
304 |             paths1 = [paths1]
305 |         if isinstance(paths2, basestring):
306 |             paths2 = [paths2]
307 | 
308 |         return DynamicFrame(self._jdf.join(self.glue_ctx._jvm.PythonUtils.toSeq(paths1), self.glue_ctx._jvm.PythonUtils.toSeq(paths2), frame2._jdf, transformation_ctx, _call_site(self._sc, callsite(), info), long(stageThreshold), long(totalThreshold)), self.glue_ctx, self.name + frame2.name)
309 | 
310 |     def unnest(self, transformation_ctx = "", info = "", stageThreshold = 0, totalThreshold = 0):
311 |         """
312 |         unnest a dynamic frame, i.e. flatten nested objects to top-level elements.
313 |         It also generates joinkeys for array objects.
314 |         :param info: String, any string to be associated with errors in this transformation.
315 |         :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out.
316 |         :param totalThreshold: Long, total number of errors up to and including this transformation
317 |                for which the processing needs to error out.
318 |         :return: a new unnested dynamic frame
319 | 
320 |         >>>unnest()
321 |         """
322 |         return DynamicFrame(self._jdf.unnest(transformation_ctx, _call_site(self._sc, callsite(), info), long(stageThreshold), long(totalThreshold)), self.glue_ctx, self.name)
323 | 
324 |     def relationalize(self, root_table_name, staging_path, options={}, transformation_ctx = "", info = "", stageThreshold = 0, totalThreshold = 0):
325 |         """
326 |         Relationalizes a dynamic frame, i.e. produces a list of frames that are
327 |         generated by unnesting nested columns and pivoting array columns. The
328 |         pivoted array column can be joined to the root table using the joinkey
329 |         generated in the unnest phase.
330 |         :param root_table_name: name for the root table
331 |         :param staging_path: path to store partitions of pivoted tables in csv format. Pivoted tables are read back from
332 |                this path
333 |         :param options: dict of optional parameters for relationalize
334 |         :param transformation_ctx: context key to retrieve metadata about the current transformation
335 |         :param info: String, any string to be associated with errors in this transformation.
336 |         :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out.
337 |         :param totalThreshold: Long, total number of errors up to and including this transformation
338 |                for which the processing needs to error out.
339 |         :return: DynamicFrameCollection
340 |         """
341 |         _rFrames = _as_java_list(self._sc, self._jdf.relationalize(root_table_name, staging_path,
342 |                                                                    makeOptions(self._sc, options),
343 |                                                                    transformation_ctx, _call_site(self._sc, callsite(), info),
344 |                                                                    long(stageThreshold), long(totalThreshold)))
345 |         return DynamicFrameCollection(dict((df.getName(), DynamicFrame(df, self.glue_ctx, df.getName())) for df in _rFrames), self.glue_ctx)
346 | 
347 |     def applyMapping(self, *args, **kwargs):
348 |         # In a previous version we passed args[1:] and in our tests we passed
349 |         # the DynamicFrame as the first argument. This checks for that case
350 |         # to avoid regressions.
351 |         if len(args) > 0 and isinstance(args[0], DynamicFrame):
352 |             return self.apply_mapping(*(args[1:]), **kwargs)
353 |         else:
354 |             return self.apply_mapping(*args, **kwargs)
355 | 
356 |     def apply_mapping(self, mappings, case_sensitive = False, transformation_ctx = "", info = "", stageThreshold = 0, totalThreshold = 0):
357 |         def _to_java_mapping(mapping_tup):
358 |             if not isinstance(mapping_tup, tuple):
359 |                 raise TypeError("Mapping must be specified as a tuple. Got " +
360 |                                     str(mapping_tup))
361 | 
362 |             tup2 = self.glue_ctx._jvm.scala.Tuple2
363 |             tup3 = self.glue_ctx._jvm.scala.Tuple3
364 |             tup4 = self.glue_ctx._jvm.scala.Tuple4
365 |             java_cls = self.glue_ctx._jvm.MappingSpec
366 | 
367 |             if len(mapping_tup) == 2:
368 |                 return java_cls.apply(tup2.apply(mapping_tup[0], mapping_tup[1]))
369 |             elif len(mapping_tup) == 3:
370 |                 return java_cls.apply(tup3.apply(mapping_tup[0], mapping_tup[1], mapping_tup[2]))
371 |             elif len(mapping_tup) == 4:
372 |                 return java_cls.apply(tup4.apply(mapping_tup[0], mapping_tup[1], mapping_tup[2], mapping_tup[3]))
373 |             else:
374 |                 raise ValueError("Mapping tuple must be of length 2, 3, or 4. "
375 |                                  "Got tuple of length " + str(len(mapping_tup)))
376 | 
377 |         if isinstance(mappings, tuple):
378 |             mappings = [mappings]
379 | 
380 |         mappings_list = [ _to_java_mapping(m) for m in mappings ]
381 | 
382 |         new_jdf = self._jdf.applyMapping(
383 |             self.glue_ctx._jvm.PythonUtils.toSeq(mappings_list),
384 |             case_sensitive,
385 |             transformation_ctx,
386 |             _call_site(self._sc, callsite(), info), long(stageThreshold), long(totalThreshold))
387 | 
388 |         return DynamicFrame(new_jdf, self.glue_ctx, self.name)
389 | 
390 |     def unnest_ddb_json(self, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
391 |         new_jdf = self._jdf.unnestDDBJson(transformation_ctx, _call_site(self._sc, callsite(), info), long(stageThreshold), long(totalThreshold))
392 |         return DynamicFrame(new_jdf, self.glue_ctx, self.name)
393 | 
394 |     def resolveChoice(self, specs=None, choice="", database=None, table_name=None,
395 |                       transformation_ctx="", info="", stageThreshold=0, totalThreshold=0, catalog_id=None):
396 |         """
397 |         :param specs: specification for choice type and corresponding resolve action;
398 |                if specs is empty, the backend makes one pass over the data to infer
399 |                the schema and then resolves choices based on that schema.
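        Example spec (illustrative; "myColumn" is a placeholder column name):
            resolveChoice(specs=[("myColumn", "cast:long")])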
394 | def resolveChoice(self, specs=None, choice="", database=None, table_name=None, 395 | transformation_ctx="", info="", stageThreshold=0, totalThreshold=0, catalog_id=None): 396 | """ 397 | :param specs: specification for choice type and corresponding resolve action; 398 | if specs is empty, the Tape backend makes one pass over the data 399 | to infer the schema and then resolves choices based on it. 400 | :param choice: default resolve action applied when a choice type path is missing from specs 401 | :param database: Glue catalog database name, required for MATCH_CATALOG choice 402 | :param table_name: Glue catalog table name, required for MATCH_CATALOG choice 403 | :return: a new DynamicFrame 404 | """ 405 | def _to_java_specs(specs_tup): 406 | path, action = specs_tup 407 | return self.glue_ctx._jvm.ResolveSpec.apply(path, action) 408 | 409 | if specs is None and not choice: 410 | raise Exception("Parameter specs and choice are both missing, specify one.") 411 | 412 | if specs is not None and choice: 413 | raise Exception("Parameter specs and choice are both specified, choose one.") 414 | 415 | if specs is None: 416 | specs = [] 417 | 418 | if isinstance(specs, tuple): 419 | specs = [specs] 420 | 421 | specs_list = [ _to_java_specs(m) for m in specs ] 422 | 423 | choice_option = _as_scala_option(self._sc, _as_resolve_choiceOption(self._sc, choice)) 424 | database_option = _as_scala_option(self._sc, database) 425 | table_name_option = _as_scala_option(self._sc, table_name) 426 | 427 | new_jdf = self._jdf.resolveChoice( 428 | self.glue_ctx._jvm.PythonUtils.toSeq(specs_list), 429 | choice_option, database_option, table_name_option, 430 | transformation_ctx, 431 | _call_site(self._sc, callsite(), info), long(stageThreshold), long(totalThreshold), 432 | _as_scala_option(self._sc, catalog_id)) 433 | 434 | return DynamicFrame(new_jdf, self.glue_ctx, self.name) 435 | 436 | def mergeDynamicFrame(self, stage_dynamic_frame, primary_keys, transformation_ctx = "", options = {}, info = "", stageThreshold = 0, totalThreshold = 0): 437 | """ 438 | Merge this DynamicFrame with a staging DynamicFrame based on the provided primary keys to identify records. 439 | Duplicate records (records with the same primary keys) are not de-duplicated. All records (including duplicates) are 440 | retained from the source if there is no matching record in the staging frame. If the staging frame has matching records, 441 | the records from the staging frame overwrite the records in the source. 442 | :param stage_dynamic_frame: Staging DynamicFrame 443 | :param primary_keys: List of primary key fields to match records from the source and staging dynamic frames 444 | :param transformation_ctx: context key to retrieve metadata about the current transformation 445 | :param options: optional options for the transformation 446 | :param info: String, any string to be associated with errors in this transformation. 447 | :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out. 448 | :param totalThreshold: Long, total number of errors up to and including this transformation 449 | for which the processing needs to error out. 450 | :return: DynamicFrame 451 | """ 452 | if isinstance(primary_keys, basestring): 453 | primary_keys = [primary_keys] 454 | return DynamicFrame(self._jdf.mergeDynamicFrames(stage_dynamic_frame._jdf, 455 | self.glue_ctx._jvm.PythonUtils.toSeq(primary_keys), 456 | transformation_ctx, 457 | makeOptions(self._sc, options), 458 | _call_site(self._sc, callsite(), info), 459 | long(stageThreshold), 460 | long(totalThreshold)), 461 | self.glue_ctx, self.name) 462 |
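    # A minimal resolveChoice sketch (column names hypothetical). Actions such
    # as "cast:double", "project:string", "make_cols", "make_struct" and
    # "match_catalog" are the documented resolve actions:
    #
    #   resolved = frame.resolveChoice(specs=[("price", "cast:double")])
    #   # or one default action for every choice column (specs and choice are
    #   # mutually exclusive, as enforced above):
    #   resolved = frame.resolveChoice(choice="make_struct")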
463 | def union(self, other_frame, transformation_ctx = "", info = "", stageThreshold = 0, totalThreshold = 0): 464 | """Returns a DynamicFrame containing all records in this frame and all records in other_frame. 465 | :param other_frame: DynamicFrame to union with this one. 466 | :param transformation_ctx: context key to retrieve metadata about the current transformation 467 | :param info: String, any string to be associated with errors in this transformation. 468 | :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out. 469 | :param totalThreshold: Long, total number of errors up to and including this transformation 470 | for which the processing needs to error out. 471 | :return: DynamicFrame 472 | """ 473 | union = self._jdf.union(other_frame._jdf, transformation_ctx, _call_site(self._sc, callsite(), info), 474 | long(stageThreshold), long(totalThreshold)) 475 | return DynamicFrame(union, self.glue_ctx, union.name) 476 | 477 | def getNumPartitions(self): 478 | """Returns the number of partitions in the current DynamicFrame.""" 479 | return self._jdf.getNumPartitions() 480 | 481 | def repartition(self, num_partitions, transformation_ctx = "", info = "", stageThreshold = 0, totalThreshold = 0): 482 | new_jdf = self._jdf.repartition(num_partitions, transformation_ctx, 483 | _call_site(self._sc, callsite(), info), 484 | long(stageThreshold), long(totalThreshold)) 485 | return DynamicFrame(new_jdf, self.glue_ctx, self.name) 486 | 487 | def coalesce(self, num_partitions, shuffle = False, transformation_ctx = "", info = "", stageThreshold = 0, totalThreshold = 0): 488 | new_jdf = self._jdf.coalesce(num_partitions, shuffle, transformation_ctx, 489 | _call_site(self._sc, callsite(), info), 490 | long(stageThreshold), long(totalThreshold)) 491 | return DynamicFrame(new_jdf, self.glue_ctx, self.name) 492 | 493 | def errorsAsDynamicFrame(self): 494 | """ 495 | Returns a DynamicFrame which has the error records nested. 496 | :return: DynamicFrame 497 | """ 498 | return DynamicFrame(self._jdf.errorsAsDynamicFrame(), self.glue_ctx, self.name) 499 | 500 | def errorsCount(self): 501 | """ 502 | Returns the total number of error records in this DynamicFrame. 503 | :return: Long 504 | """ 505 | return self._jdf.errorsCount() 506 | 507 | def stageErrorsCount(self): 508 | """ 509 | Returns the number of errors generated by the transformation that produced this DynamicFrame. 510 | :return: Long 511 | """ 512 | return self._jdf.stageErrorsCount() 513 | 514 | def assertErrorThreshold(self): 515 | """ 516 | Asserts that the errors in the transformations which yielded this DynamicFrame are within the configured thresholds. 517 | :return: None; raises an Exception if a threshold was exceeded 518 | """ 519 | return self._jdf.assertErrorThreshold() 520 | 521 |
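# Sketch: inspecting transformation errors (the thresholds above are absolute
# error counts, not rates):
#
#   bad_records = frame.errorsAsDynamicFrame()   # error records, nested
#   total = frame.errorsCount()                  # errors accumulated so far
#   last = frame.stageErrorsCount()              # errors from the last transform
#   frame.assertErrorThreshold()                 # fails fast if a threshold was exceeded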
522 | class DynamicFrameCollection(object): 523 | 524 | def __init__(self, dynamic_frames, glue_ctx): 525 | """ 526 | :param dynamic_frames: a list or dict of DynamicFrames 527 | """ 528 | self._glue_ctx = glue_ctx 529 | if isinstance(dynamic_frames, list): 530 | self._df_dict = { df.name: df for df in dynamic_frames } 531 | elif isinstance(dynamic_frames, dict): 532 | self._df_dict = dynamic_frames 533 | else: 534 | raise TypeError("dynamic_frames must be list or dict.") 535 | 536 | def __getitem__(self, key): 537 | return self._df_dict[key] 538 | 539 | def __len__(self): 540 | return len(self._df_dict) 541 | 542 | def keys(self): 543 | return self._df_dict.keys() 544 | 545 | def values(self): 546 | return self._df_dict.values() 547 | 548 | def select(self, key, transformation_ctx = ""): 549 | """ 550 | :param key: name of the DynamicFrame to select 551 | :return: a DynamicFrame, empty if the key is not present 552 | """ 553 | if key in self._df_dict: 554 | return self.__getitem__(key) 555 | else: 556 | return DynamicFrame(self._glue_ctx._jvm.DynamicFrame.emptyDynamicFrame(self._glue_ctx._glue_scala_context), self._glue_ctx, key) 557 | 558 | def map(self, callable, transformation_ctx = ""): 559 | """ 560 | :param callable: a callable applied to every DynamicFrame in the collection 561 | :return: a DynamicFrameCollection 562 | """ 563 | new_dict = {} 564 | for k,v in iteritems(self._df_dict): 565 | res = callable(v, transformation_ctx+':'+k) 566 | if not isinstance(res, DynamicFrame): 567 | raise TypeError("callable must return a DynamicFrame. "\ 568 | "Got {}".format(str(type(res)))) 569 | new_dict[k] = res 570 | 571 | return DynamicFrameCollection(new_dict, self._glue_ctx) 572 | 573 | def flatmap(self, f, transformation_ctx = ""): 574 | """ 575 | :param f: A function that takes a DynamicFrame and returns a 576 | DynamicFrame or a DynamicFrameCollection. 577 | :return: A DynamicFrameCollection 578 | """ 579 | new_dict = {} 580 | 581 | for frame in itervalues(self._df_dict): 582 | res = f(frame, transformation_ctx+':'+frame.name) 583 | 584 | if isinstance(res, DynamicFrame): 585 | new_dict[res.name] = res 586 | elif isinstance(res, DynamicFrameCollection): 587 | new_dict.update(res) 588 | else: 589 | raise TypeError("Function argument to flatmap must return "\ 590 | "DynamicFrame or DynamicFrameCollection."\ 591 | " Got {}".format(str(type(res)))) 592 | 593 | return DynamicFrameCollection(new_dict, self._glue_ctx) 594 | 595 | 596 | class DynamicFrameReader(object): 597 | def __init__(self, glue_context): 598 | self._glue_context = glue_context 599 | 600 | def from_rdd(self, data, name, schema=None, sampleRatio=None): 601 | """Creates a DynamicFrame from an RDD. 602 | """ 603 | return self._glue_context.create_dynamic_frame_from_rdd(data, name, schema, sampleRatio) 604 | 605 | def from_options(self, connection_type, connection_options={}, 606 | format=None, format_options={}, transformation_ctx="", push_down_predicate = "", **kwargs): 607 | """Creates a DynamicFrame with the specified connection and format. 608 | """ 609 | return self._glue_context.create_dynamic_frame_from_options(connection_type, 610 | connection_options, 611 | format, 612 | format_options, transformation_ctx, push_down_predicate, **kwargs) 613 | 614 | def from_catalog(self, database = None, table_name = None, redshift_tmp_dir = "", transformation_ctx = "", push_down_predicate = "", additional_options = {}, catalog_id = None, **kwargs): 615 | """Creates a DynamicFrame with the specified catalog namespace and table name. 616 | """ 617 | if database is not None and "name_space" in kwargs: 618 | raise Exception("Parameter name_space and database are both specified, choose one.") 619 | elif database is None and "name_space" not in kwargs: 620 | raise Exception("Parameter name_space or database is missing.") 621 | elif "name_space" in kwargs: 622 | db = kwargs.pop("name_space") 623 | else: 624 | db = database 625 | 626 | if table_name is None: 627 | raise Exception("Parameter table_name is missing.") 628 | 629 | return self._glue_context.create_dynamic_frame_from_catalog(db, table_name, redshift_tmp_dir, transformation_ctx, push_down_predicate, additional_options, catalog_id, **kwargs) 630 | 631 |
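# Usage sketch (database/table names hypothetical). A DynamicFrameReader is
# normally reached through the GlueContext attribute create_dynamic_frame:
#
#   frame = glueContext.create_dynamic_frame.from_catalog(
#       database="sales_db", table_name="orders",
#       push_down_predicate="region == 'us-east-1'")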
639 | """ 640 | return self._glue_context.write_dynamic_frame_from_options(frame, 641 | connection_type, 642 | connection_options, 643 | format, 644 | format_options, transformation_ctx) 645 | 646 | def from_catalog(self, frame, database = None, table_name = None, redshift_tmp_dir = "", transformation_ctx = "", additional_options = {}, catalog_id = None, **kwargs): 647 | """Creates a DynamicFrame with the specified catalog name space and table name. 648 | """ 649 | if database is not None and "name_space" in kwargs: 650 | raise Exception("Parameter name_space and database are both specified, choose one.") 651 | elif database is None and "name_space" not in kwargs: 652 | raise Exception("Parameter name_space or database is missing.") 653 | elif "name_space" in kwargs: 654 | db = kwargs.pop("name_space") 655 | else: 656 | db = database 657 | 658 | if table_name is None: 659 | raise Exception("Parameter table_name is missing.") 660 | 661 | return self._glue_context.write_dynamic_frame_from_catalog(frame, db, table_name, redshift_tmp_dir, transformation_ctx, additional_options, catalog_id) 662 | 663 | def from_jdbc_conf(self, frame, catalog_connection, connection_options={}, redshift_tmp_dir = "", transformation_ctx=""): 664 | """Creates a DynamicFrame with the specified JDBC connection information. 665 | """ 666 | return self._glue_context.write_dynamic_frame_from_jdbc_conf(frame, 667 | catalog_connection, 668 | connection_options, 669 | redshift_tmp_dir, transformation_ctx) 670 | -------------------------------------------------------------------------------- /awsglue/functions.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # Licensed under the Amazon Software License (the "License"). You may not use 3 | # this file except in compliance with the License. A copy of the License is 4 | # located at 5 | # 6 | # http://aws.amazon.com/asl/ 7 | # 8 | # or in the "license" file accompanying this file. This file is distributed 9 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express 10 | # or implied. See the License for the specific language governing 11 | # permissions and limitations under the License. 12 | 13 | from pyspark import SparkContext 14 | from pyspark.sql.column import Column, _to_java_column, _to_seq 15 | 16 | def replaceArrayElement(srcCol, replaceCol, idx): 17 | sc = SparkContext._active_spark_context 18 | jsrcCol, jreplaceCol = _to_java_column(srcCol), _to_java_column(replaceCol) 19 | return Column(sc._jvm.gluefunctions.replaceArrayElement(jsrcCol, jreplaceCol, idx)) 20 | 21 | def namedStruct(*cols): 22 | sc = SparkContext._active_spark_context 23 | if len(cols) == 1 and isinstance(cols[0], (list, set)): 24 | cols = cols[0] 25 | jc = sc._jvm.gluefunctions.namedStruct(_to_seq(sc, cols, _to_java_column)) 26 | return Column(jc) 27 | 28 | def explodeWithIndex(col): 29 | sc = SparkContext._active_spark_context 30 | jc = sc._jvm.gluefunctions.explodeWithIndex(_to_java_column(col)) 31 | return Column(jc).alias('index', 'val') -------------------------------------------------------------------------------- /awsglue/glue_shell.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # Licensed under the Amazon Software License (the "License"). You may not use 3 | # this file except in compliance with the License. 
A copy of the License is 4 | # located at 5 | # 6 | # http://aws.amazon.com/asl/ 7 | # 8 | # or in the "license" file accompanying this file. This file is distributed 9 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express 10 | # or implied. See the License for the specific language governing 11 | # permissions and limitations under the License. 12 | 13 | from __future__ import print_function 14 | import platform 15 | import pyspark 16 | from pyspark.context import SparkContext 17 | from pyspark.sql import SQLContext 18 | from awsglue.context import GlueContext 19 | 20 | sc = SparkContext() 21 | # Change to GlueContext 22 | # TODO: Figure out if/how to use HiveContext 23 | glueContext = GlueContext(sc) 24 | 25 | welcome_msg = """Welcome to 26 | ___ _ _______ ________ 27 | / | | / / ___/ / ____/ /_ _____ 28 | / /| | | /| / /\\__ \\ / / __/ / / / / _ \\ 29 | / ___ | |/ |/ /___/ / / /_/ / / /_/ / __/ 30 | /_/ |_|__/|__//____/ \____/_/\____/\___/ 31 | """ 32 | 33 | print(welcome_msg) 34 | print("Using Python version %s (%s, %s)" % ( 35 | platform.python_version(), 36 | platform.python_build()[0], 37 | platform.python_build()[1])) 38 | print("GlueContext available as glueContext.") 39 | -------------------------------------------------------------------------------- /awsglue/gluetypes.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # Licensed under the Amazon Software License (the "License"). You may not use 3 | # this file except in compliance with the License. A copy of the License is 4 | # located at 5 | # 6 | # http://aws.amazon.com/asl/ 7 | # 8 | # or in the "license" file accompanying this file. This file is distributed 9 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express 10 | # or implied. See the License for the specific language governing 11 | # permissions and limitations under the License. 12 | 13 | import json 14 | import sys 15 | from awsglue.utils import iteritems 16 | 17 | 18 | if sys.version >= "3": 19 | basestring = unicode = str 20 | 21 | 22 | class DataType(object): 23 | def __init__(self, properties={}): 24 | self.properties = properties 25 | 26 | def __eq__(self, other): 27 | return (isinstance(other, self.__class__) and 28 | self.__dict__ == other.__dict__) 29 | 30 | def __hash__(self): 31 | return hash(str(self.__class__)) 32 | 33 | @classmethod 34 | def typeName(cls): 35 | return cls.__name__[:-4].lower() 36 | 37 | def jsonValue(self): 38 | return {"dataType": self.typeName(), "properties": self.properties} 39 | 40 | 41 | 42 | # --------------------------------------------------------------------------- 43 | # Atomic types 44 | # --------------------------------------------------------------------------- 45 | 46 | # Note we can't use singletons like Spark does because DataType instances can 47 | # have properties. 
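# For example:
#
#   IntegerType() == IntegerType()                   # True: same class, same properties
#   IntegerType({"debug": "yes"}) == IntegerType()   # False: properties differ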
48 | 49 | 50 | class AtomicType(DataType): 51 | def __repr__(self): 52 | return "{}({})".format(self.__class__.__name__, self.properties) 53 | 54 | @classmethod 55 | def fromJsonValue(cls, json_value): 56 | return cls(**{k: v for k, v in iteritems(json_value) 57 | if k != "dataType"}) 58 | 59 | 60 | class BinaryType(AtomicType): 61 | pass 62 | 63 | 64 | class BooleanType(AtomicType): 65 | pass 66 | 67 | 68 | class ByteType(AtomicType): 69 | pass 70 | 71 | 72 | class DateType(AtomicType): 73 | pass 74 | 75 | 76 | class DecimalType(AtomicType): 77 | def __init__(self, precision=10, scale=2, properties={}): 78 | super(DecimalType, self).__init__(properties) 79 | self.precision = precision 80 | self.scale = scale 81 | 82 | def __repr__(self): 83 | return "DecimalType({}, {}, {})".format(self.precision, 84 | self.scale, 85 | self.properties) 86 | 87 | def jsonValue(self): 88 | return dict(list(super(DecimalType, self).jsonValue().items()) + 89 | [('precision', self.precision), ('scale', self.scale)]) 90 | 91 | 92 | class DoubleType(AtomicType): 93 | pass 94 | 95 | 96 | class EnumType(AtomicType): 97 | def __init__(self, options, properties={}): 98 | super(EnumType, self).__init__(properties) 99 | 100 | self.options = options 101 | 102 | def __repr__(self): 103 | options_str = ",".join(self.options[0:3]) 104 | if len(self.options) > 3: 105 | options_str = options_str + ",..." 106 | return "EnumType([{}], {})".format(options_str, self.properties) 107 | 108 | def jsonValue(self): 109 | return dict(list(super(EnumType, self).jsonValue().items()) + 110 | [('options', list(self.options))]) 111 | 112 | 113 | class FloatType(AtomicType): 114 | pass 115 | 116 | 117 | class IntegerType(AtomicType): 118 | @classmethod 119 | def typeName(cls): 120 | return "int" 121 | 122 | 123 | class LongType(AtomicType): 124 | pass 125 | 126 | 127 | class NullType(AtomicType): 128 | pass 129 | 130 | 131 | class ShortType(AtomicType): 132 | pass 133 | 134 | 135 | class StringType(AtomicType): 136 | pass 137 | 138 | 139 | class TimestampType(AtomicType): 140 | pass 141 | 142 | 143 | class UnknownType(AtomicType): 144 | pass 145 | 146 | 147 | # --------------------------------------------------------------------------- 148 | # Collection types 149 | # --------------------------------------------------------------------------- 150 | 151 | class ArrayType(DataType): 152 | 153 | def __init__(self, elementType=UnknownType(), properties={}): 154 | assert isinstance(elementType, DataType),\ 155 | "elementType should be DataType. Got " + str(elementType.__class__) 156 | super(ArrayType, self).__init__(properties) 157 | self.elementType = elementType 158 | 159 | def __repr__(self): 160 | return "ArrayType({}, {})".format(self.elementType, self.properties) 161 | 162 | def jsonValue(self): 163 | return dict(list(super(ArrayType, self).jsonValue().items()) + 164 | [("elementType", self.elementType.jsonValue())]) 165 | 166 | @classmethod 167 | def fromJsonValue(cls, json_value): 168 | element_type = _deserialize_json_value(json_value["elementType"]) 169 | return cls(elementType=element_type, 170 | properties=json_value.get('properties', {})) 171 |
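# Round-trip sketch: any type can be serialized with jsonValue() and rebuilt
# with fromJsonValue(), e.g.
#
#   at = ArrayType(DecimalType(12, 4))
#   ArrayType.fromJsonValue(at.jsonValue()) == at   # True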
Got" + str(elementType.__class__) 177 | super(SetType, self).__init__(properties) 178 | self.elementType = elementType 179 | 180 | def __repr__(self): 181 | return "SetType({}, {})".format(self.elementType, self.properties) 182 | 183 | def jsonValue(self): 184 | return dict(list(super(SetType, self).jsonValue().items()) + 185 | [("elementType", self.elementType.jsonValue())]) 186 | 187 | @classmethod 188 | def fromJsonValue(cls, json_value): 189 | element_type = _deserialize_json_value(json_value["elementType"]) 190 | return cls(elementType=element_type, 191 | properties=json_value.get('properties', {})) 192 | 193 | 194 | class ChoiceType(DataType): 195 | 196 | def __init__(self, choices=[], properties={}): 197 | super(ChoiceType, self).__init__(properties) 198 | self.choices = {} 199 | for choice in choices: 200 | self.add(choice) 201 | 202 | def __repr__(self): 203 | sorted_values = sorted(self.choices.values(), 204 | key = lambda x: x.typeName()) 205 | choice_str = "[{}]".format(",".join([str(c) for c in sorted_values])) 206 | 207 | return "ChoiceType({}, {})".format(choice_str, self.properties) 208 | 209 | def add(self, new_choice): 210 | if new_choice.typeName() in self.choices: 211 | raise ValueError("Attempting to insert duplicate choice", 212 | new_choice) 213 | self.choices[new_choice.typeName()] = new_choice 214 | 215 | def merge(self, new_choices): 216 | if not isinstance(new_choices, list): 217 | new_choices = [ new_choices ] 218 | for choice in new_choices: 219 | existing = self.choices.get(choice.typeName(), UnknownType()) 220 | self.choices[choice.typeName()] = mergeDataTypes(existing, choice) 221 | 222 | def jsonValue(self): 223 | return dict(list(super(ChoiceType, self).jsonValue().items()) + 224 | [("choices", [v.jsonValue() 225 | for v in self.choices.values()])]) 226 | 227 | @classmethod 228 | def fromJsonValue(cls, json_value): 229 | choices = [_deserialize_json_value(c) for c in json_value["choices"]] 230 | return cls(choices=choices, properties=json_value.get('properties', {})) 231 | 232 | 233 | class MapType(DataType): 234 | 235 | def __init__(self, valueType=UnknownType(), properties={}): 236 | assert isinstance(valueType, DataType), "valueType should be DataType" 237 | super(MapType, self).__init__(properties) 238 | self.valueType = valueType 239 | 240 | def __repr__(self): 241 | return "MapType({}, {})".format(self.valueType, self.properties) 242 | 243 | def jsonValue(self): 244 | return dict(list(super(MapType, self).jsonValue().items()) + 245 | [("valueType", self.valueType.jsonValue())]) 246 | 247 | @classmethod 248 | def fromJsonValue(cls, json_value): 249 | return cls(valueType=_deserialize_json_value(json_value["valueType"]), 250 | properties=json_value.get('properties', {})) 251 | 252 | 253 | class Field(object): 254 | 255 | def __init__(self, name, dataType, properties={}): 256 | assert isinstance(dataType, DataType),\ 257 | "dataType should be DataType. Got " + str(dataType.__class__) 258 | assert isinstance(name, basestring),\ 259 | "Field name must be a string. Got " + str(name.__class__) 260 | 261 | # Note this only applies in Python 2.7 if the name is type unicode. In that case 262 | # we return a str (bytestring) encoded as utf-8. This is the same behavior as 263 | # pyspark.sql.types.StructField. Since we are serializing as utf-8 encoded JSON, 264 | # the correct values should be preserved when this gets mapped to Scala. 
265 | if not isinstance(name, str): 266 | name = name.encode('utf-8') 267 | self.name = name 268 | self.dataType = dataType 269 | self.properties = properties 270 | 271 | def __eq__(self, other): 272 | return (self.name == other.name and 273 | self.dataType == other.dataType) 274 | 275 | def __repr__(self): 276 | return "Field({}, {}, {})".format(self.name, self.dataType, 277 | self.properties) 278 | 279 | def jsonValue(self): 280 | return {"name": self.name, 281 | "container": self.dataType.jsonValue(), 282 | "properties": self.properties} 283 | 284 | @classmethod 285 | def fromJsonValue(cls, json_value): 286 | return cls(json_value["name"], 287 | _deserialize_json_value(json_value["container"]), 288 | json_value.get("properties", {})) 289 | 290 | 291 | class StructType(DataType): 292 | 293 | def __init__(self, fields=[], properties={}): 294 | super(StructType, self).__init__(properties) 295 | assert all(isinstance(f, Field) for f in fields),\ 296 | "fields should be a list of Field" 297 | self.fields = fields 298 | self.field_map = {field.name: field for field in fields} 299 | 300 | def __iter__(self): 301 | return iter(self.fields) 302 | 303 | def __repr__(self): 304 | return "StructType([{}], {})".format( 305 | ",".join([str(f) for f in self.fields]), self.properties) 306 | 307 | def add(self, field): 308 | assert isinstance(field, Field), "field must be of type Field" 309 | self.fields.append(field) 310 | self.field_map[field.name] = field 311 | 312 | def hasField(self, field): 313 | if isinstance(field, Field): 314 | field = field.name 315 | return field in self.field_map 316 | 317 | def getField(self, field): 318 | if isinstance(field, Field): 319 | field = field.name 320 | return self.field_map[field] 321 | 322 | def jsonValue(self): 323 | return dict(list(super(StructType, self).jsonValue().items()) + 324 | [("fields", [f.jsonValue() for f in self.fields])]) 325 | 326 | @classmethod 327 | def fromJsonValue(cls, json_value): 328 | return cls([Field.fromJsonValue(f) for f in json_value["fields"]], 329 | json_value.get("properties", {})) 330 | 331 | 332 | class EntityType(DataType): 333 | def __init__(self, entity, base_type, properties): 334 | raise NotImplementedError("EntityTypes not yet supported in Tape.") 335 | 336 | 337 | # --------------------------------------------------------------------------- 338 | # Utility methods 339 | # --------------------------------------------------------------------------- 340 | 341 | _atomic_types = [BinaryType, BooleanType, ByteType, DateType, DecimalType, 342 | DoubleType, EnumType, FloatType, IntegerType, LongType, NullType, 343 | ShortType, StringType, TimestampType, UnknownType] 344 | 345 | 346 | _complex_types = [ArrayType, ChoiceType, MapType, StructType, SetType] 347 | 348 | 349 | _atomic_type_map = dict((t.typeName(), t) for t in _atomic_types) 350 | 351 | 352 | _complex_type_map = dict((t.typeName(), t) for t in _complex_types) 353 | 354 | 355 | _all_type_map = dict((t.typeName(), t) for t in _atomic_types + _complex_types) 356 | 357 | 358 | def _deserialize_json_string(json_str): 359 | return _deserialize_json_value(json.loads(json_str)) 360 | 361 | 362 | def _deserialize_json_value(json_val): 363 | assert isinstance(json_val, dict), "Json value must be dictionary" 364 | data_type = json_val["dataType"] 365 | return _all_type_map[data_type].fromJsonValue(json_val) 366 | 367 | def _serialize_schema(schema): 368 | return json.dumps(schema.jsonValue()) 369 | 370 | def _make_choice(s1, s2): 371 | if isinstance(s1, ChoiceType): 372 | 
left_types = s1.choices 373 | else: 374 | left_types = {s1.typeName(): s1} 375 | 376 | if isinstance(s2, ChoiceType): 377 | right_types = s2.choices 378 | else: 379 | right_types = {s2.typeName(): s2} 380 | 381 | for typecode, datatype in iteritems(left_types): 382 | if typecode in right_types: 383 | right_types[typecode] = mergeDataTypes(datatype, 384 | right_types[typecode]) 385 | else: 386 | right_types[typecode] = datatype 387 | 388 | return ChoiceType(right_types.values(), s1.properties) 389 | 390 | 391 | # Simple Python merge implementation. This is less efficient than the Scala 392 | # version and should be used primarily for interactive manipulation. 393 | # Has similar limitations to the Scala version -- does not merge properties, 394 | # for instance. 395 | def mergeDataTypes(s1, s2): 396 | if isinstance(s1, UnknownType) or isinstance(s1, NullType): 397 | return s2 398 | elif isinstance(s2, UnknownType) or isinstance(s2, NullType): 399 | return s1 400 | elif isinstance(s1, ChoiceType) or isinstance(s2, ChoiceType): 401 | return _make_choice(s1, s2) 402 | elif type(s1) != type(s2): 403 | return _make_choice(s1, s2) 404 | else: 405 | if isinstance(s1, StructType): 406 | new_fields = [] 407 | # Fields that are present in both s1 and s2. 408 | for field in s1: 409 | if s2.hasField(field): 410 | new_fields.append( 411 | Field(field.name, 412 | mergeDataTypes(field.dataType, 413 | s2.getField(field).dataType), 414 | field.properties)) 415 | else: 416 | # Fields in s1 that are not in s2. 417 | new_fields.append(Field(field.name, field.dataType, 418 | field.properties)) 419 | 420 | # Fields in s2 that are not in s1. 421 | new_fields.extend([Field(field.name, field.dataType, 422 | field.properties) 423 | for field in s2 if not s1.hasField(field)]) 424 | return StructType(new_fields, s1.properties) 425 | elif isinstance(s1, ArrayType): 426 | return ArrayType(mergeDataTypes(s1.elementType, s2.elementType)) 427 | elif isinstance(s1, MapType): 428 | return MapType(mergeDataTypes(s1.valueType, s2.valueType)) 429 | elif isinstance(s1, EnumType): 430 | return EnumType(s1.options + s2.options) 431 | else: 432 | return s1 433 | 434 | 435 | def _create_dynamic_record(dynamicRecord): 436 | vals = dict() 437 | for k, v in dynamicRecord.items(): 438 | val = v 439 | if type(v) == dict: 440 | val = DynamicRecord(v) 441 | vals[k] = val 442 | return DynamicRecord(vals) 443 | 444 | 445 | def _revert_to_dict(dynamicRecord): 446 | if isinstance(dynamicRecord, dict): 447 | return {k: _revert_to_dict(v) for k,v in iteritems(dynamicRecord)} 448 | elif isinstance(dynamicRecord, list): 449 | return [_revert_to_dict(v) for v in dynamicRecord] 450 | else: 451 | return dynamicRecord 452 | 453 | class DynamicRecord(dict): 454 | def __getattr__(self, attr): 455 | return self[attr] 456 | 457 | def __setattr__(self, attr, value): 458 | self[attr] = value 459 | -------------------------------------------------------------------------------- /awsglue/job.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # Licensed under the Amazon Software License (the "License"). You may not use 3 | # this file except in compliance with the License. A copy of the License is 4 | # located at 5 | # 6 | # http://aws.amazon.com/asl/ 7 | # 8 | # or in the "license" file accompanying this file. This file is distributed 9 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express 10 | # or implied. 
See the License for the specific language governing 11 | # permissions and limitations under the License. 12 | 13 | class Job: 14 | @classmethod 15 | def continuation_options(cls): 16 | return [ '--continuation-option', 'continuation-enabled', 'continuation-readonly', 'continuation-ignore' ] 17 | 18 | @classmethod 19 | def job_bookmark_options(cls): 20 | return [ '--job-bookmark-option', 'job-bookmark-enable', 'job-bookmark-pause', 'job-bookmark-disable' ] 21 | @classmethod 22 | def job_bookmark_range_options(cls): 23 | return [ '--job-bookmark-from', '--job-bookmark-to' ] 24 | 25 | @classmethod 26 | def id_params(cls): 27 | return [ '--JOB_NAME', '--JOB_ID', '--JOB_RUN_ID', '--SECURITY_CONFIGURATION' ] 28 | 29 | @classmethod 30 | def encryption_type_options(cls): 31 | return [ '--encryption-type', 'sse-s3' ] 32 | 33 | @classmethod 34 | def data_lineage_options(cls): 35 | return [ '--enable-data-lineage'] 36 | 37 | def __init__(self, glue_context): 38 | self._job = glue_context._jvm.Job 39 | self._glue_context = glue_context 40 | 41 | def init(self, job_name, args = {}): 42 | self._job.init(job_name, self._glue_context._glue_scala_context, args) 43 | 44 | def isInitialized(self): 45 | return self._job.isInitialized() 46 | 47 | def commit(self): 48 | self._job.commit() 49 | 50 | -------------------------------------------------------------------------------- /awsglue/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # Licensed under the Amazon Software License (the "License"). You may not use 3 | # this file except in compliance with the License. A copy of the License is 4 | # located at 5 | # 6 | # http://aws.amazon.com/asl/ 7 | # 8 | # or in the "license" file accompanying this file. This file is distributed 9 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express 10 | # or implied. See the License for the specific language governing 11 | # permissions and limitations under the License. 12 | 13 | -------------------------------------------------------------------------------- /awsglue/scripts/activate_etl_connector.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # Licensed under the Amazon Software License (the "License"). You may not use 3 | # this file except in compliance with the License. A copy of the License is 4 | # located at 5 | # 6 | # http://aws.amazon.com/asl/ 7 | # 8 | # or in the "license" file accompanying this file. This file is distributed 9 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express 10 | # or implied. See the License for the specific language governing 11 | # permissions and limitations under the License. 12 | """ 13 | This script is supposed to be invoked by the tape-run.sh or PrepareLaunch class within the Tape container. It iterates 14 | over the connections supplied to extract the ECR URL. Using the URL, the docker image will be downloaded in a per-layer 15 | fashion and unpacked onto the container file system. Finally, the paths to the connector jars are written out to an 16 | output file.
Reference: https://rmannibucau.metawerx.net/post/docker-extracts-fileystem-with-bash 17 | """ 18 | 19 | import argparse 20 | import gzip 21 | import logging 22 | import os 23 | import random 24 | import re 25 | import shutil 26 | import string 27 | import subprocess 28 | import sys 29 | from typing import Any, Dict, List, Tuple, Union 30 | from urllib.parse import urlparse 31 | from os import path 32 | 33 | import boto3 34 | import requests 35 | from botocore.config import Config 36 | from botocore.exceptions import ClientError, NoCredentialsError 37 | from .connector_activation_util import boto_client_error 38 | 39 | LAYER_TAR_DIR = "layers/tar" 40 | LAYER_GZ_DIR = "layers/gz" 41 | MARKETPLACE = "MARKETPLACE" 42 | CUSTOM = "CUSTOM" 43 | HTTP_PROXY = "HTTP_PROXY" 44 | HTTPS_PROXY = "HTTPS_PROXY" 45 | NO_PROXY = "NO_PROXY" 46 | 47 | logger = logging.getLogger(__name__) 48 | logger.setLevel(logging.INFO) 49 | 50 | 51 | def add_stream_handler() -> None: 52 | """ 53 | Add a new stream handler to the logger at module level to emit LogRecords to stdout. With this setup, logs will show 54 | up in both the customer's log stream and our docker log stream to aid debugging. 55 | """ 56 | stream_handler = logging.StreamHandler(stream=sys.stdout) 57 | formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - Glue ETL Marketplace - %(message)s") 58 | stream_handler.setFormatter(formatter) 59 | stream_handler.setLevel(logging.INFO) 60 | logger.addHandler(stream_handler) 61 | 62 | 63 | def run_commands(commands: List[str]) -> Tuple[bytes, bytes]: 64 | """ 65 | Util function to run shell commands from Python. 66 | """ 67 | process = subprocess.Popen(commands, 68 | stdout=subprocess.PIPE, 69 | stderr=subprocess.PIPE) 70 | stdout, stderr = process.communicate() 71 | logger.info(f"run_commands output - \"{' '.join(commands)}\"\n" 72 | f"stdout: {stdout.decode()}\n" 73 | f"stderr: {stderr.decode()}") 74 | return stdout, stderr 75 | 76 | 77 | def send_get_request(url: str, header: Dict[str, str]) -> requests.Response: 78 | logger.debug(f"Sending GET request to {url} with {header.keys()} specified in header.") 79 | response = requests.get(url, headers=header) 80 | response.raise_for_status() 81 | return response 82 | 83 | 84 | def parse_url(url: str) -> Tuple[str, str]: 85 | res = urlparse(url, allow_fragments=False) 86 | return res.netloc, res.path.strip("/") 87 | 88 | 89 | def extract_ecr_region(ecr_root: str) -> Union[None, str]: 90 | """ 91 | Extract the AWS Region of the ECR registry from its root address, 92 | e.g. xxxxxxxxxxxx.dkr.ecr.us-east-1.amazonaws.com 93 | """ 94 | session = boto3.session.Session() 95 | for region in session.get_available_regions("ecr"): 96 | if region in ecr_root: 97 | return region 98 | 99 | 100 | def extract_registry_id(ecr_root: str) -> str: 101 | """ 102 | Extract the AWS account id of the ECR registry from its root address, 103 | e.g. xxxxxxxxxxxx.dkr.ecr.us-east-1.amazonaws.com 104 | """ 105 | match = re.match(r"^(\d{12})\.dkr\.ecr\.[a-z]{2}-[a-z]{4}-\d\.amazonaws\.com$", ecr_root) 106 | if match: 107 | return match.group(1) 108 | else: 109 | raise ValueError(f"Invalid ECR url supplied, couldn't find aws account from {ecr_root}.") 110 | 111 | 112 | @boto_client_error(logger) 113 | def get_ecr_authorization_token(ecr_root: str) -> str: 114 | """ 115 | Get the ECR authorization token to be used later to call the ECR HTTP API. Even though not clearly documented, the 116 | region is actually required to get the correct token, otherwise ECR returns Code 400 when the wrong token is used. 117 | """ 118 | region = extract_ecr_region(ecr_root) 119 | registry_id = extract_registry_id(ecr_root) 120 | ecr = boto3.client(service_name="ecr", region_name=region) 121 | logger.info(f"Requesting ECR authorization token for registryIds={registry_id} and region_name={region}.") 122 | response = ecr.get_authorization_token(registryIds=[registry_id]) 123 | return response["authorizationData"][0]["authorizationToken"] 124 | 125 |
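# Sketch of the parsing helper below (account id and image made up):
#
#   parse_ecr_url("https://123456789012.dkr.ecr.us-east-1.amazonaws.com/salesforce:7.2.0-latest")
#   # -> ("123456789012.dkr.ecr.us-east-1.amazonaws.com", "salesforce", "7.2.0-latest")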
126 | def parse_ecr_url(ecr_url: str) -> Tuple[str, str, str]: 127 | """ 128 | Parse the ECR root address, image name and tag from the given ECR URL, 129 | e.g. https://xxxxxxxxxxxx.dkr.ecr.us-east-1.amazonaws.com/salesforce:7.2.0-latest 130 | """ 131 | ecr_root, repo = parse_url(ecr_url) 132 | if not re.match(r"^\d{12}\.dkr\.ecr\.[a-z]{2}-[a-z]{4}-\d\.amazonaws\.com$", ecr_root): 133 | raise ValueError("malformed registry, correct pattern is https://aws_account_id.dkr.ecr.region.amazonaws.com") 134 | if not re.match(r"^[^:]+:[^:]+$", repo): 135 | raise ValueError("malformed image name, only one colon allowed to delimit image name and tag") 136 | image_name, tag = repo.split(":") 137 | return ecr_root, image_name, tag 138 | 139 | 140 | def get_docker_manifest(ecr_url: str, header: Dict[str, str]) -> Dict[str, Any]: 141 | """ 142 | Returns the manifest for the given image in ECR. It includes information about an image such as layers, size and 143 | digest. We extract the layers to get the digest id to download the archive file for each layer. 144 | """ 145 | ecr_root, image_name, tag = parse_ecr_url(ecr_url) 146 | manifest_url = f"https://{ecr_root}/v2/{image_name}/manifests/{tag}" 147 | logger.info(f"Calling ECR HTTP API to get manifest of {ecr_url}.") 148 | manifest = send_get_request(manifest_url, header).json() 149 | return manifest 150 | 151 | 152 | def download_and_unpack_docker_layer(ecr_url: str, digest: str, dir_prefix: str, header: Dict[str, str]) -> None: 153 | """ 154 | The Docker CLI and the daemon process are both unavailable within the Glue Python Shell runtime. In order to download 155 | the docker image and extract the connector jars inside, we need to download the layers that make up the image and unpack 156 | the file system so that we can access the jar files. The layer itself has multiple levels of compression applied, 157 | which is why we need to download it as a gz file and then unpack it as a tar file. The final unpack of the tar file is done 158 | via the 'tar' command line tool because the tarfile library fails due to a permission issue.
159 | """ 160 | logger.info(f"Download/unpacking {digest} layer of image: {ecr_url}.") 161 | layer_id = digest.split(":")[1] 162 | logger.info(f"Preparing layer url and gz file path to store layer {layer_id}.") 163 | layer_gz_path = f"{dir_prefix}/{LAYER_GZ_DIR}/{layer_id}.gz" 164 | ecr_root, image_name, tag = parse_ecr_url(ecr_url) 165 | layer_url = f"https://{ecr_root}/v2/{image_name}/blobs/{digest}" 166 | 167 | logger.info(f"Getting the layer file {layer_id} and store it as gz.") 168 | layer = send_get_request(layer_url, header) 169 | with open(layer_gz_path, "wb") as f: 170 | f.write(layer.content) 171 | 172 | logger.info(f"Unzipping the {layer_id} layer and store as tar file.") 173 | with gzip.open(f"{layer_gz_path}", "rb") as f_in: 174 | with open(f"{dir_prefix}/{LAYER_TAR_DIR}/{layer_id}", "wb") as f_out: 175 | shutil.copyfileobj(f_in, f_out) 176 | 177 | logger.info(f"Unarchiving {layer_id} layer as tar file.") 178 | run_commands(["tar", "-C", f"{dir_prefix}/{LAYER_TAR_DIR}/", "-xf", f"{dir_prefix}/{LAYER_TAR_DIR}/{layer_id}"]) 179 | 180 | 181 | def parse_args(args: List[str]) -> List[str]: 182 | arg_parser = argparse.ArgumentParser() 183 | arg_parser.add_argument("--connections", 184 | required=True, 185 | type=lambda x: x.split(","), 186 | help="a list of connection names we'll use to download jars for") 187 | arg_parser.add_argument("--result_path", 188 | required=True, 189 | help="file path to store the jar downloading result") 190 | arg_parser.add_argument("--region", 191 | required=True, 192 | help="aws region of the connections supplied") 193 | arg_parser.add_argument("--endpoint", 194 | required=True, 195 | help="endpoint to use to talk with Glue service") 196 | arg_parser.add_argument("--proxy", 197 | default=None, 198 | help="proxy to talk to Glue backend in case of VPC job") 199 | parsed_args = arg_parser.parse_args(args) 200 | return [parsed_args.connections, parsed_args.result_path, parsed_args.region, 201 | parsed_args.endpoint, parsed_args.proxy] 202 | 203 | 204 | def id_generator(size: int = 5, chars: str = string.ascii_uppercase + string.digits) -> str: 205 | """ 206 | Generate a random Id using letters from "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" with {size} digits. 207 | """ 208 | return ''.join(random.choice(chars) for _ in range(size)) 209 | 210 | 211 | def get_connection(region: str, endpoint: str, conn: str, proxy: str = None) -> Union[Dict, None]: 212 | """ 213 | Get catalog connection metadata by calling Boto3 get_connection API, supports custom supplied region and endpoint. 214 | """ 215 | config = Config() 216 | if proxy: 217 | config.proxies = {'https': proxy} 218 | glue = boto3.Session().client( 219 | service_name="glue", 220 | region_name=region, 221 | endpoint_url=endpoint, 222 | config=config 223 | ) 224 | logger.info(f"using region: {region}, proxy: {proxy} and glue endpoint: {endpoint} to get connection: {conn}") 225 | try: 226 | return glue.get_connection(Name=conn) 227 | except ClientError: 228 | logger.exception(f"Failed to get connection detail for {conn}, skip jar downloading for it") 229 | except NoCredentialsError: 230 | logger.exception(f"Unable to get credential to call GetConnection for {conn}, skip jar downloading for it." 231 | f" Check if the IAM role has the right permission or if you need to increase IMDS retry.") 232 | return None 233 | 234 | 235 | def collect_files_by_suffix(input_dir: str, suffix: str) -> List[str]: 236 | """ 237 | Given an input path to a directory, find all files ending with the input suffix. 
Return a list of absolute paths of 238 | these files. 239 | """ 240 | res = [] 241 | for dirpath, _, filenames in os.walk(input_dir): 242 | for file in filenames: 243 | if not file.endswith(suffix): 244 | continue 245 | else: 246 | abs_path = os.path.abspath(os.path.join(dirpath, file)) 247 | res.append(abs_path) 248 | return res 249 | 250 | 251 | @boto_client_error(logger, "Failed to download jars for custom connection from S3...") 252 | def download_custom_jars(conn: Dict[str, Any], dest_folder: str = "/tmp/custom_connection_jars"): 253 | os.makedirs(dest_folder, exist_ok=True) 254 | s3_urls: List[str] = conn["Connection"]["ConnectionProperties"]["CONNECTOR_URL"].split(",") 255 | s3 = boto3.client("s3") 256 | res = [] 257 | 258 | for url in s3_urls: 259 | if url.strip().startswith("s3://") and url.strip().endswith(".jar"): 260 | bucket, key = parse_url(url.strip()) 261 | file_path = f"{dest_folder}/etl-{key.split('/')[-1]}" 262 | s3.download_file(bucket, key, file_path) 263 | res.append(file_path) 264 | else: 265 | logger.error("custom connection can only have S3 URLs ending with '.jar' as connector url.") 266 | logger.info(f"collected jar paths: {res} for connection: {conn}.") 267 | return res 268 | 269 | 270 | def download_jars_per_connection(conn: str, region: str, endpoint: str, proxy: str = None) -> List[str]: 271 | # validate connection type 272 | connection = get_connection(region, endpoint, conn, proxy) 273 | if connection is None: 274 | return [] 275 | # download jars from S3 in case of custom connection 276 | elif connection["Connection"]["ConnectionType"] == CUSTOM: 277 | logger.info(f"Connection {conn} is a Custom connection, try to download jars for it from S3.") 278 | return download_custom_jars(connection) 279 | # return empty list in case of non-marketplace connection 280 | elif connection["Connection"]["ConnectionType"] != MARKETPLACE: 281 | logger.warning(f"Connection {conn} is not a Marketplace connection, skip jar downloading for it") 282 | return [] 283 | 284 | # get the connection classname 285 | if "CONNECTOR_CLASS_NAME" in connection["Connection"]["ConnectionProperties"]: 286 | driver_name = connection["Connection"]["ConnectionProperties"]["CONNECTOR_CLASS_NAME"] 287 | else: driver_name = None  # guard so the OEM block below cannot hit an unbound name 288 | # get the connection ECR URL 289 | ecr_url = connection["Connection"]["ConnectionProperties"]["CONNECTOR_URL"] 290 | ecr_root, _, _ = parse_ecr_url(ecr_url) 291 | 292 | # download the jars 293 | token = get_ecr_authorization_token(ecr_root) 294 | http_header = {"Authorization": f"Basic {token}"} 295 | 296 | manifest = get_docker_manifest(ecr_url, http_header) 297 | 298 | # make directory for the jars of the given connection 299 | dir_prefix = id_generator() 300 | os.makedirs(f"{dir_prefix}/{LAYER_TAR_DIR}", exist_ok=True) 301 | os.makedirs(f"{dir_prefix}/{LAYER_GZ_DIR}", exist_ok=True) 302 | 303 | for layer in manifest["layers"]: 304 | download_and_unpack_docker_layer(ecr_url, layer["digest"], dir_prefix, http_header) 305 | 306 | # return the jar paths 307 | res = collect_files_by_suffix(f"{dir_prefix}/{LAYER_TAR_DIR}/jars", ".jar") 308 | logger.info(f"Container paths are: {res}") 309 | 310 | # Write OEM key to /tmp/glue-marketplace.conf 311 | oem_key_path = f"{dir_prefix}/{LAYER_TAR_DIR}/oem/oem.txt" 312 | if path.exists(oem_key_path): 313 | with open(oem_key_path, 'r') as oem_file: 314 | oem_key = oem_file.readline() 315 | oem_value = oem_file.readline() 316 | output = """marketplace_oem = { 317 | %s = { 318 | oem_key = %s oem_value = %s 319 | } 320 | }\n""" % (driver_name, oem_key, oem_value)
321 | with open("/tmp/glue-marketplace.conf", 'a') as opened_file: 322 | opened_file.write(output) 323 | logger.info("OEM information is written.") 324 | 325 | if not res: 326 | logger.warning(f"found no connector jars from {ecr_url} provided by {conn}, please contact AWS support or" 327 | f" the Connector product owner to debug the issue.") 328 | else: 329 | logger.info(f"collected jar paths: {res} for connection: {conn}") 330 | return res 331 | 332 | 333 | def main(): 334 | # in case of VPC, we directly update the config with a proxy for the glue client. Hence we unset the environment 335 | # variables here so that clients for other AWS services do not go through Glue's proxy. The unset is process-local 336 | # and will not affect subsequent aws cli usage. 337 | if HTTP_PROXY in os.environ: 338 | del os.environ[HTTP_PROXY] 339 | if HTTPS_PROXY in os.environ: 340 | del os.environ[HTTPS_PROXY] 341 | if NO_PROXY in os.environ: 342 | del os.environ[NO_PROXY] 343 | 344 | connections, result_path, region, endpoint, proxy = parse_args(sys.argv[1:]) 345 | add_stream_handler() 346 | 347 | res = [] 348 | for conn in connections: 349 | logger.info(f"Start downloading connector jars for connection: {conn}") 350 | res += download_jars_per_connection(conn, region, endpoint, proxy) 351 | 352 | # concatenate the jar paths as a string and write it out to result_path 353 | with open(result_path, "w") as f: 354 | f.write(",".join(res)) 355 | 356 | logger.info(f"successfully wrote jar paths to \"{result_path}\"") 357 | 358 | 359 | if __name__ == "__main__": 360 | main() 361 | -------------------------------------------------------------------------------- /awsglue/scripts/connector_activation_util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # Licensed under the Amazon Software License (the "License"). You may not use 3 | # this file except in compliance with the License. A copy of the License is 4 | # located at 5 | # 6 | # http://aws.amazon.com/asl/ 7 | # 8 | # or in the "license" file accompanying this file. This file is distributed 9 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express 10 | # or implied. See the License for the specific language governing 11 | # permissions and limitations under the License.
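# Usage sketch (the function and message below are hypothetical): decorate any
# boto3-calling function so ClientError/NoCredentialsError are logged
# consistently before being re-raised by the boto_client_error decorator
# defined in this module:
#
#   @boto_client_error(logger, "while fetching a connection")
#   def fetch_connection(name):
#       return boto3.client("glue").get_connection(Name=name)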
12 | 13 | from logging import Logger 14 | 15 | from botocore.exceptions import ClientError, NoCredentialsError 16 | 17 | 18 | def boto_client_error(logger: Logger, message: str = ""): 19 | def decorator(func): 20 | def wrapper(*args, **kwargs): 21 | try: 22 | return func(*args, **kwargs) 23 | except ClientError as error: 24 | if error.response['Error']['Code'] == 'InternalError': # Generic error 25 | # We grab the message, request ID, and HTTP code to give to customer support 26 | logger.error('Error Message: {}'.format(error.response['Error']['Message'])) 27 | logger.error('Request ID: {}'.format(error.response['ResponseMetadata']['RequestId'])) 28 | logger.error('Http code: {}'.format(error.response['ResponseMetadata']['HTTPStatusCode'])) 29 | else: 30 | logger.error(f"boto3 clientError raised in function {func.__name__}" + repr(error) + message) 31 | raise 32 | except NoCredentialsError as error: 33 | logger.error(f"boto3 NoCredentialsError raised in function {func.__name__}: {repr(error)}" 34 | f"Check if the IAM role has the right permission or if you need to increase IMDS retry.") 35 | raise 36 | 37 | return wrapper 38 | return decorator 39 | -------------------------------------------------------------------------------- /awsglue/scripts/crawler_redo_from_backup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # Licensed under the Amazon Software License (the "License"). You may not use 3 | # this file except in compliance with the License. A copy of the License is 4 | # located at 5 | # 6 | # http://aws.amazon.com/asl/ 7 | # 8 | # or in the "license" file accompanying this file. This file is distributed 9 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express 10 | # or implied. See the License for the specific language governing 11 | # permissions and limitations under the License. 12 | 13 | from __future__ import print_function 14 | 15 | import sys 16 | import argparse 17 | from awsglue.context import GlueContext 18 | from pyspark.context import SparkContext 19 | from awsglue.dynamicframe import DynamicFrame 20 | from awsglue.transforms import get_transform 21 | from pyspark.sql.types import * 22 | from .scripts_utils import * 23 | from pyspark.sql.functions import * 24 | 25 | def crawler_redo_from_backup(glue_context, **options): 26 | spark_ctxt = glue_context._instantiatedContext 27 | backup_location = options['s3.backup_location'] 28 | 29 | # Read from s3 30 | data = read_from_s3(glue_context, backup_location) 31 | 32 | # Write to Catalog 33 | for entity_type in ['table', 'tableToDelete', 'partition', 'partitionToDelete']: 34 | write_df_to_catalog(data[entity_type], entity_type, glue_context, options) 35 | 36 | def crawler_redo_from_backup_options(args): 37 | # arguments 38 | parser = argparse.ArgumentParser(description='This script allows you to restore a namespace to a specific backup.') 39 | parser.add_argument('-c', '--crawler-name', required=True, help='Name of the crawler to restore.') 40 | parser.add_argument('-b', '--backup-location', required=True, help='Location of the backup to use.') 41 | parser.add_argument('-d', '--database-name', required=False, help='Database to back up. 
If not specified, ' 42 | 'the database target of the crawler is used instead.') 43 | parser.add_argument('-r', '--region', required=False, default=DEFAULT_REGION, help='Optional service endpoint region.') 44 | 45 | 46 | options, unknown = parser.parse_known_args(args) 47 | 48 | if options.database_name is not None: 49 | database_name = options.database_name 50 | else: 51 | import boto3 52 | glue_endpoint = DEFAULT_GLUE_ENDPOINT 53 | glue = boto3.client('glue', endpoint_url="https://%s.%s.amazonaws.com" % (glue_endpoint, options.region)) 54 | crawler = glue.get_crawler(Name=options.crawler_name)['Crawler'] 55 | database_name = crawler['DatabaseName'] 56 | 57 | return { 58 | "catalog.name": DEFAULT_CATALOG_ENDPOINT, 59 | "catalog.region": options.region, 60 | "catalog.database": database_name, 61 | "crawler.name" : options.crawler_name, 62 | "s3.backup_location" : options.backup_location 63 | } 64 | 65 | def main(): 66 | 67 | # spark env 68 | sc = SparkContext() 69 | glue_context = GlueContext(sc) 70 | 71 | crawler_redo_from_backup( 72 | glue_context, 73 | **crawler_redo_from_backup_options(sys.argv[1:])) 74 | 75 | if __name__ == '__main__': 76 | main() 77 | -------------------------------------------------------------------------------- /awsglue/scripts/crawler_undo.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # Licensed under the Amazon Software License (the "License"). You may not use 3 | # this file except in compliance with the License. A copy of the License is 4 | # located at 5 | # 6 | # http://aws.amazon.com/asl/ 7 | # 8 | # or in the "license" file accompanying this file. This file is distributed 9 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express 10 | # or implied. See the License for the specific language governing 11 | # permissions and limitations under the License. 
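# Invocation sketch (names and values hypothetical; the flags are defined in
# crawler_undo_options below), e.g. via spark-submit with the Glue libraries
# on the path:
#
#   spark-submit crawler_undo.py -c my-crawler -b s3://my-bucket/crawler-backups -t 1546300800000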
12 | 13 | from __future__ import print_function 14 | 15 | import sys 16 | import argparse 17 | from awsglue.context import GlueContext 18 | from pyspark.context import SparkContext 19 | 20 | from awsglue.dynamicframe import DynamicFrame 21 | from awsglue.transforms import get_transform 22 | from pyspark.sql.types import * 23 | from .scripts_utils import * 24 | from pyspark.sql.functions import * 25 | 26 | def crawler_backup(glue_context, data, options): 27 | crawler_name = options['crawler.name'] 28 | backup_location = options['s3.backup_location'] 29 | database_name = options['catalog.database'] 30 | 31 | # Only get data for this crawler 32 | data['table'] = data['table'].filter("parameters.UPDATED_BY_CRAWLER = '%s'" % crawler_name) 33 | data['partition'] = data['partition'].join(data['table'].withColumn('tableName', col('name')), 'tableName', 'leftsemi') 34 | 35 | if backup_location is not None: 36 | # Backup the contents of the catalog at an s3 location 37 | write_backup(data, database_name, backup_location, glue_context) 38 | 39 | def crawler_undo(glue_context, **options): 40 | spark_ctxt = glue_context._instantiatedContext 41 | crawler_name = options['crawler.name'] 42 | database_name = options['catalog.database'] 43 | timestamp = options['timestamp'] 44 | options["catalog.tableVersions"] = True 45 | 46 | data = read_from_catalog(glue_context, options) 47 | 48 | crawler_backup(glue_context, data, options) 49 | 50 | # Find all the table versions for this crawler 51 | crawler_tables = data['tableVersion'].select(col("table.updateTime").alias("updateTime"), col("table"), col('table.parameters.UPDATED_BY_CRAWLER')).filter("UPDATED_BY_CRAWLER = '%s'" % crawler_name) 52 | 53 | # Find the latest previous version of tables for this crawler that were updated or deleted since the last timestamp. 54 | filtered = crawler_tables.filter("updateTime <= %d" % timestamp).withColumn("filtered_name", col("table.name")) 55 | update_times = filtered.groupBy("table.name").max("table.updateTime").withColumnRenamed("max(table.updateTime AS `updateTime`)","time") 56 | joined = filtered.join(update_times, (col("filtered_name") == col("name")) & (col("updateTime") == col("time")), 'inner') 57 | tables_to_write = joined.select(col("table.*")) 58 | 59 | # Find the tables that were created since the last timestamp 60 | names = crawler_tables.select(col("table.name")).distinct() 61 | present_before_timestamp = joined.select(col("table.name")) 62 | tables_to_delete = names.subtract(present_before_timestamp) 63 | 64 | # Find the partitions that were created since the last timestamp 65 | partitions_to_delete = data['partition'].withColumn('name', col('tableName')).join(crawler_tables.withColumn('name', col('table.name')), 'name', 'leftsemi').filter("creationTime < %d" % timestamp) 66 | 67 | # Write to Catalog 68 | write_df_to_catalog(tables_to_write, "table", glue_context, options) 69 | write_df_to_catalog(tables_to_delete, "tableToDelete", glue_context, options) 70 | write_df_to_catalog(partitions_to_delete, "partitionToDelete", glue_context, options) 71 | 72 | def crawler_undo_options(args): 73 | # arguments 74 | parser = argparse.ArgumentParser(description='This script allows you to roll back the effects of a crawler.') 75 | parser.add_argument('-c', '--crawler-name', required=True, help='Name of the crawler to roll back.') 76 | parser.add_argument('-b', '--backup-location', required=False, help='Location of the backup to use.
If not specified, no backup is used.') 77 | parser.add_argument('-d', '--database-name', required=False, help='Database to roll back. If not specified, ' 78 | 'the database target of the crawler is used instead.') 79 | parser.add_argument('-t', '--timestamp', required=False, help='Timestamp to rollback to, in milliseconds since epoch. If not specified, ' 80 | 'the start timestamp of the crawler is used instead.') 81 | parser.add_argument('-r', '--region', required=False, default=DEFAULT_REGION, help='Optional DataCatalog service endpoint region.') 82 | 83 | options, unknown = parser.parse_known_args(args) 84 | 85 | if not (options.database_name is not None and options.timestamp is not None): 86 | import boto3 # Import is done here to ensure script does not fail in case boto3 is not required. 87 | glue_endpoint = DEFAULT_GLUE_ENDPOINT 88 | glue = boto3.client('glue', endpoint_url="https://%s.%s.amazonaws.com" % (glue_endpoint, options.region)) 89 | crawler = glue.get_crawler(Name=options.crawler_name)['Crawler'] 90 | 91 | if options.database_name is not None: 92 | database_name = options.database_name 93 | else: 94 | database_name = crawler['DatabaseName'] 95 | 96 | if options.timestamp is not None: 97 | timestamp = options.timestamp 98 | else: 99 | timestamp = crawler['LastCrawlInfo']['StartTime'] 100 | 101 | return { 102 | "catalog.name": DEFAULT_CATALOG_ENDPOINT, 103 | "catalog.region": options.region, 104 | "catalog.database": database_name, 105 | "crawler.name" : options.crawler_name, 106 | "s3.backup_location" : options.backup_location, 107 | "timestamp": int(timestamp) 108 | } 109 | 110 | def main(): 111 | 112 | # spark env 113 | sc = SparkContext() 114 | glue_context = GlueContext(sc) 115 | 116 | crawler_undo( 117 | glue_context, 118 | **crawler_undo_options(sys.argv[1:])) 119 | 120 | if __name__ == '__main__': 121 | main() 122 | -------------------------------------------------------------------------------- /awsglue/scripts/scripts_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # Licensed under the Amazon Software License (the "License"). You may not use 3 | # this file except in compliance with the License. A copy of the License is 4 | # located at 5 | # 6 | # http://aws.amazon.com/asl/ 7 | # 8 | # or in the "license" file accompanying this file. This file is distributed 9 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express 10 | # or implied. See the License for the specific language governing 11 | # permissions and limitations under the License. 
12 | 13 | import os 14 | from awsglue.context import GlueContext 15 | from awsglue.dynamicframe import DynamicFrame 16 | from awsglue.transforms import get_transform 17 | from pyspark.sql.types import * 18 | from pyspark.sql.functions import * 19 | 20 | COLLECT_RESULT_NAME = "collect_list(named_struct(NamePlaceholder(), unresolvedstar()))" 21 | DEFAULT_CATALOG_ENDPOINT = 'daylight-gamma' 22 | DEFAULT_GLUE_ENDPOINT = 'glue-beta' 23 | DEFAULT_REGION = 'us-east-1' 24 | 25 | def write_backup(data, database_name, backup_location, glue_context): 26 | nested_tables = nest_data_frame(_order_columns_for_backup(data['table']), database_name, 'table') 27 | nested_partitions = nest_data_frame(_order_columns_for_backup(data['partition']), database_name, 'partition') 28 | write_df_to_s3( 29 | glue_context, 30 | nested_tables.withColumn("table",lit("empty")).select(col("table"),("items"),("database"),("type")).union(nested_partitions), 31 | backup_location 32 | ) 33 | 34 | def _order_columns_for_backup(dataframe): 35 | return dataframe.select( 36 | col('name'), 37 | col('description'), 38 | col('owner'), 39 | col('createTime'), 40 | col('updateTime'), 41 | col('lastAccessTime'), 42 | col('lastAnalyzedTime'), 43 | col('retention'), 44 | col('storageDescriptor'), 45 | col('partitionKeys'), 46 | col('tableType'), 47 | col('parameters'), 48 | col('createdBy'), 49 | col('values'), 50 | col('namespaceName'), 51 | col('tableName'), 52 | col('table') 53 | ) 54 | 55 | def nest_data_frame(data_frame, database_name, entity_type): 56 | if entity_type.startswith("table"): 57 | # Entity is a table 58 | return data_frame.agg(collect_list(struct("*"))).withColumnRenamed(COLLECT_RESULT_NAME, "items").withColumn("database",lit(database_name)).withColumn("type", lit(entity_type)) 59 | elif entity_type.startswith("partition"): 60 | # Entity is a partition 61 | return data_frame.groupBy('tableName').agg(collect_list(struct("*"))).withColumnRenamed(COLLECT_RESULT_NAME, "items").withColumn("database",lit(database_name)).withColumn("type", lit(entity_type)).withColumnRenamed("tableName","table") 62 | elif entity_type.startswith("database"): 63 | return data_frame.groupBy().agg(collect_list(struct("*"))).withColumnRenamed(COLLECT_RESULT_NAME, "items").withColumn("type", lit(entity_type)) 64 | else: 65 | raise Exception("entity_type %s is not recognized, your backup data may be corrupted..." % entity_type) 66 | 67 | def write_df_to_catalog(data_frame, entity_type, glue_context, options): 68 | # Check if data frame is empty. There is no "empty" method for data frame, this is the closest we get. 
69 | if data_frame.rdd.isEmpty(): 70 | return # nothing to do 71 | database_name = options['catalog.database'] 72 | nested_data_frame = nest_data_frame(data_frame, database_name, entity_type) 73 | dynamic_frame = DynamicFrame.fromDF(nested_data_frame, glue_context, entity_type) 74 | sink = glue_context.getSink('catalog', **options) 75 | sink.write(dynamic_frame) 76 | 77 | def catalog_dict(data_frame): 78 | databases = data_frame.filter("type = 'database'").select(explode(data_frame['items'])).select(col("col.*")) 79 | tables = data_frame.filter("type = 'table'").select(explode(data_frame['items'])).select(col("col.*")) 80 | table_versions = data_frame.filter("type = 'tableVersion'").select(explode(data_frame['items'])).select(col("col.*")) 81 | partitions = data_frame.filter("type = 'partition'").select(explode(data_frame['items'])).select(col("col.*")) 82 | tables_to_delete = data_frame.filter("type = 'tableToDelete'").select(explode(data_frame['items'])).select(col("col.*")) 83 | partitions_to_delete = data_frame.filter("type = 'partitionToDelete'").select(explode(data_frame['items'])).select(col("col.*")) 84 | return { 85 | 'database' : databases, 86 | 'table' : tables, 87 | 'tableVersion' : table_versions, 88 | 'partition' : partitions, 89 | 'tableToDelete' : tables_to_delete, 90 | 'partitionToDelete' : partitions_to_delete 91 | } 92 | 93 | def read_from_catalog(glue_context, options): 94 | return catalog_dict(glue_context.create_dynamic_frame_from_options( 95 | connection_type="com.amazonaws.services.glue.connections.DataCatalogConnection", connection_options=options).toDF()) 96 | 97 | def write_df_to_s3(glue_context, data_frame, backup_location): 98 | dynamic_frame = DynamicFrame.fromDF(data_frame, glue_context, "toS3") 99 | sink = glue_context.getSink("s3", path=backup_location) 100 | sink.setFormat("json") 101 | sink.write(dynamic_frame) 102 | 103 | def read_from_s3(glue_context, backup_location): 104 | src = glue_context.getSource("file", paths=[backup_location]) 105 | src.setFormat('json') 106 | return catalog_dict(src.getFrame().toDF()) 107 | -------------------------------------------------------------------------------- /awsglue/streaming_data_source.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # Licensed under the Amazon Software License (the "License"). You may not use 3 | # this file except in compliance with the License. A copy of the License is 4 | # located at 5 | # 6 | # http://aws.amazon.com/asl/ 7 | # 8 | # or in the "license" file accompanying this file. This file is distributed 9 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express 10 | # or implied. See the License for the specific language governing 11 | # permissions and limitations under the License. 
12 | 13 | from awsglue.utils import makeOptions, callsite 14 | from pyspark.sql import DataFrame 15 | 16 | class StreamingDataSource(object): 17 | def __init__(self, j_source, sql_ctx, name): 18 | self._jsource = j_source 19 | self._sql_ctx = sql_ctx 20 | self.name = name 21 | 22 | def setFormat(self, format, **options): 23 | options["callSite"] = callsite() 24 | self._jsource.setFormat(format, makeOptions(self._sql_ctx._sc, options)) 25 | 26 | def getFrame(self): 27 | jdf = self._jsource.getDataFrame() 28 | return DataFrame(jdf, self._sql_ctx) 29 | -------------------------------------------------------------------------------- /awsglue/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # Licensed under the Amazon Software License (the "License"). You may not use 3 | # this file except in compliance with the License. A copy of the License is 4 | # located at 5 | # 6 | # http://aws.amazon.com/asl/ 7 | # 8 | # or in the "license" file accompanying this file. This file is distributed 9 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express 10 | # or implied. See the License for the specific language governing 11 | # permissions and limitations under the License. 12 | 13 | from .transform import GlueTransform 14 | from .unbox import Unbox 15 | from .unnest_frame import UnnestFrame 16 | from .relationalize import Relationalize 17 | from .field_transforms import RenameField, DropFields, SelectFields, SplitFields, SplitRows, Join, Spigot 18 | from .collection_transforms import SelectFromCollection, MapToCollection, FlatMap 19 | from .drop_nulls import DropNullFields 20 | from .apply_mapping import ApplyMapping 21 | from .repartition import Repartition 22 | from .resolve_choice import ResolveChoice 23 | from .errors_as_dynamicframe import ErrorsAsDynamicFrame 24 | from .dynamicframe_filter import Filter 25 | from .dynamicframe_map import Map 26 | from .coalesce import Coalesce 27 | from .union import Union 28 | import json 29 | 30 | ALL_TRANSFORMS = {Unbox, RenameField, DropFields, SplitFields, SelectFields, SplitRows, 31 | UnnestFrame, Relationalize, SelectFromCollection, 32 | MapToCollection, ErrorsAsDynamicFrame, FlatMap, DropNullFields, 33 | Join, ApplyMapping, Repartition, ResolveChoice, Spigot, Filter, Map, Coalesce, Union} 34 | 35 | __all__ = [transform.__name__ for transform in ALL_TRANSFORMS] 36 | 37 | def get_transforms(): 38 | return {transform() for transform in ALL_TRANSFORMS} 39 | 40 | def get_transform(name): 41 | transform, = [t for t in get_transforms() if t.name().lower() == name.lower()] or (None,) 42 | return transform 43 | 44 | def describe_transform(name): 45 | transform = get_transform(name) 46 | description = transform.describe() if transform else {} 47 | return json.dumps(description, sort_keys=True, indent=4, separators=(',', ': ')) 48 | -------------------------------------------------------------------------------- /awsglue/transforms/apply_mapping.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # Licensed under the Amazon Software License (the "License"). You may not use 3 | # this file except in compliance with the License. A copy of the License is 4 | # located at 5 | # 6 | # http://aws.amazon.com/asl/ 7 | # 8 | # or in the "license" file accompanying this file. 
This file is distributed 9 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express 10 | # or implied. See the License for the specific language governing 11 | # permissions and limitations under the License. 12 | 13 | from awsglue.transforms import DropFields, GlueTransform 14 | 15 | class ApplyMapping(GlueTransform): 16 | def __call__(self, frame, mappings, case_sensitive = False, 17 | transformation_ctx = "", info = "", stageThreshold = 0, totalThreshold = 0): 18 | return frame.apply_mapping(mappings, case_sensitive, transformation_ctx, 19 | info, stageThreshold, totalThreshold) 20 | 21 | @classmethod 22 | def describeArgs(cls): 23 | arg1 = {"name": "frame", 24 | "type": "DynamicFrame", 25 | "description": "DynamicFrame to transform", 26 | "optional": False, 27 | "defaultValue": None} 28 | arg2 = {"name": "mappings", 29 | "type": "DynamicFrame", 30 | "description": "List of mapping tuples (source col, source type, target col, target type)", 31 | "optional": False, 32 | "defaultValue": None} 33 | arg3 = {"name": "case_sensitive", 34 | "type": "Boolean", 35 | "description": "Whether ", 36 | "optional": True, 37 | "defaultValue": "False"} 38 | arg4 = {"name": "transformation_ctx", 39 | "type": "String", 40 | "description": "A unique string that is used to identify stats / state information", 41 | "optional": True, 42 | "defaultValue": ""} 43 | arg5 = {"name": "info", 44 | "type": "String", 45 | "description": "Any string to be associated with errors in the transformation", 46 | "optional": True, 47 | "defaultValue": "\"\""} 48 | arg6 = {"name": "stageThreshold", 49 | "type": "Integer", 50 | "description": "Max number of errors in the transformation until processing will error out", 51 | "optional": True, 52 | "defaultValue": "0"} 53 | arg7 = {"name": "totalThreshold", 54 | "type": "Integer", 55 | "description": "Max number of errors total until processing will error out.", 56 | "optional": True, 57 | "defaultValue": "0"} 58 | 59 | return [arg1, arg2, arg3, arg4, arg5, arg6, arg7] 60 | 61 | @classmethod 62 | def describeTransform(cls): 63 | return "Apply a declarative mapping to this DynamicFrame." 64 | 65 | @classmethod 66 | def describeErrors(cls): 67 | return [] 68 | 69 | @classmethod 70 | def describeReturn(cls): 71 | return {"type": "DynamicFrame", 72 | "description": "DynamicFrame after applying mappings."} 73 | -------------------------------------------------------------------------------- /awsglue/transforms/coalesce.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # Licensed under the Amazon Software License (the "License"). You may not use 3 | # this file except in compliance with the License. A copy of the License is 4 | # located at 5 | # 6 | # http://aws.amazon.com/asl/ 7 | # 8 | # or in the "license" file accompanying this file. This file is distributed 9 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express 10 | # or implied. See the License for the specific language governing 11 | # permissions and limitations under the License. 
12 | 13 | from awsglue.transforms import GlueTransform 14 | 15 | class Coalesce(GlueTransform): 16 | def __call__(self, frame, num_partitions, shuffle = False, transformation_ctx = "", info = "", 17 | stageThreshold = 0, totalThreshold = 0): 18 | return frame.coalesce(num_partitions, shuffle, transformation_ctx, info, stageThreshold, totalThreshold) 19 | 20 | @classmethod 21 | def describeArgs(cls): 22 | arg1 = {"name": "num_partitions", 23 | "type": "DynamicFrame", 24 | "description": "Number of partitions", 25 | "optional": False, 26 | "defaultValue": None} 27 | arg2 = {"name": "shuffle", 28 | "type": "Boolean", 29 | "description": "A boolean indicating whether shuffling enabled for the coalesce process", 30 | "optional": True, 31 | "defaultValue": False} 32 | arg3 = {"name": "transformation_ctx", 33 | "type": "String", 34 | "description": "A unique string that is used to identify stats / state information", 35 | "optional": True, 36 | "defaultValue": ""} 37 | arg4 = {"name": "info", 38 | "type": "String", 39 | "description": "Any string to be associated with errors in the transformation", 40 | "optional": True, 41 | "defaultValue": "\"\""} 42 | arg5 = {"name": "stageThreshold", 43 | "type": "Integer", 44 | "description": "Max number of errors in the transformation until processing will error out", 45 | "optional": True, 46 | "defaultValue": "0"} 47 | arg6 = {"name": "totalThreshold", 48 | "type": "Integer", 49 | "description": "Max number of errors total until processing will error out.", 50 | "optional": True, 51 | "defaultValue": "0"} 52 | 53 | return [arg1, arg2, arg3, arg4, arg5, arg6] 54 | 55 | @classmethod 56 | def describeTransform(cls): 57 | return "Coalesces a DynamicFrame." 58 | 59 | @classmethod 60 | def describeErrors(cls): 61 | return [] 62 | 63 | @classmethod 64 | def describeReturn(cls): 65 | return {"type": "DynamicFrame", 66 | "description": "The coalesced DynamicFrame."} 67 | -------------------------------------------------------------------------------- /awsglue/transforms/collection_transforms.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # Licensed under the Amazon Software License (the "License"). You may not use 3 | # this file except in compliance with the License. A copy of the License is 4 | # located at 5 | # 6 | # http://aws.amazon.com/asl/ 7 | # 8 | # or in the "license" file accompanying this file. This file is distributed 9 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express 10 | # or implied. See the License for the specific language governing 11 | # permissions and limitations under the License. 
12 | 13 | from awsglue.transforms import GlueTransform 14 | 15 | class SelectFromCollection(GlueTransform): 16 | 17 | def __call__(self, dfc, key, transformation_ctx = ""): 18 | return dfc.select(key, transformation_ctx) 19 | 20 | @classmethod 21 | def describeArgs(cls): 22 | arg1 = {"name": "dfc", 23 | "type": "DynamicFrameCollection", 24 | "description": "select one DynamicFrame from this DynamicFrameCollection", 25 | "optional": False, 26 | "defaultValue": None} 27 | 28 | arg2 = {"name": "key", 29 | "type": "String", 30 | "description": "The key to select", 31 | "optional": False, 32 | "defaultValue": None} 33 | 34 | arg3 = {"name": "transformation_ctx", 35 | "type": "String", 36 | "description": "A unique string that is used to identify stats / state information", 37 | "optional": True, 38 | "defaultValue": ""} 39 | 40 | return [arg1, arg2, arg3] 41 | 42 | @classmethod 43 | def describeTransform(cls): 44 | return "Select one DynamicFrame out from the DynamicFrameCollection" 45 | 46 | @classmethod 47 | def describeErrors(cls): 48 | return [] 49 | 50 | @classmethod 51 | def describeReturn(cls): 52 | return {"type": "DynamicFrame", 53 | "description": "Dynamic Frame corresponding to name"} 54 | 55 | class MapToCollection(GlueTransform): 56 | 57 | def __call__(self, dfc, callable, transformation_ctx = ""): 58 | return dfc.map(callable, transformation_ctx) 59 | 60 | @classmethod 61 | def describeArgs(cls): 62 | arg1 = {"name": "dfc", 63 | "type": "CollectionDynamicFrame", 64 | "description": "apply function on this DynamicFrameCollection", 65 | "optional": False, 66 | "defaultValue": None} 67 | 68 | arg2 = {"name": "callable", 69 | "type": "Callable", 70 | "description": "apply this Callable on DynamicFrameCollection", 71 | "optional": False, 72 | "defaultValue": None} 73 | 74 | arg3 = {"name": "transformation_ctx", 75 | "type": "String", 76 | "description": "A unique string that is used to identify stats / state information", 77 | "optional": True, 78 | "defaultValue": ""} 79 | 80 | return [arg1, arg2, arg3] 81 | 82 | @classmethod 83 | def describeTransform(cls): 84 | return "Apply a transform on each DynamicFrame of this DynamicFrameCollection" 85 | 86 | @classmethod 87 | def describeErrors(cls): 88 | return [] 89 | 90 | @classmethod 91 | def describeReturn(cls): 92 | return {"type": "DynamicFrameCollection", 93 | "description": "A new DynamicFrameCollection after apply transform on each element"} 94 | 95 | 96 | class FlatMap(GlueTransform): 97 | 98 | def __call__(self, dfc, BaseTransform, frame_name, transformation_ctx = "", **base_kwargs): 99 | args = {} 100 | 101 | def apply_inner(frame, transformation_ctx): 102 | args.clear() 103 | args.update(base_kwargs) 104 | args[frame_name] = frame 105 | args["transformation_ctx"] = transformation_ctx 106 | return BaseTransform.apply(**args) 107 | 108 | return dfc.flatmap(apply_inner, transformation_ctx) 109 | 110 | @classmethod 111 | def describeArgs(cls): 112 | arg1 = {"name": "dfc", 113 | "type": "DynamicFrameCollection", 114 | "description": "The collection over which to flatmap.", 115 | "optional": False, 116 | "defaultValue": None} 117 | 118 | arg2 = {"name": "BaseTransform", 119 | "type": "GlueTransform", 120 | "description": "A GlueTransform to apply to each member of the collection.", 121 | "optional": False, 122 | "defaultValue": None} 123 | 124 | arg3 = {"name": "frame_name", 125 | "type": "String", 126 | "description": "The argument name to which to pass the elements of the collection.", 127 | "optional": False, 128 | "defaultValue": 
None} 129 | 130 | arg4 = {"name": "transformation_ctx", 131 | "type": "String", 132 | "description": "A unique string that is used to identify stats / state information", 133 | "optional": True, 134 | "defaultValue": ""} 135 | 136 | arg5 = {"name": "base_kwargs", 137 | "type": "dict", 138 | "description": "Arguments to pass to the base transform.", 139 | "optional": False, 140 | "defaultValue": None} 141 | 142 | return [arg1, arg2, arg3, arg4, arg5] 143 | 144 | @classmethod 145 | def describeTransform(cls): 146 | return "Applies a transform to each DynamicFrame in a collection and flattens the results." 147 | 148 | @classmethod 149 | def describeErrors(cls): 150 | return [] 151 | 152 | @classmethod 153 | def describeReturn(cls): 154 | return {"type": "DynamicFrameCollection", 155 | "description": "A new DynamicFrameCollection after applying the transform on each element"} 156 | -------------------------------------------------------------------------------- /awsglue/transforms/drop_nulls.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # Licensed under the Amazon Software License (the "License"). You may not use 3 | # this file except in compliance with the License. A copy of the License is 4 | # located at 5 | # 6 | # http://aws.amazon.com/asl/ 7 | # 8 | # or in the "license" file accompanying this file. This file is distributed 9 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express 10 | # or implied. See the License for the specific language governing 11 | # permissions and limitations under the License. 12 | 13 | from __future__ import print_function 14 | from awsglue.transforms import DropFields, GlueTransform 15 | from awsglue.gluetypes import ArrayType, NullType, StructType 16 | 17 | class DropNullFields(GlueTransform): 18 | def _find_null_fields(self, ctx, schema, path, output): 19 | if isinstance(schema, StructType): 20 | for field in schema: 21 | new_path = path + "." if path != "" else path 22 | self._find_null_fields(ctx, field.dataType, new_path + ctx._jvm.RecordUtils.quoteName(field.name), output) 23 | 24 | elif isinstance(schema, ArrayType): 25 | # For the moment we only remove null fields in nested array columns. 26 | # We don't change ArrayType(NullType). 
27 | if isinstance(schema.elementType, StructType): 28 | self._find_null_fields(ctx, schema.elementType, path, output) 29 | 30 | elif isinstance(schema, NullType): 31 | output.append(path) 32 | 33 | # Note: dropFields currently does not work through maps, 34 | # so neither does DropNullFields 35 | 36 | def __call__(self, frame, transformation_ctx = "", info = "", stageThreshold = 0, totalThreshold = 0): 37 | null_fields = [] 38 | self._find_null_fields(frame.glue_ctx, frame.schema(), "", null_fields) 39 | print("null_fields", null_fields) 40 | 41 | return DropFields.apply(frame, null_fields, transformation_ctx, 42 | info, stageThreshold, totalThreshold) 43 | 44 | @classmethod 45 | def describeArgs(cls): 46 | arg1 = {"name": "frame", 47 | "type": "DynamicFrame", 48 | "description": "Drop all null fields in this DynamicFrame", 49 | "optional": False, 50 | "defaultValue": None} 51 | arg2 = {"name": "transformation_ctx", 52 | "type": "String", 53 | "description": "A unique string that is used to identify stats / state information", 54 | "optional": True, 55 | "defaultValue": ""} 56 | arg3 = {"name": "info", 57 | "type": "String", 58 | "description": "Any string to be associated with errors in the transformation", 59 | "optional": True, 60 | "defaultValue": "\"\""} 61 | arg4 = {"name": "stageThreshold", 62 | "type": "Integer", 63 | "description": "Max number of errors in the transformation until processing will error out", 64 | "optional": True, 65 | "defaultValue": "0"} 66 | arg5 = {"name": "totalThreshold", 67 | "type": "Integer", 68 | "description": "Max number of errors total until processing will error out.", 69 | "optional": True, 70 | "defaultValue": "0"} 71 | 72 | return [arg1, arg2, arg3, arg4, arg5] 73 | 74 | @classmethod 75 | def describeTransform(cls): 76 | return "Drop all null fields in this DynamicFrame" 77 | 78 | @classmethod 79 | def describeErrors(cls): 80 | return [] 81 | 82 | @classmethod 83 | def describeReturn(cls): 84 | return {"type": "DynamicFrame", 85 | "description": "DynamicFrame without null fields."} 86 | -------------------------------------------------------------------------------- /awsglue/transforms/dynamicframe_filter.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # Licensed under the Amazon Software License (the "License"). You may not use 3 | # this file except in compliance with the License. A copy of the License is 4 | # located at 5 | # 6 | # http://aws.amazon.com/asl/ 7 | # 8 | # or in the "license" file accompanying this file. This file is distributed 9 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express 10 | # or implied. See the License for the specific language governing 11 | # permissions and limitations under the License. 12 | 13 | from awsglue.transforms import GlueTransform 14 | 15 | 16 | class Filter(GlueTransform): 17 | def __call__(self, frame, f, transformation_ctx = "", info="", stageThreshold=0, totalThreshold=0): 18 | return frame.filter(f, transformation_ctx, info, stageThreshold, totalThreshold) 19 | 20 | @classmethod 21 | def describeArgs(cls): 22 | arg1 = {"name": "frame", 23 | "type": "DynamicFrame", 24 | "description": "The DynamicFrame to apply the Filter function", 25 | "optional": False, 26 | "defaultValue": None} 27 | arg2 = {"name": "f", 28 | "type": "Function", 29 | "description": "Predicate function to call on the DynamicFrame. 
The function takes DynamicRecord as the argument and returns True/False", 30 | "optional": False, 31 | "defaultValue": None} 32 | arg3 = {"name": "transformation_ctx", 33 | "type": "String", 34 | "description": "A unique string that is used to identify stats / state information", 35 | "optional": True, 36 | "defaultValue": ""} 37 | arg4 = {"name": "info", 38 | "type": "String", 39 | "description": "Any string to be associated with errors in the transformation", 40 | "optional": True, 41 | "defaultValue": "\"\""} 42 | arg5 = {"name": "stageThreshold", 43 | "type": "Integer", 44 | "description": "Max number of errors in the transformation until processing will error out", 45 | "optional": True, 46 | "defaultValue": "0"} 47 | arg6 = {"name": "totalThreshold", 48 | "type": "Integer", 49 | "description": "Max number of errors total until processing will error out.", 50 | "optional": True, 51 | "defaultValue": "0"} 52 | 53 | return [arg1, arg2, arg3, arg4, arg5, arg6] 54 | 55 | @classmethod 56 | def describeTransform(cls): 57 | return "Builds a new DynamicFrame by selecting records from the input frame that satisfy the predicate function" 58 | 59 | @classmethod 60 | def describeErrors(cls): 61 | return [] 62 | 63 | @classmethod 64 | def describeReturn(cls): 65 | return {"type": "DynamicFrame", 66 | "description": "new DynamicFrame with DynamicRecords that matched the predicate"} -------------------------------------------------------------------------------- /awsglue/transforms/dynamicframe_map.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # Licensed under the Amazon Software License (the "License"). You may not use 3 | # this file except in compliance with the License. A copy of the License is 4 | # located at 5 | # 6 | # http://aws.amazon.com/asl/ 7 | # 8 | # or in the "license" file accompanying this file. This file is distributed 9 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express 10 | # or implied. See the License for the specific language governing 11 | # permissions and limitations under the License. 12 | 13 | from awsglue.transforms import GlueTransform 14 | 15 | 16 | class Map(GlueTransform): 17 | def __call__(self, frame, f, preservesPartitioning = False,transformation_ctx = "", info="", stageThreshold=0, totalThreshold=0): 18 | return frame.map(f, preservesPartitioning, transformation_ctx, info, stageThreshold, totalThreshold) 19 | 20 | @classmethod 21 | def describeArgs(cls): 22 | arg1 = {"name": "frame", 23 | "type": "DynamicFrame", 24 | "description": "The DynamicFrame to apply the Map function", 25 | "optional": False, 26 | "defaultValue": None} 27 | arg2 = {"name": "f", 28 | "type": "Function", 29 | "description": "Function to apply on records in the DynamicFrame. 
The function takes a DynamicRecord as an argument and returns a DynamicRecord", 30 | "optional": False, 31 | "defaultValue": None} 32 | arg3 = {"name": "preservesPartitioning", 33 | "type": "Boolean", 34 | "description": "Whether to preserve the partitioning in the DynamicFrame.", 35 | "optional": True, 36 | "defaultValue": False} 37 | arg4 = {"name": "transformation_ctx", 38 | "type": "String", 39 | "description": "A unique string that is used to identify stats / state information", 40 | "optional": True, 41 | "defaultValue": ""} 42 | arg5 = {"name": "info", 43 | "type": "String", 44 | "description": "Any string to be associated with errors in the transformation", 45 | "optional": True, 46 | "defaultValue": "\"\""} 47 | arg6 = {"name": "stageThreshold", 48 | "type": "Integer", 49 | "description": "Max number of errors in the transformation until processing will error out", 50 | "optional": True, 51 | "defaultValue": "0"} 52 | arg7 = {"name": "totalThreshold", 53 | "type": "Integer", 54 | "description": "Max number of errors total until processing will error out.", 55 | "optional": True, 56 | "defaultValue": "0"} 57 | 58 | return [arg1, arg2, arg3, arg4, arg5, arg6, arg7] 59 | 60 | 61 | @classmethod 62 | def describeTransform(cls): 63 | return "Builds a new DynamicFrame by applying a function to all records in the input DynamicFrame" 64 | 65 | @classmethod 66 | def describeErrors(cls): 67 | return [] 68 | 69 | @classmethod 70 | def describeReturn(cls): 71 | return {"type": "DynamicFrame", 72 | "description": "New DynamicFrame with DynamicRecords as a result of a function"} -------------------------------------------------------------------------------- /awsglue/transforms/errors_as_dynamicframe.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # Licensed under the Amazon Software License (the "License"). You may not use 3 | # this file except in compliance with the License. A copy of the License is 4 | # located at 5 | # 6 | # http://aws.amazon.com/asl/ 7 | # 8 | # or in the "license" file accompanying this file. This file is distributed 9 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express 10 | # or implied. See the License for the specific language governing 11 | # permissions and limitations under the License. 12 | 13 | from awsglue.transforms import GlueTransform 14 | 15 | class ErrorsAsDynamicFrame(GlueTransform): 16 | 17 | def __call__(self, frame): 18 | """ 19 | Returns a DynamicFrame which has error records leading up to the source DynmaicFrame, nested in the returned DynamicFrame. 
20 | 21 | :param frame: Source dynamicFrame 22 | """ 23 | return frame.errorsAsDynamicFrame() 24 | 25 | @classmethod 26 | def describeArgs(cls): 27 | arg1 = {"name": "frame", 28 | "type": "DynamicFrame", 29 | "description": "The DynamicFrame on which to call errorsAsDynamicFrame", 30 | "optional": False, 31 | "defaultValue": None} 32 | return [arg1] 33 | 34 | @classmethod 35 | def describeTransform(cls): 36 | return "Get error records leading up to the source DynmaicFrame" 37 | 38 | @classmethod 39 | def describeErrors(cls): 40 | return [] 41 | 42 | @classmethod 43 | def describeReturn(cls): 44 | return {"type": "DynamicFrame", 45 | "description": "new DynamicFrame with error DynamicRecords"} 46 | -------------------------------------------------------------------------------- /awsglue/transforms/field_transforms.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # Licensed under the Amazon Software License (the "License"). You may not use 3 | # this file except in compliance with the License. A copy of the License is 4 | # located at 5 | # 6 | # http://aws.amazon.com/asl/ 7 | # 8 | # or in the "license" file accompanying this file. This file is distributed 9 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express 10 | # or implied. See the License for the specific language governing 11 | # permissions and limitations under the License. 12 | 13 | from awsglue.transforms import GlueTransform 14 | 15 | class RenameField(GlueTransform): 16 | """ 17 | Rename a node within a DynamicFrame 18 | 19 | :param frame: DynamicFrame 20 | :param oldName: String, full path to the node you want to rename 21 | :param newName: String, new name including full path 22 | :param info: String, any string to be associated with errors in this transformation. 23 | :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out. 24 | :param totalThreshold: Long, total number of errors upto and including in this transformation 25 | for which the processing needs to error out. 
26 | :return: DynamicFrame 27 | """ 28 | 29 | def __call__(self, frame, old_name, new_name, transformation_ctx = "", info = "", stageThreshold = 0, totalThreshold = 0): 30 | return frame.rename_field(old_name, new_name, transformation_ctx, info, stageThreshold, totalThreshold) 31 | 32 | @classmethod 33 | def describeArgs(cls): 34 | arg1 = {"name": "frame", 35 | "type": "DynamicFrame", 36 | "description": "The DynamicFrame on which to rename a field", 37 | "optional": False, 38 | "defaultValue": None} 39 | arg2 = {"name": "old_name", 40 | "type": "String", 41 | "description": "Full path to the node to rename", 42 | "optional": False, 43 | "defaultValue": None} 44 | arg3 = {"name": "new_name", 45 | "type": "String", 46 | "description": "New name, including full path", 47 | "optional": False, 48 | "defaultValue": None} 49 | arg4 = {"name": "transformation_ctx", 50 | "type": "String", 51 | "description": "A unique string that is used to identify stats / state information", 52 | "optional": True, 53 | "defaultValue": ""} 54 | arg5 = {"name": "info", 55 | "type": "String", 56 | "description": "Any string to be associated with errors in the transformation", 57 | "optional": True, 58 | "defaultValue": "\"\""} 59 | arg6 = {"name": "stageThreshold", 60 | "type": "Integer", 61 | "description": "Max number of errors in the transformation until processing will error out", 62 | "optional": True, 63 | "defaultValue": "0"} 64 | arg7 = {"name": "totalThreshold", 65 | "type": "Integer", 66 | "description": "Max number of errors total until processing will error out.", 67 | "optional": True, 68 | "defaultValue": "0"} 69 | 70 | return [arg1, arg2, arg3, arg4, arg5, arg6, arg7] 71 | 72 | @classmethod 73 | def describeTransform(cls): 74 | return "Rename a node within a DynamicFrame" 75 | 76 | @classmethod 77 | def describeErrors(cls): 78 | return [] 79 | 80 | @classmethod 81 | def describeReturn(cls): 82 | return {"type": "DynamicFrame", 83 | "description": "new DynamicFrame with indicated field renamed"} 84 | 85 | 86 | class DropFields(GlueTransform): 87 | """ 88 | Drop fields within a DynamicFrame 89 | 90 | :param frame: DynamicFrame 91 | :param paths: List of Strings, each the full path to a node you want to drop 92 | :param info: String, any string to be associated with errors in this transformation. 93 | :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out. 94 | :param totalThreshold: Long, total number of errors upto and including in this transformation 95 | for which the processing needs to error out. 
96 | :return: DynamicFrame 97 | """ 98 | 99 | def __call__(self, frame, paths, transformation_ctx = "", info = "", stageThreshold = 0, totalThreshold = 0): 100 | return frame.drop_fields(paths, transformation_ctx, info, stageThreshold, totalThreshold) 101 | 102 | @classmethod 103 | def describeArgs(cls): 104 | arg1 = {"name": "frame", 105 | "type": "DynamicFrame", 106 | "description": "The DynamicFrame from which to drop fields", 107 | "optional": False, 108 | "defaultValue": None} 109 | arg2 = {"name": "paths", 110 | "type": "List[String]", 111 | "description": "full paths corresponding to nodes to drop", 112 | "optional": False, 113 | "defaultValue": None} 114 | arg3 = {"name": "transformation_ctx", 115 | "type": "String", 116 | "description": "A unique string that is used to identify stats / state information", 117 | "optional": True, 118 | "defaultValue": ""} 119 | arg4 = {"name": "info", 120 | "type": "String", 121 | "description": "Any string to be associated with errors in the transformation", 122 | "optional": True, 123 | "defaultValue": "\"\""} 124 | arg5 = {"name": "stageThreshold", 125 | "type": "Integer", 126 | "description": "Max number of errors in the transformation until processing will error out", 127 | "optional": True, 128 | "defaultValue": "0"} 129 | arg6 = {"name": "totalThreshold", 130 | "type": "Integer", 131 | "description": "Max number of errors total until processing will error out.", 132 | "optional": True, 133 | "defaultValue": "0"} 134 | 135 | return [arg1, arg2, arg3, arg4, arg5, arg6] 136 | 137 | @classmethod 138 | def describeTransform(cls): 139 | return "Drop fields from a DynamicFrame" 140 | 141 | @classmethod 142 | def describeErrors(cls): 143 | return [] 144 | 145 | @classmethod 146 | def describeReturn(cls): 147 | return {"type": "DynamicFrame", 148 | "description": "new DynamicFrame without indicated fields"} 149 | 150 | 151 | class SelectFields(GlueTransform): 152 | """ 153 | Get fields within a DynamicFrame 154 | 155 | :param frame: DynamicFrame 156 | :param paths: List of Strings, each the full path to a node you want to get 157 | :param info: String, any string to be associated with errors in this transformation. 158 | :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out. 159 | :param totalThreshold: Long, total number of errors upto and including in this transformation 160 | for which the processing needs to error out. 
161 | :return: DynamicFrame 162 | """ 163 | 164 | def __call__(self, frame, paths, transformation_ctx = "", info = "", stageThreshold = 0, totalThreshold = 0): 165 | return frame.select_fields(paths, transformation_ctx, info, stageThreshold, totalThreshold) 166 | 167 | @classmethod 168 | def describeArgs(cls): 169 | arg1 = {"name": "frame", 170 | "type": "DynamicFrame", 171 | "description": "The DynamicFrame from which to select fields", 172 | "optional": False, 173 | "defaultValue": None} 174 | arg2 = {"name": "paths", 175 | "type": "List[String]", 176 | "description": "full paths corresponding to nodes to select", 177 | "optional": False, 178 | "defaultValue": None} 179 | arg3 = {"name": "transformation_ctx", 180 | "type": "String", 181 | "description": "A unique string that is used to identify stats / state information", 182 | "optional": True, 183 | "defaultValue": ""} 184 | arg4 = {"name": "info", 185 | "type": "String", 186 | "description": "Any string to be associated with errors in the transformation", 187 | "optional": True, 188 | "defaultValue": "\"\""} 189 | arg5 = {"name": "stageThreshold", 190 | "type": "Integer", 191 | "description": "Max number of errors in the transformation until processing will error out", 192 | "optional": True, 193 | "defaultValue": "0"} 194 | arg6 = {"name": "totalThreshold", 195 | "type": "Integer", 196 | "description": "Max number of errors total until processing will error out.", 197 | "optional": True, 198 | "defaultValue": "0"} 199 | 200 | return [arg1, arg2, arg3, arg4, arg5, arg6] 201 | 202 | @classmethod 203 | def describeTransform(cls): 204 | return "Select fields from a DynamicFrame" 205 | 206 | @classmethod 207 | def describeErrors(cls): 208 | return [] 209 | 210 | @classmethod 211 | def describeReturn(cls): 212 | return {"type": "DynamicFrame", 213 | "description": "new DynamicFrame with only indicated fields"} 214 | 215 | 216 | class SplitFields(GlueTransform): 217 | """ 218 | Split fields within a DynamicFrame 219 | 220 | :param frame: DynamicFrame 221 | :param paths: List of Strings, each the full path to a node that you would like 222 | to split into a new frame 223 | :param info: String, any string to be associated with errors in this transformation. 224 | :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out. 225 | :param totalThreshold: Long, total number of errors upto and including in this transformation 226 | for which the processing needs to error out. 227 | :return: DynamicFrameCollection with two Dynamic Frames, the first containing all the fields that you have 228 | split off, and the second containing the remaining fields 229 | """ 230 | 231 | def __call__(self, frame, paths, name1 = None, name2 = None, transformation_ctx = "", info = "", stageThreshold = 0, totalThreshold = 0): 232 | # Incorporate the existing DynamicFrame name into the new names. 
233 | frame_name = frame.name if len(frame.name) > 0 else "frame" 234 | 235 | if name1 == None: 236 | name1 = frame_name + "1" 237 | if name2 == None: 238 | name2 = frame_name + "2" 239 | 240 | return frame.split_fields(paths, name1, name2, transformation_ctx, info, stageThreshold, totalThreshold) 241 | 242 | @classmethod 243 | def describeArgs(cls): 244 | arg1 = {"name": "frame", 245 | "type": "DynamicFrame", 246 | "description": "DynamicFrame from which to split fields", 247 | "optional": False, 248 | "defaultValue": None} 249 | arg2 = {"name": "paths", 250 | "type": "List[String]", 251 | "description": "full paths corresponding to nodes to split into new DynamicFrame", 252 | "optional": False, 253 | "defaultValue": None} 254 | arg3 = {"name": "frame1", 255 | "type": "String", 256 | "description": "name for the dynamic frame to be split off", 257 | "optional": True, 258 | "defaultValue": "frame1"} 259 | arg4 = {"name": "frame2", 260 | "type": "String", 261 | "description": "name for the dynamic frame remains on original", 262 | "optional": True, 263 | "defaultValue": "frame2"} 264 | arg5 = {"name": "transformation_ctx", 265 | "type": "String", 266 | "description": "A unique string that is used to identify stats / state information", 267 | "optional": True, 268 | "defaultValue": ""} 269 | arg6 = {"name": "info", 270 | "type": "String", 271 | "description": "Any string to be associated with errors in the transformation", 272 | "optional": True, 273 | "defaultValue": "\"\""} 274 | arg7 = {"name": "stageThreshold", 275 | "type": "Integer", 276 | "description": "Max number of errors in the transformation until processing will error out", 277 | "optional": True, 278 | "defaultValue": "0"} 279 | arg8 = {"name": "totalThreshold", 280 | "type": "Integer", 281 | "description": "Max number of errors total until processing will error out.", 282 | "optional": True, 283 | "defaultValue": "0"} 284 | 285 | return [arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8] 286 | 287 | @classmethod 288 | def describeTransform(cls): 289 | return "Split fields within a DynamicFrame" 290 | 291 | @classmethod 292 | def describeErrors(cls): 293 | return [] 294 | 295 | @classmethod 296 | def describeReturn(cls): 297 | desc = "[new DynamicFrame with only indicated fields, new DynamicFrame without indicated fields]" 298 | return {"type": "DynamicFrameCollection", 299 | "description": desc} 300 | 301 | class SplitRows(GlueTransform): 302 | """ 303 | Split rows within a DynamicFrame 304 | 305 | :param frame: DynamicFrame 306 | :param comparison_dict: a dictionary where the key is the path to a column, 307 | the the value is another dictionary maping comparators to the value to which the column 308 | will be compared, e.g. {"age": {">": 10, "<": 20}} will give back rows where age between 10 and 20 309 | exclusive split from rows that do not meet this criteria 310 | :param info: String, any string to be associated with errors in this transformation. 311 | :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out. 312 | :param totalThreshold: Long, total number of errors upto and including in this transformation 313 | for which the processing needs to error out. 
314 | :return: A DynamicFrameCollection with two Dynamic Frames, the first containing all the rows that you have 315 | split off, and the second containing the remaining rows 316 | """ 317 | 318 | def __call__(self, frame, comparison_dict, name1 = "frame1", name2 = "frame2", transformation_ctx = "", info = None, stageThreshold = 0, totalThreshold = 0): 319 | info = info or "" 320 | return frame.split_rows(comparison_dict, name1, name2, transformation_ctx, info, stageThreshold, totalThreshold) 321 | 322 | @classmethod 323 | def describeArgs(cls): 324 | arg1 = {"name": "frame", 325 | "type": "DynamicFrame", 326 | "description": "DynamicFrame from which to split rows", 327 | "optional": False, 328 | "defaultValue": None} 329 | arg2 = {"name": "comparison_dict", 330 | "type": "Dictionary, {String 'path to node': {String 'operator': String or Integer 'value'}}", 331 | "description": "{paths to columns: {comparators: value to which each the column will be compared.}}\ 332 | Example: {'age': {'>': 10, '<': 20}} will give back rows where age is between 10 and 20 exclusive, \ 333 | and rows where this criteria is not met", 334 | "optional": False, 335 | "defaultValue": None} 336 | arg3 = {"name": "frame1", 337 | "type": "String", 338 | "description": "name for the dynamic frame to be split off", 339 | "optional": True, 340 | "defaultValue": "frame1"} 341 | arg4 = {"name": "frame2", 342 | "type": "String", 343 | "description": "name for the dynamic frame remains on original", 344 | "optional": True, 345 | "defaultValue": "frame2"} 346 | arg5 = {"name": "transformation_ctx", 347 | "type": "String", 348 | "description": "A unique string that is used to identify stats / state information", 349 | "optional": True, 350 | "defaultValue": ""} 351 | arg6 = {"name": "info", 352 | "type": "String", 353 | "description": "Any string to be associated with errors in the transformation", 354 | "optional": True, 355 | "defaultValue": None} 356 | arg7 = {"name": "stageThreshold", 357 | "type": "Integer", 358 | "description": "Max number of errors in the transformation until processing will error out", 359 | "optional": True, 360 | "defaultValue": "0"} 361 | arg8 = {"name": "totalThreshold", 362 | "type": "Integer", 363 | "description": "Max number of errors total until processing will error out.", 364 | "optional": True, 365 | "defaultValue": "0"} 366 | 367 | 368 | return [arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8] 369 | 370 | @classmethod 371 | def describeTransform(cls): 372 | return "Split rows within a DynamicFrame based on comparators" 373 | 374 | @classmethod 375 | def describeErrors(cls): 376 | return [] 377 | 378 | @classmethod 379 | def describeReturn(cls): 380 | desc = "DynamicFrameCollection[new DynamicFrame with only indicated rows, new DynamicFrame without indicated rows]" 381 | return {"type": "DynamicFrameCollection", 382 | "description": desc} 383 | 384 | class Join(GlueTransform): 385 | 386 | def __call__(self, frame1, frame2, keys1, keys2, transformation_ctx = ""): 387 | return frame1.join(keys1, keys2, frame2) 388 | 389 | @classmethod 390 | def describeArgs(cls): 391 | arg1 = {"name": "frame1", 392 | "type": "DynamicFrame", 393 | "description": "join this DynamicFrame", 394 | "optional": False, 395 | "defaultValue": None} 396 | 397 | arg1 = {"name": "frame2", 398 | "type": "DynamicFrame", 399 | "description": "join with this DynamicFrame", 400 | "optional": False, 401 | "defaultValue": None} 402 | 403 | arg2 = {"name": "keys1", 404 | "type": "String", 405 | "description": "The keys to join on 
for the first frame", 406 | "optional": False, 407 | "defaultValue": None} 408 | 409 | arg3 = {"name": "keys2", 410 | "type": "String", 411 | "description": "The keys to join on for the second frame", 412 | "optional": False, 413 | "defaultValue": None} 414 | 415 | return [arg1, arg2, arg3, arg4] 416 | 417 | @classmethod 418 | def describeTransform(cls): 419 | return "equality join two dynamic frames DynamicFrames" 420 | 421 | @classmethod 422 | def describeErrors(cls): 423 | return [] 424 | 425 | @classmethod 426 | def describeReturn(cls): 427 | return {"type": "DynamicFrame", 428 | "description": "DynamicFrame obtained by joining two frames"} 429 | 430 | 431 | class Spigot(GlueTransform): 432 | 433 | def __call__(self, frame, path, options, transformation_ctx = ""): 434 | return frame.spigot(path,options,transformation_ctx) 435 | 436 | @classmethod 437 | def describeArgs(cls): 438 | arg1 = {"name": "frame", 439 | "type": "DynamicFrame", 440 | "description": "spigot this DynamicFrame", 441 | "optional": False, 442 | "defaultValue": None} 443 | 444 | arg2 = {"name": "path", 445 | "type": "string", 446 | "description": "file path to write spigot", 447 | "optional": False, 448 | "defaultValue": None} 449 | 450 | arg3 = {"name": "options", 451 | "type": "Json", 452 | "description": "topk -> first k records, prob -> probability of picking any record", 453 | "optional": True, 454 | "defaultValue": None} 455 | 456 | return [arg1, arg2, arg3] 457 | 458 | @classmethod 459 | def describeTransform(cls): 460 | return "write sample records to path destination mid transformation" 461 | 462 | @classmethod 463 | def describeErrors(cls): 464 | return [] 465 | 466 | @classmethod 467 | def describeReturn(cls): 468 | return {"type": "DynamicFrame", 469 | "description": "DynamicFrame is the same as the infput dynamicFrame with an additional write step"} 470 | -------------------------------------------------------------------------------- /awsglue/transforms/relationalize.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # Licensed under the Amazon Software License (the "License"). You may not use 3 | # this file except in compliance with the License. A copy of the License is 4 | # located at 5 | # 6 | # http://aws.amazon.com/asl/ 7 | # 8 | # or in the "license" file accompanying this file. This file is distributed 9 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express 10 | # or implied. See the License for the specific language governing 11 | # permissions and limitations under the License. 12 | 13 | from awsglue.transforms import GlueTransform 14 | from awsglue.utils import _global_args 15 | 16 | class Relationalize(GlueTransform): 17 | """ 18 | Relationalizes a dynamic frame. i.e. produces a list of frames that are 19 | generated by unnesting nested columns and pivoting array columns. The 20 | pivoted array column can be joined to the root table using the joinkey 21 | generated in unnest phase 22 | :param frame: DynamicFrame to relationalize 23 | :param staging_path: path to store partitions of pivoted tables in csv format. 
Pivoted tables are read back from 24 | this path 25 | :param name: name for the root table 26 | :param options: dict of optional parameters for relationalize 27 | :param transformation_ctx: context key to retrieve metadata about the current transformation 28 | :param info: String, any string to be associated with errors in this transformation. 29 | :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out. 30 | :param totalThreshold: Long, total number of errors upto and including in this transformation 31 | for which the processing needs to error out. 32 | :return: DynamicFrameCollection 33 | """ 34 | 35 | # TODO: Make staging_path a mandatory argument 36 | def __call__(self, frame, staging_path=None, name='roottable', options=None, transformation_ctx = "", info="", 37 | stageThreshold=0, totalThreshold=0): 38 | options = options or {} 39 | # TODO: Remove special handling of staging_path and make it mandatory after TempDir is made a mandatory argument 40 | # We are directly accessing the args variable assuming that it is available in the global scope. This is to 41 | # maintain backward compatibility with the relationalize call that did not have the mandatory staging_path arg 42 | if staging_path is None: 43 | if _global_args['TempDir'] is not None and _global_args['TempDir'] != "": 44 | staging_path = _global_args['TempDir'] 45 | else: 46 | raise RuntimeError("Unable to set staging_path using args "+str(_global_args)) 47 | return frame.relationalize(name, staging_path, options, transformation_ctx, info, stageThreshold, totalThreshold) 48 | 49 | @classmethod 50 | def describeArgs(cls): 51 | arg1 = {"name": "frame", 52 | "type": "DynamicFrame", 53 | "description": "The DynamicFrame to relationalize", 54 | "optional": False, 55 | "defaultValue": None} 56 | arg2 = {"name": "staging_path", 57 | "type": "String", 58 | "description": "path to store partitions of pivoted tables in csv format", 59 | "optional": True, 60 | "defaultValue": None} 61 | arg3 = {"name": "name", 62 | "type": "String", 63 | "description": "Name of the root table", 64 | "optional": True, 65 | "defaultValue": "roottable"} 66 | arg4 = {"name": "options", 67 | "type": "Dictionary", 68 | "description": "dict of optional parameters for relationalize", 69 | "optional": True, 70 | "defaultValue": "{}"} 71 | arg5 = {"name": "transformation_ctx", 72 | "type": "String", 73 | "description": "A unique string that is used to identify stats / state information", 74 | "optional": True, 75 | "defaultValue": ""} 76 | arg6 = {"name": "info", 77 | "type": "String", 78 | "description": "Any string to be associated with errors in the transformation", 79 | "optional": True, 80 | "defaultValue": "\"\""} 81 | arg7 = {"name": "stageThreshold", 82 | "type": "Integer", 83 | "description": "Max number of errors in the transformation until processing will error out", 84 | "optional": True, 85 | "defaultValue": "0"} 86 | arg8 = {"name": "totalThreshold", 87 | "type": "Integer", 88 | "description": "Max number of errors total until processing will error out.", 89 | "optional": True, 90 | "defaultValue": "0"} 91 | 92 | return [arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8] 93 | 94 | @classmethod 95 | def describeTransform(cls): 96 | return "Flatten nested schema and pivot out array columns from the flattened frame" 97 | 98 | @classmethod 99 | def describeErrors(cls): 100 | return [] 101 | 102 | @classmethod 103 | def describeReturn(cls): 104 | return {"type": "DynamicFrameCollection", 105 | 
"description": "DynamicFrameCollection resulting from Relationalize call"} 106 | -------------------------------------------------------------------------------- /awsglue/transforms/repartition.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # Licensed under the Amazon Software License (the "License"). You may not use 3 | # this file except in compliance with the License. A copy of the License is 4 | # located at 5 | # 6 | # http://aws.amazon.com/asl/ 7 | # 8 | # or in the "license" file accompanying this file. This file is distributed 9 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express 10 | # or implied. See the License for the specific language governing 11 | # permissions and limitations under the License. 12 | 13 | from awsglue.transforms import GlueTransform 14 | 15 | class Repartition(GlueTransform): 16 | def __call__(self, frame, num_partitions, transformation_ctx = "", info = "", 17 | stageThreshold = 0, totalThreshold = 0): 18 | return frame.repartition(num_partitions, transformation_ctx, info, stageThreshold, totalThreshold) 19 | 20 | @classmethod 21 | def describeArgs(cls): 22 | arg1 = {"name": "num_partitions", 23 | "type": "DynamicFrame", 24 | "description": "Number of partitions", 25 | "optional": False, 26 | "defaultValue": None} 27 | arg2 = {"name": "transformation_ctx", 28 | "type": "String", 29 | "description": "A unique string that is used to identify stats / state information", 30 | "optional": True, 31 | "defaultValue": ""} 32 | arg3 = {"name": "info", 33 | "type": "String", 34 | "description": "Any string to be associated with errors in the transformation", 35 | "optional": True, 36 | "defaultValue": "\"\""} 37 | arg4 = {"name": "stageThreshold", 38 | "type": "Integer", 39 | "description": "Max number of errors in the transformation until processing will error out", 40 | "optional": True, 41 | "defaultValue": "0"} 42 | arg5 = {"name": "totalThreshold", 43 | "type": "Integer", 44 | "description": "Max number of errors total until processing will error out.", 45 | "optional": True, 46 | "defaultValue": "0"} 47 | 48 | return [arg1, arg2, arg3, arg4, arg5] 49 | 50 | @classmethod 51 | def describeTransform(cls): 52 | return "Repartitions a DynamicFrame." 53 | 54 | @classmethod 55 | def describeErrors(cls): 56 | return [] 57 | 58 | @classmethod 59 | def describeReturn(cls): 60 | return {"type": "DynamicFrame", 61 | "description": "The repartitioned DynamicFrame."} 62 | -------------------------------------------------------------------------------- /awsglue/transforms/resolve_choice.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # Licensed under the Amazon Software License (the "License"). You may not use 3 | # this file except in compliance with the License. A copy of the License is 4 | # located at 5 | # 6 | # http://aws.amazon.com/asl/ 7 | # 8 | # or in the "license" file accompanying this file. This file is distributed 9 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express 10 | # or implied. See the License for the specific language governing 11 | # permissions and limitations under the License. 
12 |
13 | from awsglue.transforms import GlueTransform
14 |
15 | class ResolveChoice(GlueTransform):
16 |     def __call__(self, frame, specs=None, choice="", database=None, table_name=None, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0, catalog_id=None):
17 |         return frame.resolveChoice(specs, choice, database, table_name, transformation_ctx, info, stageThreshold, totalThreshold, catalog_id)
18 |
19 |     @classmethod
20 |     def describeArgs(cls):
21 |         arg1 = {"name": "frame",
22 |                 "type": "DynamicFrame",
23 |                 "description": "DynamicFrame to transform",
24 |                 "optional": False,
25 |                 "defaultValue": None}
26 |         arg2 = {"name": "specs",
27 |                 "type": "List",
28 |                 "description": "List of (path, action) specs, each resolving one choice column",
29 |                 "optional": True,
30 |                 "defaultValue": None}
31 |         arg3 = {"name": "choice",
32 |                 "type": "String",
33 |                 "description": "Resolve option applied to all choice columns",
34 |                 "optional": True,
35 |                 "defaultValue": ""}
36 |         arg4 = {"name": "database",
37 |                 "type": "String",
38 |                 "description": "Glue catalog database name, required for the MATCH_CATALOG choice",
39 |                 "optional": True,
40 |                 "defaultValue": None}
41 |         arg5 = {"name": "table_name",
42 |                 "type": "String",
43 |                 "description": "Glue catalog table name, required for the MATCH_CATALOG choice",
44 |                 "optional": True,
45 |                 "defaultValue": None}
46 |         arg6 = {"name": "transformation_ctx",
47 |                 "type": "String",
48 |                 "description": "A unique string that is used to identify stats / state information",
49 |                 "optional": True,
50 |                 "defaultValue": ""}
51 |         arg7 = {"name": "info",
52 |                 "type": "String",
53 |                 "description": "Any string to be associated with errors in the transformation",
54 |                 "optional": True,
55 |                 "defaultValue": "\"\""}
56 |         arg8 = {"name": "stageThreshold",
57 |                 "type": "Integer",
58 |                 "description": "Max number of errors in the transformation until processing will error out",
59 |                 "optional": True,
60 |                 "defaultValue": "0"}
61 |         arg9 = {"name": "totalThreshold",
62 |                 "type": "Integer",
63 |                 "description": "Max number of errors total until processing will error out.",
64 |                 "optional": True,
65 |                 "defaultValue": "0"}
66 |         arg10 = {"name": "catalog_id",
67 |                  "type": "String",
68 |                  "description": "Catalog id, used with the MATCH_CATALOG choice; defaults to the caller's account id",
69 |                  "optional": True,
70 |                  "defaultValue": "accountId"}
71 |
72 |         return [arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10]
73 |
74 |     @classmethod
75 |     def describeTransform(cls):
76 |         return "Resolve choice types in this DynamicFrame."
77 |
78 |     @classmethod
79 |     def describeErrors(cls):
80 |         return []
81 |
82 |     @classmethod
83 |     def describeReturn(cls):
84 |         return {"type": "DynamicFrame",
85 |                 "description": "DynamicFrame after resolving choice types."}
86 |
--------------------------------------------------------------------------------
/awsglue/transforms/transform.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # Licensed under the Amazon Software License (the "License"). You may not use
3 | # this file except in compliance with the License. A copy of the License is
4 | # located at
5 | #
6 | #  http://aws.amazon.com/asl/
7 | #
8 | # or in the "license" file accompanying this file. This file is distributed
9 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
10 | # or implied. See the License for the specific language governing
11 | # permissions and limitations under the License.
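# [Editor's sketch, not part of the original source] How the base class below is
# used: concrete transforms subclass GlueTransform and implement __call__; the
# classmethod apply() then instantiates the transform and forwards all arguments
# to it. A hypothetical no-op subclass:
#
#     class Identity(GlueTransform):
#         def __call__(self, frame):
#             return frame
#
#     out = Identity.apply(frame=dyf)   # equivalent to Identity()(frame=dyf)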
12 |
13 | class GlueTransform(object):
14 |     """Base class for all Glue Transforms.
15 |
16 |     All Glue transformations should inherit from GlueTransform and define a
17 |     __call__ method. They can optionally override the name classmethod or use
18 |     the default of the class name.
19 |     """
20 |
21 |     @classmethod
22 |     def apply(cls, *args, **kwargs):
23 |         transform = cls()
24 |         return transform(*args, **kwargs)
25 |
26 |     @classmethod
27 |     def name(cls):
28 |         return cls.__name__
29 |
30 |     @classmethod
31 |     def describeArgs(cls):
32 |         '''
33 |         Returns: a list of dictionaries, with each corresponding to
34 |         an argument, in the following format:
35 |         [{"name": "<name of argument>",
36 |           "type": "<type of argument>",
37 |           "description": "<description>",
38 |           "optional": "<True or False>",
39 |           "defaultValue": "<default value, if any>"}, ...]
40 |         Raises: NotImplementedError if not implemented by Transform
41 |         '''
42 |         raise NotImplementedError("describeArgs method not implemented for Transform {}".format(cls.__name__))
43 |
44 |     @classmethod
45 |     def describeReturn(cls):
46 |         '''
47 |         Returns: A dictionary with information about the return type,
48 |         in the following format:
49 |         {"type": "<return type>",
50 |          "description": "<description>"}
51 |         Raises: NotImplementedError if not implemented by Transform
52 |         '''
53 |         raise NotImplementedError("describeReturn method not implemented for Transform {}".format(cls.__name__))
54 |
55 |     @classmethod
56 |     def describeTransform(cls):
57 |         '''
58 |         Returns: A string describing the transform, e.g.
59 |         "Base class for all Glue Transforms"
60 |         Raises: NotImplementedError if not implemented by Transform
61 |         '''
62 |
63 |         raise NotImplementedError("describeTransform method not implemented for Transform {}".format(cls.__name__))
64 |
65 |     @classmethod
66 |     def describeErrors(cls):
67 |         '''
68 |         Returns: A list of dictionaries, each describing possible errors thrown by
69 |         this transform, in the following format:
70 |         [{"type": "<type of error>",
71 |           "description": "<description>"}]
72 |         Raises: NotImplementedError if not implemented by Transform
73 |         '''
74 |         raise NotImplementedError("describeErrors method not implemented for Transform {}".format(cls.__name__))
75 |
76 |     @classmethod
77 |     def describe(cls):
78 |         return {"transform": {"name": cls.name(),
79 |                               "args": cls.describeArgs(),
80 |                               "returns": cls.describeReturn(),
81 |                               "description": cls.describeTransform(),
82 |                               "raises": cls.describeErrors(),
83 |                               "location": "internal"}}
84 |
85 |     def __eq__(self, other):
86 |         return isinstance(other, self.__class__) and self.__dict__ == other.__dict__
87 |
88 |     def __hash__(self):
89 |         return hash(tuple(sorted(self.__dict__.items())))
90 |
91 |     def __repr__(self):
92 |         return "<Transform: {}>".format(self.name())
--------------------------------------------------------------------------------
/awsglue/transforms/unbox.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # Licensed under the Amazon Software License (the "License"). You may not use
3 | # this file except in compliance with the License. A copy of the License is
4 | # located at
5 | #
6 | #  http://aws.amazon.com/asl/
7 | #
8 | # or in the "license" file accompanying this file. This file is distributed
9 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
10 | # or implied. See the License for the specific language governing
11 | # permissions and limitations under the License.
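# [Editor's sketch, not part of the original source] Unbox parses a string field
# that itself contains serialized records. Assuming a DynamicFrame `dyf` with a
# JSON-encoded string column `payload`:
#
#     from awsglue.transforms import Unbox
#     unboxed = Unbox.apply(frame=dyf, path="payload", format="json")
#
# The result is a new DynamicFrame in which `payload` is a parsed struct whose
# fields can be addressed directly.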
12 |
13 | from awsglue.transforms import GlueTransform
14 |
15 | class Unbox(GlueTransform):
16 |
17 |     def __call__(self, frame, path, format, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0, **options):
18 |         """
19 |         unbox a string field
20 |
21 |         :param frame: DynamicFrame on which to call unbox
22 |         :param path: full path to the StringNode you want to unbox
23 |         :param format: "avro" or "json"
24 |         :param info: String, any string to be associated with errors in this transformation.
25 |         :param stageThreshold: Long, number of errors in the given transformation at which the processing should error out.
26 |         :param totalThreshold: Long, total number of errors up to and including this transformation
27 |             at which the processing should error out.
28 |         :param options:
29 |             separator: String,
30 |             escaper: String,
31 |             skipFirst: Boolean,
32 |             withSchema: String, schema string should always be created by using StructType.json()
33 |             withHeader: Boolean
34 |         """
35 |         return frame.unbox(path, format, transformation_ctx, info, stageThreshold, totalThreshold, **options)
36 |
37 |     @classmethod
38 |     def describeArgs(cls):
39 |         arg1 = {"name": "frame",
40 |                 "type": "DynamicFrame",
41 |                 "description": "The DynamicFrame on which to call Unbox",
42 |                 "optional": False,
43 |                 "defaultValue": None}
44 |         arg2 = {"name": "path",
45 |                 "type": "String",
46 |                 "description": "full path to the StringNode to unbox",
47 |                 "optional": False,
48 |                 "defaultValue": None}
49 |         arg3 = {"name": "format",
50 |                 "type": "String",
51 |                 "description": "file format -- \"avro\" or \"json\" only",
52 |                 "optional": False,
53 |                 "defaultValue": None}
54 |         arg4 = {"name": "transformation_ctx",
55 |                 "type": "String",
56 |                 "description": "A unique string that is used to identify stats / state information",
57 |                 "optional": True,
58 |                 "defaultValue": ""}
59 |         arg5 = {"name": "info",
60 |                 "type": "String",
61 |                 "description": "Any string to be associated with errors in the transformation",
62 |                 "optional": True,
63 |                 "defaultValue": "\"\""}
64 |         arg6 = {"name": "stageThreshold",
65 |                 "type": "Integer",
66 |                 "description": "Max number of errors in the transformation until processing will error out",
67 |                 "optional": True,
68 |                 "defaultValue": "0"}
69 |         arg7 = {"name": "totalThreshold",
70 |                 "type": "Integer",
71 |                 "description": "Max number of errors total until processing will error out.",
72 |                 "optional": True,
73 |                 "defaultValue": "0"}
74 |         arg8 = {"name": "separator",
75 |                 "type": "String",
76 |                 "description": "separator token",
77 |                 "optional": True,
78 |                 "defaultValue": "None, but individual readers may have their own defaults"}
79 |         arg9 = {"name": "escaper",
80 |                 "type": "String",
81 |                 "description": "escape token",
82 |                 "optional": True,
83 |                 "defaultValue": "None, but individual readers may have their own defaults"}
84 |         arg10 = {"name": "skipFirst",
85 |                  "type": "Boolean",
86 |                  "description": "whether to skip the first line of data",
87 |                  "optional": True,
88 |                  "defaultValue": "None, but individual readers may have their own defaults"}
89 |         arg11 = {"name": "withSchema",
90 |                  "type": "String",
91 |                  "description": "schema for data to unbox, should always be created by using StructType.json()",
92 |                  "optional": True,
93 |                  "defaultValue": "None, but individual readers may have their own defaults"}
94 |         arg12 = {"name": "withHeader",
95 |                  "type": "Boolean",
96 |                  "description": "whether data being unpacked includes a header",
97 |                  "optional": True,
98 |                  "defaultValue": "None, but individual readers may have their own defaults"}
99 |         return [arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12]
100 |
101 |     @classmethod
102 |     def describeTransform(cls):
103 |         return "unbox a string field"
104 |
105 |     @classmethod
106 |     def describeErrors(cls):
107 |         return []
108 |
109 |     @classmethod
110 |     def describeReturn(cls):
111 |         return {"type": "DynamicFrame",
112 |                 "description": "new DynamicFrame with unboxed DynamicRecords"}
113 |
--------------------------------------------------------------------------------
/awsglue/transforms/union.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # Licensed under the Amazon Software License (the "License"). You may not use
3 | # this file except in compliance with the License. A copy of the License is
4 | # located at
5 | #
6 | #  http://aws.amazon.com/asl/
7 | #
8 | # or in the "license" file accompanying this file. This file is distributed
9 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
10 | # or implied. See the License for the specific language governing
11 | # permissions and limitations under the License.
12 |
13 | from awsglue.transforms import GlueTransform
14 |
15 | class Union(GlueTransform):
16 |     def __call__(self, frame1, frame2, transformation_ctx="",
17 |                  info="", stageThreshold=0, totalThreshold=0):
18 |         return frame1.union(frame2, transformation_ctx, info, stageThreshold, totalThreshold)
19 |
20 |     @classmethod
21 |     def describeArgs(cls):
22 |         arg1 = {"name": "frame1",
23 |                 "type": "DynamicFrame",
24 |                 "description": "First DynamicFrame to union.",
25 |                 "optional": False,
26 |                 "defaultValue": None}
27 |         arg2 = {"name": "frame2",
28 |                 "type": "DynamicFrame",
29 |                 "description": "Second DynamicFrame to union.",
30 |                 "optional": False,
31 |                 "defaultValue": None}
32 |         arg3 = {"name": "transformation_ctx",
33 |                 "type": "String",
34 |                 "description": "A unique string that is used to identify stats / state information",
35 |                 "optional": True,
36 |                 "defaultValue": ""}
37 |         arg4 = {"name": "info",
38 |                 "type": "String",
39 |                 "description": "Any string to be associated with errors in the transformation",
40 |                 "optional": True,
41 |                 "defaultValue": "\"\""}
42 |         arg5 = {"name": "stageThreshold",
43 |                 "type": "Integer",
44 |                 "description": "Max number of errors in the transformation until processing will error out",
45 |                 "optional": True,
46 |                 "defaultValue": "0"}
47 |         arg6 = {"name": "totalThreshold",
48 |                 "type": "Integer",
49 |                 "description": "Max number of errors total until processing will error out.",
50 |                 "optional": True,
51 |                 "defaultValue": "0"}
52 |
53 |         return [arg1, arg2, arg3, arg4, arg5, arg6]
54 |
55 |     @classmethod
56 |     def describeTransform(cls):
57 |         return "Union two DynamicFrames."
58 |
59 |     @classmethod
60 |     def describeErrors(cls):
61 |         return []
62 |
63 |     @classmethod
64 |     def describeReturn(cls):
65 |         return {"type": "DynamicFrame",
66 |                 "description": "DynamicFrame containing all records from both input DynamicFrames."}
67 |
--------------------------------------------------------------------------------
/awsglue/transforms/unnest_frame.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # Licensed under the Amazon Software License (the "License"). You may not use
3 | # this file except in compliance with the License. A copy of the License is
4 | # located at
5 | #
6 | #  http://aws.amazon.com/asl/
7 | #
8 | # or in the "license" file accompanying this file. This file is distributed
9 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
10 | # or implied. See the License for the specific language governing
11 | # permissions and limitations under the License.
12 |
13 | from awsglue.transforms import GlueTransform
14 |
15 | class UnnestFrame(GlueTransform):
16 |     """
17 |     Unnest a dynamic frame, i.e. flatten nested objects to top-level elements.
18 |     It also generates join keys for array objects.
19 |     """
20 |
21 |     def __call__(self, frame, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
22 |         """
23 |         Unnest a dynamic frame, i.e. flatten nested objects to top-level elements.
24 |         It also generates join keys for array objects.
25 |         :param frame: DynamicFrame, the dynamic frame to unnest
26 |         :param info: String, any string to be associated with errors in this transformation.
27 |         :param stageThreshold: Long, number of errors in the given transformation at which the processing should error out.
28 |         :param totalThreshold: Long, total number of errors up to and including this transformation
29 |             at which the processing should error out.
30 |         :return: a new unnested dynamic frame
31 |         """
32 |         return frame.unnest(transformation_ctx, info, stageThreshold, totalThreshold)
33 |
34 |     @classmethod
35 |     def describeArgs(cls):
36 |         arg1 = {"name": "frame",
37 |                 "type": "DynamicFrame",
38 |                 "description": "The DynamicFrame to unnest",
39 |                 "optional": False,
40 |                 "defaultValue": None}
41 |         arg2 = {"name": "transformation_ctx",
42 |                 "type": "String",
43 |                 "description": "A unique string that is used to identify stats / state information",
44 |                 "optional": True,
45 |                 "defaultValue": ""}
46 |         arg3 = {"name": "info",
47 |                 "type": "String",
48 |                 "description": "Any string to be associated with errors in the transformation",
49 |                 "optional": True,
50 |                 "defaultValue": "\"\""}
51 |         arg4 = {"name": "stageThreshold",
52 |                 "type": "Integer",
53 |                 "description": "Max number of errors in the transformation until processing will error out",
54 |                 "optional": True,
55 |                 "defaultValue": "0"}
56 |         arg5 = {"name": "totalThreshold",
57 |                 "type": "Integer",
58 |                 "description": "Max number of errors total until processing will error out.",
59 |                 "optional": True,
60 |                 "defaultValue": "0"}
61 |
62 |         return [arg1, arg2, arg3, arg4, arg5]
63 |
64 |     @classmethod
65 |     def describeTransform(cls):
66 |         return "Unnest a dynamic frame, i.e. flatten nested objects to top-level elements."
67 |
68 |     @classmethod
69 |     def describeErrors(cls):
70 |         return []
71 |
72 |     @classmethod
73 |     def describeReturn(cls):
74 |         return {"type": "DynamicFrame",
75 |                 "description": "new unnested DynamicFrame"}
76 |
--------------------------------------------------------------------------------
/awsglue/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # Licensed under the Amazon Software License (the "License"). You may not use
3 | # this file except in compliance with the License. A copy of the License is
4 | # located at
5 | #
6 | #  http://aws.amazon.com/asl/
7 | #
8 | # or in the "license" file accompanying this file. This file is distributed
9 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
10 | # or implied. See the License for the specific language governing
11 | # permissions and limitations under the License.
12 |
13 | import argparse
14 | import json
15 | import traceback
16 | import sys
17 | from awsglue.job import Job
18 |
19 | _global_args = {}
20 |
21 | def makeOptions(sc, py_obj):
22 |     if isinstance(py_obj, dict):
23 |         json_string = json.dumps(py_obj)
24 |     elif isinstance(py_obj, basestring):
25 |         json_string = py_obj
26 |     else:
27 |         raise TypeError("Unexpected type " + str(type(py_obj))
28 |                         + " in makeOptions")
29 |     return sc._jvm.JsonOptions(json_string)
30 |
31 |
32 | def _call_site(sc, call_site, info):
33 |     return sc._jvm.CallSite(call_site, info)
34 |
35 |
36 | def _as_java_list(sc, scala_seq_obj):
37 |     return sc._jvm.GluePythonUtils.seqAsJava(scala_seq_obj)
38 |
39 |
40 | def _as_scala_option(sc, some_val):
41 |     return sc._jvm.GluePythonUtils.constructOption(some_val)
42 |
43 |
44 | def _as_resolve_choiceOption(sc, choice_option_str):
45 |     return sc._jvm.GluePythonUtils.constructChoiceOption(choice_option_str)
46 |
47 |
48 | def callsite():
49 |     return "".join(traceback.format_list(traceback.extract_stack()[:-2]))
50 |
51 |
52 | # Definitions for Python 2/Python 3 compatibility
53 | if sys.version >= "3":
54 |     basestring = str  # alias so the isinstance check in makeOptions works on Python 3
55 |     def iteritems(d, **kwargs):
56 |         return iter(d.items(**kwargs))
57 |     def iterkeys(d, **kwargs):
58 |         return iter(d.keys(**kwargs))
59 |     def itervalues(d, **kwargs):
60 |         return iter(d.values(**kwargs))
61 | else:
62 |     def iteritems(d, **kwargs):
63 |         return d.iteritems(**kwargs)
64 |     def iterkeys(d, **kwargs):
65 |         return d.iterkeys(**kwargs)
66 |     def itervalues(d, **kwargs):
67 |         return d.itervalues(**kwargs)
68 |
69 | class GlueArgumentError(Exception):
70 |     pass
71 |
72 |
73 | # Define a custom argument parser that raises an exception rather than calling
74 | # sys.exit() so that we can surface the errors.
75 | class GlueArgumentParser(argparse.ArgumentParser):
76 |     def error(self, msg):
77 |         raise GlueArgumentError(msg)
78 |
79 |
80 | def getResolvedOptions(args, options):
81 |     parser = GlueArgumentParser()
82 |
83 |     if Job.continuation_options()[0][2:] in options:
84 |         raise RuntimeError("Using reserved arguments " + Job.continuation_options()[0][2:])
85 |
86 |     if Job.job_bookmark_options()[0][2:] in options:
87 |         raise RuntimeError("Using reserved arguments " + Job.job_bookmark_options()[0][2:])
88 |
89 |     parser.add_argument(Job.job_bookmark_options()[0], choices=Job.job_bookmark_options()[1:], required=False)
90 |     parser.add_argument(Job.continuation_options()[0], choices=Job.continuation_options()[1:], required=False)
91 |
92 |     for option in Job.job_bookmark_range_options():
93 |         if option[2:] in options:
94 |             raise RuntimeError("Using reserved arguments " + option)
95 |         parser.add_argument(option, required=False)
96 |
97 |     for option in Job.id_params()[1:]:
98 |         if option in options:
99 |             raise RuntimeError("Using reserved arguments " + option)
100 |         # TODO: Make these mandatory; for now they stay optional for backward compatibility, and JOB_NAME is not included in the reserved parameter list.
101 |         parser.add_argument(option, required=False)
102 |
103 |     if Job.encryption_type_options()[0] in options:
104 |         raise RuntimeError("Using reserved arguments " + Job.encryption_type_options()[0])
105 |     parser.add_argument(Job.encryption_type_options()[0], choices=Job.encryption_type_options()[1:])
106 |
107 |     if Job.data_lineage_options()[0] in options:
108 |         raise RuntimeError("Using reserved arguments " + Job.data_lineage_options()[0])
109 |     parser.add_argument(Job.data_lineage_options()[0], required=False)
110 |
111 |     # TODO: Remove special handling for 'RedshiftTempDir' and 'TempDir' after TempDir is made mandatory for all Jobs
112 |     # Remove 'RedshiftTempDir' and 'TempDir' from the list of user-supplied options
113 |     options = [opt for opt in options if opt not in {'RedshiftTempDir', 'TempDir'}]
114 |     parser.add_argument('--RedshiftTempDir', required=False)
115 |     parser.add_argument('--TempDir', required=False)
116 |
117 |     for option in options:
118 |         parser.add_argument('--' + option, required=True)
119 |
120 |     parsed, extra = parser.parse_known_args(args[1:])
121 |
122 |     parsed_dict = vars(parsed)
123 |
124 |     # TODO: remove special handling after TempDir is made mandatory for all jobs
125 |     if 'TempDir' in parsed_dict and parsed_dict['TempDir'] is not None:
126 |         # TODO: Remove special handling for 'RedshiftTempDir' and 'TempDir'
127 |         parsed_dict['RedshiftTempDir'] = parsed_dict['TempDir']
128 |     elif 'RedshiftTempDir' in parsed_dict and parsed_dict['RedshiftTempDir'] is not None:
129 |         parsed_dict['TempDir'] = parsed_dict['RedshiftTempDir']
130 |
131 |     # Special handling for continuations. If --job-bookmark-option is set we
132 |     # use that, regardless of whether --continuation-option is set. If
133 |     # --job-bookmark-option is not set but --continuation-option is set, fall
134 |     # back to that.
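    # [Editor's illustration, assuming Job.continuation_options() and
    # Job.job_bookmark_options() are the parallel lists defined in awsglue/job.py]
    # Given sys.argv = ["script.py", "--continuation-option", "continuation-enabled"]
    # and no --job-bookmark-option, the code below pops "continuation-enabled",
    # looks up its index in Job.continuation_options(), and substitutes the
    # job-bookmark option at the same index ("job-bookmark-enable"). With neither
    # flag present it falls back to Job.job_bookmark_options()[3]
    # ("job-bookmark-disable").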
135 |
136 |     bookmark_value = parsed_dict.pop("continuation_option", None)
137 |     if 'job_bookmark_option' not in parsed_dict or parsed_dict['job_bookmark_option'] is None:
138 |         if bookmark_value is None:
139 |             bookmark_value = Job.job_bookmark_options()[3]
140 |         else:
141 |             # translate old style continuation options into job-bookmark options
142 |             option_index = Job.continuation_options().index(bookmark_value)
143 |             bookmark_value = Job.job_bookmark_options()[option_index]
144 |
145 |         parsed_dict['job_bookmark_option'] = bookmark_value
146 |     absent_range_option = []
147 |     for option in Job.job_bookmark_range_options():
148 |         key = option[2:].replace('-', '_')
149 |         if key not in parsed_dict or parsed_dict[key] is None:
150 |             absent_range_option.append(option)
151 |     if parsed_dict['job_bookmark_option'] == 'job-bookmark-pause':
152 |         if len(absent_range_option) == 1:
153 |             raise RuntimeError("Missing option or value for " + absent_range_option[0])
154 |     else:
155 |         if len(absent_range_option) == 0:
156 |             raise RuntimeError("Invalid option(s) " + ' '.join(Job.job_bookmark_range_options()))
157 |
158 |     _global_args.update(parsed_dict)
159 |
160 |     return parsed_dict
--------------------------------------------------------------------------------
/bin/glue-setup.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | ROOT_DIR="$(cd $(dirname "$0")/..; pwd)"
4 | cd "$ROOT_DIR"
5 |
6 | SPARK_CONF_DIR=$ROOT_DIR/conf
7 | GLUE_JARS_DIR=$ROOT_DIR/jarsv1
8 |
9 | PYTHONPATH="$SPARK_HOME/python/:$PYTHONPATH"
10 | PYTHONPATH=`ls $SPARK_HOME/python/lib/py4j-*-src.zip`:"$PYTHONPATH"
11 |
12 | # Generate the zip archive for glue python modules
13 | rm -f PyGlue.zip
14 | zip -r PyGlue.zip awsglue
15 | GLUE_PY_FILES="$ROOT_DIR/PyGlue.zip"
16 | export PYTHONPATH="$GLUE_PY_FILES:$PYTHONPATH"
17 |
18 | # Run mvn copy-dependencies target to get the Glue dependencies locally
19 | mvn -f "$ROOT_DIR/pom.xml" -DoutputDirectory="$ROOT_DIR/jarsv1" dependency:copy-dependencies
20 |
21 | export SPARK_CONF_DIR
22 | mkdir -p "$SPARK_CONF_DIR"
23 | rm -f "$SPARK_CONF_DIR/spark-defaults.conf"
24 | # Generate spark-defaults.conf
25 | echo "spark.driver.extraClassPath $SPARK_HOME/jars/*:$GLUE_JARS_DIR/*" >> "$SPARK_CONF_DIR/spark-defaults.conf"
26 | echo "spark.executor.extraClassPath $SPARK_HOME/jars/*:$GLUE_JARS_DIR/*" >> "$SPARK_CONF_DIR/spark-defaults.conf"
27 |
28 | # Restore previous working directory
29 | cd -
--------------------------------------------------------------------------------
/bin/gluepyspark:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | ROOT_DIR="$(cd $(dirname "$0")/..; pwd)"
4 | source "$ROOT_DIR/bin/glue-setup.sh"
5 | exec "${SPARK_HOME}/bin/pyspark" "$@"
--------------------------------------------------------------------------------
/bin/gluepytest:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | ROOT_DIR="$(cd $(dirname "$0")/..; pwd)"
4 | source "$ROOT_DIR/bin/glue-setup.sh"
5 | exec pytest "$@"
--------------------------------------------------------------------------------
/bin/gluesparksubmit:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | ROOT_DIR="$(cd $(dirname "$0")/..; pwd)"
4 | source "$ROOT_DIR/bin/glue-setup.sh"
5 | exec "${SPARK_HOME}/bin/spark-submit" --py-files "${GLUE_PY_FILES}" "$@"
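# [Editor's sketch, not part of the original source] A typical local workflow with
# the wrapper scripts above, assuming SPARK_HOME points at a Glue-compatible Spark
# distribution and Maven is installed (paths are hypothetical):
#
#     export SPARK_HOME=/opt/spark
#     ./bin/gluepyspark                 # interactive REPL with awsglue on PYTHONPATH
#     ./bin/gluesparksubmit my_job.py --JOB_NAME local-test --TempDir /tmp/glue-temp
#     ./bin/gluepytest tests/           # run tests against the local setup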
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
2 |     <modelVersion>4.0.0</modelVersion>
3 |     <groupId>com.amazonaws</groupId>
4 |     <artifactId>AWSGlueETLPython</artifactId>
5 |     <version>4.0.0</version>
6 |     <name>${project.artifactId}</name>
7 |     <description>Python library for AWS Glue ETL libraries</description>
8 |     <licenses>
9 |         <license>
10 |             <name>Amazon Software License</name>
11 |             <url>http://aws.amazon.com/asl/</url>
12 |             <distribution>repo</distribution>
13 |         </license>
14 |     </licenses>
15 |     <dependencies>
16 |         <dependency>
17 |             <groupId>com.amazonaws</groupId>
18 |             <artifactId>AWSGlueETL</artifactId>
19 |             <version>${project.version}</version>
20 |         </dependency>
21 |     </dependencies>
22 |     <repositories>
23 |         <repository>
24 |             <id>aws-glue-etl-artifacts</id>
25 |             <url>https://aws-glue-etl-artifacts.s3.amazonaws.com/release/</url>
26 |         </repository>
27 |     </repositories>
28 |     <build>
29 |         <plugins>
30 |             <plugin>
31 |                 <groupId>org.apache.maven.plugins</groupId>
32 |                 <artifactId>maven-enforcer-plugin</artifactId>
33 |                 <version>3.0.0-M2</version>
34 |                 <executions>
35 |                     <execution>
36 |                         <id>enforce-maven</id>
37 |                         <goals>
38 |                             <goal>enforce</goal>
39 |                         </goals>
40 |                         <configuration>
41 |                             <rules>
42 |                                 <requireMavenVersion>
43 |                                     <version>3.5.3</version>
44 |                                 </requireMavenVersion>
45 |                             </rules>
46 |                         </configuration>
47 |                     </execution>
48 |                 </executions>
49 |             </plugin>
50 |         </plugins>
51 |     </build>
52 | </project>
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import find_packages, setup
2 |
3 | setup(
4 |     name='aws-glue-libs',
5 |     version='4.0.0',
6 |     long_description=__doc__,
7 |     packages=find_packages(),
8 |     include_package_data=True,
9 |     zip_safe=False,
10 |     license='Amazon Software License 1.0',
11 | )
--------------------------------------------------------------------------------
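# [Editor's sketch, not part of the original source] An end-to-end skeleton of a
# Glue job script wired together from the pieces above; the database and table
# names are hypothetical:
#
#     import sys
#     from pyspark.context import SparkContext
#     from awsglue.context import GlueContext
#     from awsglue.job import Job
#     from awsglue.transforms import UnnestFrame
#     from awsglue.utils import getResolvedOptions
#
#     args = getResolvedOptions(sys.argv, ['JOB_NAME'])
#     glueContext = GlueContext(SparkContext.getOrCreate())
#     job = Job(glueContext)
#     job.init(args['JOB_NAME'], args)
#
#     dyf = glueContext.create_dynamic_frame.from_catalog(
#         database="example_db", table_name="example_table")
#     flat = UnnestFrame.apply(frame=dyf, transformation_ctx="unnest_example")
#
#     job.commit()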