├── .adr-dir ├── .bumpversion.toml ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.yml │ ├── config.yml │ ├── enhancement-request.md │ ├── feature_request.md │ └── question.md ├── PULL_REQUEST_TEMPLATE.md ├── dependabot.yml └── workflows │ ├── bandit.yml │ ├── cfn-nag.yml │ ├── check-pytest-xfails.yml │ ├── dependabot-prs.yml │ ├── git-hygiene.yml │ ├── minimal-tests.yml │ ├── minimum-response-time.yml │ ├── pr-linter.yml │ ├── snyk.yml │ ├── static-checking.yml │ └── unlabel-assigned-issue.yml ├── .gitignore ├── .readthedocs.yml ├── .snyk ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── CONTRIBUTING_COMMON_ERRORS.md ├── LICENSE.txt ├── NOTICE.txt ├── README.md ├── THIRD_PARTY.txt ├── VERSION ├── adr ├── 0001-record-architecture-decisions.md ├── 0002-handling-unsupported-arguments-in-distributed-mode.md ├── 0003-use-typeddict-to-group-similar-parameters.md ├── 0004-no-alter-iam-permissions.md ├── 0005-move-dependencies-to-optional.md ├── 0006-deprecate-s3-merge-upsert-table.md ├── 0007-design-of-engine-and-memory-format.md ├── 0008-switching-between-pyarrow-and-pandas-based-datasources-for-csv-json-i-o.md └── 0009-lazy-engine-initialization.md ├── awswrangler ├── __init__.py ├── __metadata__.py ├── _arrow.py ├── _config.py ├── _data_types.py ├── _databases.py ├── _distributed.py ├── _executor.py ├── _sql_formatter.py ├── _sql_utils.py ├── _utils.py ├── annotations.py ├── athena │ ├── __init__.py │ ├── _cache.py │ ├── _executions.py │ ├── _executions.pyi │ ├── _read.py │ ├── _read.pyi │ ├── _spark.py │ ├── _statements.py │ ├── _utils.py │ └── _write_iceberg.py ├── catalog │ ├── __init__.py │ ├── _add.py │ ├── _create.py │ ├── _definitions.py │ ├── _delete.py │ ├── _get.py │ └── _utils.py ├── chime.py ├── cleanrooms │ ├── __init__.py │ ├── _read.py │ └── _utils.py ├── cloudwatch.py ├── data_api │ ├── __init__.py │ ├── _connector.py │ ├── rds.py │ └── redshift.py ├── data_quality │ ├── __init__.py │ ├── _create.py │ ├── _get.py │ └── _utils.py ├── distributed │ ├── __init__.py │ └── ray │ │ ├── __init__.py │ │ ├── _core.py │ │ ├── _core.pyi │ │ ├── _executor.py │ │ ├── _register.py │ │ ├── _utils.py │ │ ├── datasources │ │ ├── __init__.py │ │ ├── arrow_csv_datasink.py │ │ ├── arrow_csv_datasource.py │ │ ├── arrow_json_datasource.py │ │ ├── arrow_orc_datasink.py │ │ ├── arrow_orc_datasource.py │ │ ├── arrow_parquet_base_datasource.py │ │ ├── arrow_parquet_datasink.py │ │ ├── arrow_parquet_datasource.py │ │ ├── file_datasink.py │ │ ├── filename_provider.py │ │ ├── pandas_text_datasink.py │ │ └── pandas_text_datasource.py │ │ ├── modin │ │ ├── __init__.py │ │ ├── _core.py │ │ ├── _data_types.py │ │ ├── _utils.py │ │ └── s3 │ │ │ ├── __init__.py │ │ │ ├── _read_orc.py │ │ │ ├── _read_parquet.py │ │ │ ├── _read_text.py │ │ │ ├── _write_dataset.py │ │ │ ├── _write_orc.py │ │ │ ├── _write_parquet.py │ │ │ └── _write_text.py │ │ └── s3 │ │ ├── __init__.py │ │ ├── _list.py │ │ ├── _read_orc.py │ │ └── _read_parquet.py ├── dynamodb │ ├── __init__.py │ ├── _delete.py │ ├── _read.py │ ├── _read.pyi │ ├── _utils.py │ └── _write.py ├── emr.py ├── emr_serverless.py ├── exceptions.py ├── mysql.py ├── neptune │ ├── __init__.py │ ├── _client.py │ ├── _gremlin_init.py │ ├── _gremlin_parser.py │ ├── _neptune.py │ └── _utils.py ├── opensearch │ ├── __init__.py │ ├── _read.py │ ├── _utils.py │ └── _write.py ├── oracle.py ├── pandas │ └── __init__.py ├── postgresql.py ├── py.typed ├── quicksight │ ├── __init__.py │ ├── _cancel.py │ ├── _create.py │ ├── _delete.py │ ├── _describe.py │ ├── _get_list.py │ └── _utils.py 
├── redshift │ ├── __init__.py │ ├── _connect.py │ ├── _read.py │ ├── _read.pyi │ ├── _utils.py │ └── _write.py ├── s3 │ ├── __init__.py │ ├── _copy.py │ ├── _delete.py │ ├── _describe.py │ ├── _download.py │ ├── _fs.py │ ├── _list.py │ ├── _list.pyi │ ├── _read.py │ ├── _read_deltalake.py │ ├── _read_excel.py │ ├── _read_orc.py │ ├── _read_parquet.py │ ├── _read_parquet.pyi │ ├── _read_text.py │ ├── _read_text.pyi │ ├── _read_text_core.py │ ├── _select.py │ ├── _upload.py │ ├── _wait.py │ ├── _write.py │ ├── _write_concurrent.py │ ├── _write_dataset.py │ ├── _write_deltalake.py │ ├── _write_excel.py │ ├── _write_orc.py │ ├── _write_parquet.py │ └── _write_text.py ├── secretsmanager.py ├── sqlserver.py ├── sts.py ├── timestream │ ├── __init__.py │ ├── _create.py │ ├── _delete.py │ ├── _list.py │ ├── _read.py │ ├── _read.pyi │ └── _write.py └── typing.py ├── building ├── build-docs.sh ├── build-lambda-layers.sh ├── build-wheel.sh ├── lambda │ ├── Dockerfile │ ├── Dockerfile.al2023 │ ├── build-docker-images.sh │ └── build-lambda-layer.sh ├── publish.sh └── update-glue-lib.sh ├── docs ├── Makefile ├── environment.yml └── source │ ├── _ext │ ├── copy_adr.py │ └── copy_tutorials.py │ ├── _static │ ├── aws_lambda_managed_layer.png │ ├── css │ │ └── max_width.css │ ├── favicon.ico │ ├── logo.png │ ├── logo2.png │ ├── logo_transparent.png │ ├── logo_transparent_small.png │ └── ssm_public_parameters.png │ ├── _templates │ ├── globaltoc.html │ └── typed-dict-template.rst │ ├── about.rst │ ├── adr.rst │ ├── adr │ └── .gitignore │ ├── api.rst │ ├── conf.py │ ├── index.rst │ ├── install.rst │ ├── layers.rst │ ├── scale.rst │ ├── tutorials.rst │ └── tutorials │ └── .gitignore ├── fix.sh ├── poetry.lock ├── pyproject.toml ├── test.sh ├── test_infra ├── app.py ├── cdk.json ├── poetry.lock ├── pyproject.toml ├── scripts │ ├── delete-stack.sh │ ├── deploy-stack.sh │ ├── security-group-databases-add-local-ip.sh │ └── security-group-databases-check.sh ├── source.bat └── stacks │ ├── __init__.py │ ├── base_stack.py │ ├── cleanrooms_stack.py │ ├── databases_stack.py │ ├── glueray_stack.py │ └── opensearch_stack.py ├── tests ├── __init__.py ├── _utils.py ├── benchmark │ ├── __init__.py │ └── test_glueray.py ├── conftest.py ├── glue_scripts │ ├── ray_read_small_parquet.py │ ├── wrangler_blog_simple.py │ ├── wrangler_read_small_parquet.py │ └── wrangler_write_partitioned_parquet.py ├── load │ ├── __init__.py │ ├── conftest.py │ ├── test_databases.py │ ├── test_dynamodb.py │ ├── test_s3.py │ └── test_s3_modin.py └── unit │ ├── __init__.py │ ├── test_athena.py │ ├── test_athena_cache.py │ ├── test_athena_csv.py │ ├── test_athena_geospatial.py │ ├── test_athena_iceberg.py │ ├── test_athena_parquet.py │ ├── test_athena_prepared.py │ ├── test_athena_projection.py │ ├── test_athena_spark.py │ ├── test_catalog.py │ ├── test_chime.py │ ├── test_cleanrooms.py │ ├── test_cloudwatch.py │ ├── test_config.py │ ├── test_data_api.py │ ├── test_data_quality.py │ ├── test_distributed.py │ ├── test_dynamodb.py │ ├── test_emr.py │ ├── test_emr_serverless.py │ ├── test_fs.py │ ├── test_glue.py │ ├── test_metadata.py │ ├── test_moto.py │ ├── test_mysql.py │ ├── test_neptune.py │ ├── test_neptune_parsing.py │ ├── test_opensearch.py │ ├── test_oracle.py │ ├── test_pandas_pyarrow_dtype_backend.py │ ├── test_postgresql.py │ ├── test_quicksight.py │ ├── test_redshift.py │ ├── test_routines.py │ ├── test_s3.py │ ├── test_s3_deltalake.py │ ├── test_s3_excel.py │ ├── test_s3_orc.py │ ├── test_s3_parquet.py │ ├── test_s3_select.py │ ├── 
test_s3_text.py │ ├── test_s3_text_compressed.py │ ├── test_s3_wait.py │ ├── test_session.py │ ├── test_sql_params_formatter.py │ ├── test_sqlserver.py │ ├── test_timestream.py │ └── test_utils.py ├── tox.ini ├── tutorials ├── 001 - Introduction.ipynb ├── 002 - Sessions.ipynb ├── 003 - Amazon S3.ipynb ├── 004 - Parquet Datasets.ipynb ├── 005 - Glue Catalog.ipynb ├── 006 - Amazon Athena.ipynb ├── 007 - Redshift, MySQL, PostgreSQL, SQL Server, Oracle.ipynb ├── 008 - Redshift - Copy & Unload.ipynb ├── 009 - Redshift - Append, Overwrite, Upsert.ipynb ├── 010 - Parquet Crawler.ipynb ├── 011 - CSV Datasets.ipynb ├── 012 - CSV Crawler.ipynb ├── 013 - Merging Datasets on S3.ipynb ├── 014 - Schema Evolution.ipynb ├── 015 - EMR.ipynb ├── 016 - EMR & Docker.ipynb ├── 017 - Partition Projection.ipynb ├── 018 - QuickSight.ipynb ├── 019 - Athena Cache.ipynb ├── 020 - Spark Table Interoperability.ipynb ├── 021 - Global Configurations.ipynb ├── 022 - Writing Partitions Concurrently.ipynb ├── 023 - Flexible Partitions Filter.ipynb ├── 024 - Athena Query Metadata.ipynb ├── 025 - Redshift - Loading Parquet files with Spectrum.ipynb ├── 026 - Amazon Timestream.ipynb ├── 027 - Amazon Timestream 2.ipynb ├── 028 - DynamoDB.ipynb ├── 029 - S3 Select.ipynb ├── 030 - Data Api.ipynb ├── 031 - OpenSearch.ipynb ├── 033 - Amazon Neptune.ipynb ├── 034 - Distributing Calls using Ray.ipynb ├── 035 - Distributing Calls on Ray Remote Cluster.ipynb ├── 036 - Distributing Calls with Glue Interactive Sessions on Ray.ipynb ├── 037 - Glue Data Quality.ipynb ├── 038 - OpenSearch Serverless.ipynb ├── 039 - Athena Iceberg.ipynb ├── 040 - EMR Serverless.ipynb ├── 041 - Apache Spark on Amazon Athena.ipynb └── _static │ ├── glue_catalog_table_products.png │ ├── glue_catalog_version_0.png │ ├── glue_catalog_version_1.png │ ├── glue_is_create.png │ ├── glue_is_setup.png │ └── logo.png └── validate.sh /.adr-dir: -------------------------------------------------------------------------------- 1 | adr 2 | -------------------------------------------------------------------------------- /.bumpversion.toml: -------------------------------------------------------------------------------- 1 | [tool.bumpversion] 2 | current_version = "3.12.0" 3 | commit = false 4 | tag = false 5 | tag_name = "{new_version}" 6 | parse = "(?P<major>\\d+)\\.(?P<minor>\\d+)\\.(?P<patch>\\d+)((?P<release>[a-z]+)(?P<build>\\d+))?"
7 | serialize = [ 8 | "{major}.{minor}.{patch}{release}{build}", 9 | "{major}.{minor}.{patch}" 10 | ] 11 | 12 | [tool.bumpversion.parts.release] 13 | optional_value = "rc" 14 | values = [ 15 | "a", 16 | "b", 17 | "rc" 18 | ] 19 | 20 | [tool.bumpversion.parts.build] 21 | first_value = 1 22 | 23 | [[tool.bumpversion.files]] 24 | filename = "VERSION" 25 | 26 | [[tool.bumpversion.files]] 27 | filename = "pyproject.toml" 28 | search = "version = \"{current_version}\"" 29 | replace = "version = \"{new_version}\"" 30 | 31 | [[tool.bumpversion.files]] 32 | filename = "test_infra/pyproject.toml" 33 | search = "version = \"{current_version}\"" 34 | replace = "version = \"{new_version}\"" 35 | 36 | [[tool.bumpversion.files]] 37 | filename = "README.md" 38 | search = "https://aws-sdk-pandas.readthedocs.io/en/{current_version}/" 39 | replace = "https://aws-sdk-pandas.readthedocs.io/en/{new_version}/" 40 | 41 | [[tool.bumpversion.files]] 42 | filename = "docs/source/install.rst" 43 | search = "awswrangler=={current_version}" 44 | replace = "awswrangler=={new_version}" 45 | 46 | [[tool.bumpversion.files]] 47 | filename = "awswrangler/__metadata__.py" 48 | 49 | [[tool.bumpversion.files]] 50 | filename = "tests/unit/test_metadata.py" 51 | search = "assert wr.__version__ == \"{current_version}\"" 52 | replace = "assert wr.__version__ == \"{new_version}\"" 53 | 54 | [[tool.bumpversion.files]] 55 | glob = "awswrangler/**/*.py" 56 | search = "https://aws-sdk-pandas.readthedocs.io/en/{current_version}/" 57 | replace = "https://aws-sdk-pandas.readthedocs.io/en/{new_version}/" 58 | ignore_missing_version = true 59 | 60 | [[tool.bumpversion.files]] 61 | glob = "tutorials/*.ipynb" 62 | search = "https://aws-sdk-pandas.readthedocs.io/en/{current_version}/" 63 | replace = "https://aws-sdk-pandas.readthedocs.io/en/{new_version}/" 64 | ignore_missing_version = true 65 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.yml: -------------------------------------------------------------------------------- 1 | name: Bug report 2 | description: Create a report to help us improve. 3 | labels: "bug" 4 | 5 | body: 6 | - type: textarea 7 | attributes: 8 | label: Describe the bug 9 | description: >- 10 | A clear description of what the bug is. Include a stack trace if present. 11 | validations: 12 | required: true 13 | 14 | - type: textarea 15 | attributes: 16 | label: How to Reproduce 17 | description: Steps to reproduce the behavior. 18 | value: | 19 | ``` 20 | *P.S. Please do not attach files as it's considered a security risk. Add code snippets directly in the message body as much as possible.* 21 | ``` 22 | validations: 23 | required: true 24 | 25 | - type: textarea 26 | attributes: 27 | label: Expected behavior 28 | description: >- 29 | A clear and concise description of what you expected to happen. 30 | 31 | - type: input 32 | attributes: 33 | label: Your project 34 | description: >- 35 | Link to your project. 36 | validations: 37 | required: false 38 | 39 | - type: textarea 40 | attributes: 41 | label: Screenshots 42 | description: >- 43 | If applicable, add screenshots to help explain your problem. 44 | validations: 45 | required: false 46 | 47 | - type: input 48 | attributes: 49 | label: OS 50 | description: >- 51 | [e.g. 
Unix/Linux/Mac/Win/other with version] 52 | validations: 53 | required: true 54 | - type: input 55 | attributes: 56 | label: Python version 57 | validations: 58 | required: true 59 | - type: input 60 | attributes: 61 | label: AWS SDK for pandas version 62 | validations: 63 | required: true 64 | - type: textarea 65 | attributes: 66 | label: Additional context 67 | description: >- 68 | Add any other context about the problem here. 69 | [e.g. URL or Ticket] -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | contact_links: 2 | - name: Discussion 3 | url: https://join.slack.com/t/aws-sdk-pandas/shared_invite/zt-sxdx38sl-E0coRfAds8WdpxXD2Nzfrg 4 | about: Public Slack channel for the AWS SDK for pandas community. -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/enhancement-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Enhancement request 3 | about: Suggest an idea to enhance some existing feature 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your idea related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you are expecting. 15 | 16 | *P.S. Please do not attach files as it's considered a security risk. Add code snippets directly in the message body as much as possible.* 17 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: feature 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you are expecting. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | 22 | *P.S. Please do not attach files as it's considered a security risk. Add code snippets directly in the message body as much as possible.* -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Question 3 | about: Ask with as many useful details as possible 4 | title: '' 5 | labels: question 6 | assignees: '' 7 | 8 | --- 9 | 10 | *P.S. Please do not attach files as it's considered a security risk. 
Add code snippets directly in the message body as much as possible.* 11 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ### Feature or Bugfix 2 | 3 | - Feature 4 | - Bugfix 5 | - Refactoring 6 | 7 | ### Detail 8 | - 9 | - 10 | 11 | ### Relates 12 | - 13 | 14 | By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license. 15 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "pip" 4 | directory: "/" 5 | schedule: 6 | interval: "weekly" 7 | time: "09:00" 8 | timezone: "Europe/London" 9 | groups: 10 | production-dependencies: 11 | dependency-type: "production" 12 | development-dependencies: 13 | dependency-type: "development" 14 | 15 | - package-ecosystem: "github-actions" 16 | directory: "/" 17 | schedule: 18 | interval: "weekly" 19 | time: "09:00" 20 | timezone: "Europe/London" 21 | groups: 22 | github-actions: 23 | patterns: 24 | - "*" 25 | -------------------------------------------------------------------------------- /.github/workflows/bandit.yml: -------------------------------------------------------------------------------- 1 | name: Bandit 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: 7 | - main 8 | pull_request: 9 | branches: 10 | - main 11 | 12 | permissions: 13 | contents: read 14 | 15 | jobs: 16 | build: 17 | runs-on: ubuntu-latest 18 | steps: 19 | - uses: actions/checkout@v4 20 | - name: Set up Python 21 | uses: actions/setup-python@v5 22 | with: 23 | python-version: 3.9 24 | - name: Install 25 | run: | 26 | python -m pip install --upgrade pip 27 | python -m pip install bandit 28 | - name: Bandit 29 | run: bandit -r -lll -ii .
30 | -------------------------------------------------------------------------------- /.github/workflows/cfn-nag.yml: -------------------------------------------------------------------------------- 1 | name: CFN Nag 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | paths: 7 | - "test_infra/**" 8 | branches: 9 | - main 10 | pull_request: 11 | paths: 12 | - "test_infra/**" 13 | branches: 14 | - main 15 | 16 | permissions: 17 | contents: read 18 | 19 | env: 20 | CDK_DEFAULT_ACCOUNT: 111111111111 21 | CDK_DEFAULT_REGION: us-east-1 22 | 23 | jobs: 24 | build: 25 | runs-on: ubuntu-latest 26 | steps: 27 | - uses: actions/checkout@v4 28 | - name: Use Node.js 29 | uses: actions/setup-node@v4 30 | with: 31 | node-version: 16 32 | - name: Cache Node.js modules 33 | uses: actions/cache@v4 34 | with: 35 | path: ~/.npm 36 | key: ${{ runner.OS }}-node-${{ hashFiles('**/package-lock.json') }} 37 | restore-keys: | 38 | ${{ runner.OS }}-node- 39 | ${{ runner.OS }}- 40 | - name: Install CDK 41 | run: | 42 | npm install -g aws-cdk 43 | cdk --version 44 | - uses: actions/checkout@v4 45 | - name: Set up Python 46 | uses: actions/setup-python@v5 47 | with: 48 | python-version: 3.11 49 | - name: Install Requirements 50 | run: | 51 | cd test_infra 52 | python -m pip install --upgrade pip 53 | python -m pip install poetry 54 | poetry env use python 55 | poetry env info 56 | source $(poetry env info --path)/bin/activate 57 | poetry install -vvv --no-root 58 | - name: Set up cdk.json 59 | run: | 60 | cd test_infra 61 | cat <<EOT >> cdk.context.json 62 | { 63 | "availability-zones:account=111111111111:region=us-east-1": [ 64 | "us-east-1a", 65 | "us-east-1b", 66 | "us-east-1c", 67 | "us-east-1d", 68 | "us-east-1e", 69 | "us-east-1f" 70 | ] 71 | } 72 | EOT 73 | cat cdk.json | jq -r '.context.databases.neptune = true' | jq -r '.context.databases.oracle = true' | jq -r '.context.databases.sqlserver = true' > overwrite.cdk.json 74 | rm cdk.json && mv overwrite.cdk.json cdk.json 75 | - name: CDK Synth 76 | run: | 77 | cd test_infra 78 | source $(poetry env info --path)/bin/activate 79 | cdk synth 80 | - uses: stelligent/cfn_nag@master 81 | with: 82 | input_path: test_infra/cdk.out 83 | extra_args: --ignore-fatal 84 | -------------------------------------------------------------------------------- /.github/workflows/check-pytest-xfails.yml: -------------------------------------------------------------------------------- 1 | name: Check Tests for Unspecific XFails 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | permissions: 12 | contents: read 13 | 14 | jobs: 15 | Check: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v4 19 | - name: check xfails 20 | run: if grep -ro "@pytest.mark.xfail()" tests/; then echo "xfails must catch a specific error, e.g.
'@pytest.mark.xfail(raises=NotImplementedError)'" && exit 1; else echo "success" && exit 0; fi -------------------------------------------------------------------------------- /.github/workflows/dependabot-prs.yml: -------------------------------------------------------------------------------- 1 | name: Dependabot Pull Request Metadata 2 | on: pull_request_target 3 | jobs: 4 | build: 5 | permissions: 6 | pull-requests: read 7 | runs-on: ubuntu-latest 8 | if: ${{ github.event.pull_request.user.login == 'dependabot[bot]' }} 9 | steps: 10 | - name: Fetch Dependabot metadata 11 | id: dependabot-metadata 12 | uses: dependabot/fetch-metadata@v2 13 | with: 14 | alert-lookup: true 15 | compat-lookup: true 16 | github-token: ${{ secrets.GITHUB_TOKEN }} 17 | - name: Add a label for all PRs with an alert state 18 | if: ${{ steps.dependabot-metadata.outputs.alert-state != '' }} 19 | run: gh pr edit "$PR_URL" --add-label "vulnerability" 20 | env: 21 | PR_URL: ${{github.event.pull_request.html_url}} 22 | GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} -------------------------------------------------------------------------------- /.github/workflows/git-hygiene.yml: -------------------------------------------------------------------------------- 1 | name: "Close Stale Issues" 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | - cron: "0 */3 * * *" 7 | 8 | jobs: 9 | cleanup: 10 | permissions: 11 | issues: write 12 | pull-requests: write 13 | runs-on: ubuntu-latest 14 | name: Stale issue job 15 | steps: 16 | - uses: actions/stale@v9 17 | with: 18 | repo-token: ${{ secrets.GITHUB_TOKEN }} 19 | days-before-stale: 60 20 | days-before-close: 7 21 | exempt-issue-labels: 'needs-triage,help wanted,backlog' 22 | exempt-pr-labels: 'needs-triage' 23 | stale-issue-label: 'closing-soon' 24 | operations-per-run: 100 25 | enable-statistics: true 26 | stale-issue-message: | 27 | Marking this issue as stale due to inactivity. This helps our maintainers find and focus on the active issues. If this issue receives no comments in the next 7 days it will automatically be closed. 28 | stale-pr-label: 'closing-soon' 29 | stale-pr-message: | 30 | Marking this pull request as stale due to inactivity. This helps our maintainers find and focus on the active pull requests. 
31 | debug-only: false 32 | ascending: true 33 | exempt-all-milestones: true 34 | exempt-all-assignees: true 35 | -------------------------------------------------------------------------------- /.github/workflows/minimal-tests.yml: -------------------------------------------------------------------------------- 1 | name: Minimal Tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | permissions: 12 | contents: read 13 | 14 | jobs: 15 | Check: 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] 21 | platform: [ubuntu-latest, macos-latest, windows-latest] 22 | exclude: 23 | - python-version: 3.13 24 | platform: windows-latest 25 | 26 | env: 27 | AWS_DEFAULT_REGION: us-east-1 28 | 29 | runs-on: ${{ matrix.platform }} 30 | 31 | steps: 32 | - uses: actions/checkout@v4 33 | - name: Set up Python ${{ matrix.python-version }} 34 | uses: actions/setup-python@v5 35 | with: 36 | python-version: ${{ matrix.python-version }} 37 | - name: Upgrade Pip 38 | run: python -m pip install --upgrade pip 39 | - name: Install Poetry 40 | run: python -m pip install poetry 41 | - name: Install Requirements 42 | run: | 43 | poetry config virtualenvs.in-project true 44 | poetry config virtualenvs.path .venv 45 | poetry install -vvv 46 | - name: Test Metadata 47 | run: poetry run pytest tests/unit/test_metadata.py 48 | - name: Test Session 49 | run: poetry run pytest tests/unit/test_session.py 50 | - name: Test Utils 51 | run: poetry run pytest tests/unit/test_utils.py 52 | - name: Test Moto 53 | run: poetry run pytest -n 4 tests/unit/test_moto.py 54 | -------------------------------------------------------------------------------- /.github/workflows/minimum-response-time.yml: -------------------------------------------------------------------------------- 1 | name: Issue Minimum Response Time 2 | on: 3 | workflow_dispatch: 4 | schedule: 5 | - cron: "0 */3 * * *" 6 | jobs: 7 | evaluate: 8 | runs-on: ubuntu-latest 9 | permissions: 10 | issues: write 11 | steps: 12 | - name: Issue Minimum Response ⏰ 13 | uses: malachi-constant/issue-minimum-response@latest 14 | with: 15 | exempt_user_list: "github-actions[bot]" 16 | exempt_labels: "help wanted" 17 | exempt_authors: "malachi-constant,jaidisido,kukushking,LeonLuttenberger,cnfait,dependabot[bot]" 18 | token: ${{secrets.GITHUB_TOKEN}} 19 | label: needs-triage 20 | -------------------------------------------------------------------------------- /.github/workflows/pr-linter.yml: -------------------------------------------------------------------------------- 1 | name: Check PR title 2 | 3 | on: 4 | pull_request_target: 5 | types: 6 | - opened 7 | - reopened 8 | - edited 9 | - synchronize 10 | 11 | jobs: 12 | lint: 13 | runs-on: ubuntu-latest 14 | permissions: 15 | statuses: write 16 | steps: 17 | - uses: aslafy-z/conventional-pr-title-action@v3 18 | env: 19 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} -------------------------------------------------------------------------------- /.github/workflows/snyk.yml: -------------------------------------------------------------------------------- 1 | name: Snyk 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | schedule: 7 | - cron: "0 9 * * 1" # runs each Monday at 9:00 UTC 8 | 9 | permissions: 10 | contents: read 11 | security-events: write 12 | 13 | jobs: 14 | security: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v4 18 | - name: Run Snyk to check for vulnerabilities 19 | uses: 
snyk/actions/python-3.8@master 20 | continue-on-error: true # To make sure that SARIF upload gets called 21 | env: 22 | SNYK_TOKEN: ${{ secrets.SNYK_TOKEN }} 23 | with: 24 | args: --severity-threshold=high --sarif-file-output=snyk.sarif 25 | - name: Upload result to GitHub Code Scanning 26 | uses: github/codeql-action/upload-sarif@v3 27 | with: 28 | sarif_file: snyk.sarif 29 | -------------------------------------------------------------------------------- /.github/workflows/static-checking.yml: -------------------------------------------------------------------------------- 1 | name: Static Checking 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: 7 | - main 8 | pull_request: 9 | branches: 10 | - main 11 | 12 | permissions: 13 | contents: read 14 | 15 | jobs: 16 | Check: 17 | 18 | runs-on: ubuntu-latest 19 | strategy: 20 | matrix: 21 | python-version: [3.9] 22 | 23 | steps: 24 | - uses: actions/checkout@v4 25 | - name: Set up Python ${{ matrix.python-version }} 26 | uses: actions/setup-python@v5 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | - name: Install Requirements 30 | run: | 31 | python -m pip install --upgrade pip 32 | python -m pip install poetry 33 | poetry config virtualenvs.create false --local 34 | poetry install --all-extras -vvv 35 | - name: ruff format check 36 | run: ruff format --check . 37 | - name: ruff check 38 | run: ruff check --output-format=github . 39 | - name: mypy check 40 | run: mypy --install-types --non-interactive awswrangler 41 | - name: Documentation check 42 | run: doc8 --max-line-length 120 docs/source 43 | - name: Check poetry.lock consistency with pyproject.toml 44 | run: poetry check --lock 45 | -------------------------------------------------------------------------------- /.github/workflows/unlabel-assigned-issue.yml: -------------------------------------------------------------------------------- 1 | name: Unlabel Assigned Issues 2 | on: 3 | issues: 4 | types: 5 | - assigned 6 | permissions: 7 | contents: read 8 | 9 | jobs: 10 | unlabel-issue: 11 | permissions: 12 | issues: write # for andymckay/labeler to label issues 13 | pull-requests: write # for andymckay/labeler to label PRs 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: unlabel-issues 17 | uses: andymckay/labeler@master 18 | with: 19 | remove-labels: "needs-triage" 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | *__pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # poetry 30 | poetry.toml 31 | envs.toml 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | .hypothesis/ 54 | *.pytest_cache/ 55 | test-reports/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # celery beat schedule file 90 | celerybeat-schedule 91 | 92 | # SageMath parsed files 93 | *.sage.py 94 | 95 | # Environments 96 | .env 97 | *venv*/ 98 | *env/ 99 | venv*/ 100 | ENV/ 101 | env.bak/ 102 | venv.bak/ 103 | 104 | # Spyder project settings 105 | .spyderproject 106 | .spyproject 107 | 108 | 109 | # Pycharm project settings 110 | .idea/ 111 | 112 | # Visual Studio Code project settings 113 | .vscode/ 114 | 115 | # Rope project settings 116 | .ropeproject 117 | 118 | # mkdocs documentation 119 | /site 120 | 121 | # mypy 122 | .mypy_cache/ 123 | .dmypy.json 124 | dmypy.json 125 | 126 | # Pyre type checker 127 | .pyre/ 128 | 129 | # MacOS 130 | .DS_Store 131 | 132 | # Files generated by AWS Cloudformation package 133 | output/ 134 | 135 | # Development 136 | /dev/ 137 | metrics/ 138 | python/ 139 | notes.txt 140 | 141 | # SAM 142 | .aws-sam 143 | coverage/* 144 | building/*requirements*.txt 145 | building/arrow 146 | building/lambda/arrow 147 | /docs/coverage/ 148 | /docs/build/ 149 | /docs/source/_build/ 150 | /docs/source/stubs/ 151 | 152 | # Swap 153 | *.swp 154 | 155 | # CDK 156 | node_modules 157 | *package.json 158 | *package-lock.json 159 | *.cdk.staging 160 | *cdk.out 161 | *cdk.context.json 162 | 163 | # ruff 164 | .ruff_cache/ -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | formats: all 3 | 4 | build: 5 | os: ubuntu-22.04 6 | tools: 7 | python: miniconda3-4.7 8 | 9 | conda: 10 | environment: docs/environment.yml 11 | 12 | sphinx: 13 | configuration: docs/source/conf.py 14 | -------------------------------------------------------------------------------- /.snyk: -------------------------------------------------------------------------------- 1 | ignore: 2 | 'SNYK-PYTHON-RDFLIB-1324490': 3 | - '* > rdflib': 4 | reason: 'No fix available' 5 | expires: '2023-06-01T00:00:00.000Z' -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | AWS SDK for pandas 2 | Copyright 2019 Amazon.com, Inc. or its affiliates. 
All Rights Reserved. 3 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 3.12.0 -------------------------------------------------------------------------------- /adr/0001-record-architecture-decisions.md: -------------------------------------------------------------------------------- 1 | # 1. Record architecture decisions 2 | 3 | Date: 2023-03-08 4 | 5 | ## Status 6 | 7 | Accepted 8 | 9 | ## Context 10 | 11 | We need to record the architectural decisions made on this project. 12 | 13 | ## Decision 14 | 15 | We will use Architecture Decision Records, as [described by Michael Nygard](http://thinkrelevance.com/blog/2011/11/15/documenting-architecture-decisions). 16 | 17 | ## Consequences 18 | 19 | See Michael Nygard's article, linked above. For a lightweight ADR toolset, see Nat Pryce's [adr-tools](https://github.com/npryce/adr-tools). 20 | -------------------------------------------------------------------------------- /adr/0002-handling-unsupported-arguments-in-distributed-mode.md: -------------------------------------------------------------------------------- 1 | # 2. Handling unsupported arguments in distributed mode 2 | 3 | Date: 2023-03-09 4 | 5 | ## Status 6 | 7 | Accepted 8 | 9 | ## Context 10 | 11 | Many of the API functions allow the user to pass their own `boto3` session, which will then be used by all the underlying `boto3` calls. With distributed computing, one of the limitations we have is that we cannot pass the `boto3` session to the worker nodes. 12 | 13 | Boto3 sessions are not thread-safe, and therefore cannot be passed to Ray workers. The credentials behind a `boto3` session cannot be sent to Ray workers either, since sending credentials over the network is considered a security risk. 14 | 15 | This raises the question of what to do when, in distributed mode, the customer passes arguments that are normally supported but aren’t supported in distributed mode. 16 | 17 | ## Decision 18 | 19 | When a user passes arguments that are unsupported in distributed mode, the function should fail immediately. 20 | 21 | The main alternative to this approach would be to use a parameter such as a `boto3` session where possible whenever it is passed. This could result in a situation where, when reading Parquet files from S3, the process of listing the files uses the `boto3` session whereas the reading of the Parquet files doesn’t. This could result in inconsistent behavior, as part of the function uses the extra parameters while the other part of it doesn’t. 22 | 23 | Another alternative would simply be to ignore the unsupported parameters, while potentially outputting a warning. The main issue with this approach is that if a customer tells our API functions to use certain parameters, they expect those parameters to be used. By ignoring them, the AWS SDK for pandas API would be doing something different from what the customer asked, without properly notifying them, and would thus lose the customer’s trust. 24 | 25 | ## Consequences 26 | 27 | In [PR#2051](https://github.com/aws/aws-sdk-pandas/pull/2051), the `validate_distributed_kwargs` annotation was introduced, which checks for the presence of arguments that are unsupported in distributed mode. 28 | 29 | The annotation has also been applied for arguments such as `s3_additional_kwargs` and `version_id` when reading/writing data on S3.
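
As a rough illustration of the fail-fast behaviour described above, a guard decorator could look like the sketch below. This is not the library's actual implementation; the `_is_distributed` helper, the decorator name and the argument list are assumptions made for the example.

```python
from __future__ import annotations

import functools
from typing import Any, Callable


def _is_distributed() -> bool:
    """Hypothetical stand-in for the engine check awswrangler performs internally."""
    return False


def validate_distributed_kwargs_sketch(
    unsupported: tuple[str, ...],
) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
    """Raise immediately if unsupported arguments are passed while running distributed."""

    def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
        @functools.wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            if _is_distributed():
                passed = [name for name in unsupported if kwargs.get(name) is not None]
                if passed:
                    raise ValueError(f"`{func.__name__}` does not support {passed} in distributed mode.")
            return func(*args, **kwargs)

        return wrapper

    return decorator


@validate_distributed_kwargs_sketch(unsupported=("boto3_session", "s3_additional_kwargs", "version_id"))
def read_parquet_example(path: str, **kwargs: Any) -> None:
    """Placeholder function used only to show where the guard sits."""
```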
30 | 31 | -------------------------------------------------------------------------------- /adr/0003-use-typeddict-to-group-similar-parameters.md: -------------------------------------------------------------------------------- 1 | # 3. Use TypedDict to group similar parameters 2 | 3 | Date: 2023-03-10 4 | 5 | ## Status 6 | 7 | Accepted 8 | 9 | ## Context 10 | 11 | *AWS SDK for pandas* API methods contain many parameters which are related to a specific behaviour or setting. For example, methods which have an option to update the AWS Glue Catalog, such as `to_csv` and `to_parquet`, contain a list of parameters that define the settings for the table in AWS Glue. These settings include the table description, column comments, the table type, etc. 12 | 13 | As a consequence, some of our functions have grown to include dozens of parameters. When reading the function signatures, it can be unclear which parameters are related to which functionality. For example, it's not immediately obvious that the parameter `column_comments` in `s3.to_parquet` only writes the column comments into the AWS Glue catalog, and not to S3. 14 | 15 | ## Decision 16 | 17 | Parameters that are related to similar functionality will be replaced by a single parameter of type [TypedDict](https://peps.python.org/pep-0589/). This will allow us to reduce the number of parameters for our API functions, and also make it clearer that certain parameters are only related to specific functionalities. 18 | 19 | For example, parameters related to Athena cache settings will be extracted into a parameter of type `AthenaCacheSettings`, parameters related to Ray settings will be extracted into `RayReadParquetSettings`, etc. 20 | 21 | The usage of `TypedDict` allows the user to define the parameters as regular dictionaries with string keys, while empowering type checkers such as `mypy`. Alternatively, implementations such as `AthenaCacheSettings` can be instantiated as classes. 22 | 23 | ### Alternatives 24 | 25 | The main alternative that was considered was the idea of using `dataclass` instead of `TypedDict`. The advantage of this alternative would be that default values for parameters could be defined directly in the class signature, rather than needing to be defined in the function which uses the parameter. 26 | 27 | On the other hand, the main issue with using `dataclass` is that it would require the customer to figure out which class needs to be imported. With `TypedDict`, this is just one of the options; the parameters can simply be passed as a typical Python dictionary. 28 | 29 | This alternative was discussed in more detail as part of [PR#1855](https://github.com/aws/aws-sdk-pandas/pull/1855#issuecomment-1353618099). 30 | 31 | ## Consequences 32 | 33 | Subclasses of `TypedDict` such as `GlueCatalogParameters`, `AthenaCacheSettings`, `AthenaUNLOADSettings`, `AthenaCTASSettings` and `RaySettings` have been created. They are defined in the `awswrangler.typing` module. 34 | 35 | These parameter groupings can be used in either of the following two ways: 36 | ```python 37 | wr.athena.read_sql_query( 38 | "SELECT * FROM ...", 39 | ctas_approach=True, 40 | athena_cache_settings={"max_cache_seconds": 900}, 41 | ) 42 | 43 | wr.athena.read_sql_query( 44 | "SELECT * FROM ...", 45 | ctas_approach=True, 46 | athena_cache_settings=wr.typing.AthenaCacheSettings( 47 | max_cache_seconds=900, 48 | ), 49 | ) 50 | ``` 51 | 52 | Many of our function signatures have been changed to take advantage of this refactor. Many of these are breaking changes which will be released as part of the next major version: `3.0.0`.
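
For context, such a grouping is nothing more than a `TypedDict` declaration. A minimal sketch is shown below; the class name and field names are illustrative only, while the real definitions live in `awswrangler.typing`.

```python
from typing import TypedDict


class AthenaCacheSettingsSketch(TypedDict, total=False):
    """Illustrative subset of a cache-settings grouping."""

    max_cache_seconds: int
    max_cache_query_inspections: int


# Both spellings satisfy the same contract for a type checker such as mypy:
settings_as_dict: AthenaCacheSettingsSketch = {"max_cache_seconds": 900}
settings_as_call = AthenaCacheSettingsSketch(max_cache_seconds=900)
```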
53 | -------------------------------------------------------------------------------- /adr/0004-no-alter-iam-permissions.md: -------------------------------------------------------------------------------- 1 | # 4. AWS SDK for pandas does not alter IAM permissions 2 | 3 | Date: 2023-03-15 4 | 5 | ## Status 6 | 7 | Accepted 8 | 9 | ## Context 10 | 11 | AWS SDK for pandas requires permissions to execute AWS API calls. Permissions are granted using AWS Identity and 12 | Access Management Policies that are attached to IAM entities - users or roles. 13 | 14 | ## Decision 15 | 16 | AWS SDK for pandas does not alter (create, update, delete) IAM permissions policies attached to the IAM entities. 17 | 18 | ## Consequences 19 | 20 | It is the user's responsibility to ensure that the IAM entities they use to execute the calls have the required permissions. -------------------------------------------------------------------------------- /adr/0005-move-dependencies-to-optional.md: -------------------------------------------------------------------------------- 1 | # 5. Move dependencies to optional 2 | 3 | Date: 2023-03-15 4 | 5 | ## Status 6 | 7 | Accepted 8 | 9 | ## Context 10 | 11 | AWS SDK for pandas relies on external dependencies in some of its modules. These include `redshift-connector`, `gremlinpython` and `pymysql`, to name a few. 12 | 13 | In versions 2.x and below, most of these packages were set as required, meaning they were installed regardless of whether the user actually needed them. This has introduced two major risks and issues as the number of dependencies increased: 14 | 1. **Security risk**: Unused dependencies increase the attack surface to manage. Users must scan them and ensure that they are kept up to date even though they don't need them 15 | 2. **Dependency hell**: Users must resolve dependencies for packages that they are not using. It can lead to dependency hell and prevent critical updates related to security patches and major bugs 16 | 17 | ## Decision 18 | 19 | A breaking change is introduced in version 3.x where the number of required dependencies is reduced to the most important ones, namely: 20 | * boto3 21 | * pandas 22 | * numpy 23 | * pyarrow 24 | * typing-extensions 25 | 26 | ## Consequences 27 | 28 | All other dependencies are moved to optional and must be installed by the user separately using `pip install awswrangler[dependency]`. For instance, the command to use the Redshift APIs is `pip install awswrangler[redshift]`. Failing to do so raises an exception informing the user that the package is missing and how to install it. 29 | -------------------------------------------------------------------------------- /adr/0006-deprecate-s3-merge-upsert-table.md: -------------------------------------------------------------------------------- 1 | # 6. Deprecate wr.s3.merge_upsert_table 2 | 3 | Date: 2023-03-15 4 | 5 | ## Status 6 | 7 | Accepted 8 | 9 | ## Context 10 | 11 | AWS SDK for pandas `wr.s3.merge_upsert_table` is used to perform upsert (update else insert) onto an existing AWS Glue 12 | Data Catalog table. It is a much simplified version of upsert functionality that is supported natively by Apache Hudi 13 | and Athena Iceberg tables, and does not, for example, handle partitioned datasets. 14 | 15 | ## Decision 16 | 17 | To avoid a poor user experience, `wr.s3.merge_upsert_table` is deprecated and will be removed in the 3.0 release.
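
The repository's own `awswrangler/annotations.py` (reproduced further down in this dump) provides the general mechanism for surfacing this kind of deprecation; a minimal sketch of the idea, with illustrative names only, could look like this.

```python
import warnings
from functools import wraps
from typing import Any, Callable


def deprecated_sketch(message: str) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
    """Warn callers that a function is scheduled for removal."""

    def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
        @wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            warnings.warn(f"`{func.__name__}`: {message}", DeprecationWarning, stacklevel=2)
            return func(*args, **kwargs)

        return wrapper

    return decorator


@deprecated_sketch("deprecated and scheduled for removal in the 3.0 release")
def merge_upsert_table_example(*args: Any, **kwargs: Any) -> None:
    """Placeholder standing in for the deprecated API."""
```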
18 | 19 | ## Consequences 20 | 21 | In [PR#2076](https://github.com/aws/aws-sdk-pandas/pull/2076), the `wr.s3.merge_upsert_table` function was removed. 22 | -------------------------------------------------------------------------------- /adr/0007-design-of-engine-and-memory-format.md: -------------------------------------------------------------------------------- 1 | # 7. Design of engine and memory format 2 | 3 | Date: 2023-03-16 4 | 5 | ## Status 6 | 7 | Accepted 8 | 9 | ## Context 10 | 11 | Ray and Modin are the two frameworks used to support running `awswrangler` APIs at scale. Adding them to the codebase requires significant refactoring work. The original approach considered was to handle both distributed and non-distributed code within the same modules. This quickly turned out to be undesirable as it affected the readability, maintainability and scalability of the codebase. 12 | 13 | ## Decision 14 | 15 | Version 3.x of the library introduces two new constructs, `engine` and `memory_format`, which are designed to address the aforementioned shortcomings of the original approach, but also provide additional functionality. 16 | 17 | Currently `engine` takes one of two values: `python` (default) or `ray`, but additional engines could be onboarded in the future. The value is determined at import based on installed dependencies. The user can override this value with `wr.engine.set("engine_name")`. Likewise, `memory_format` can be set to `pandas` (default) or `modin` and overridden with `wr.memory_format.set("memory_format_name")`. 18 | 19 | A custom dispatcher is used to register functions based on the execution and memory format values. For instance, if the `ray` engine is detected at import, then methods distributed with Ray are used instead of the default AWS SDK for pandas code. 20 | 21 | ## Consequences 22 | 23 | __The good__: 24 | 25 | *Clear separation of concerns*: Distributed methods live outside non-distributed code, eliminating ugly if conditionals, allowing both to scale independently and making them easier to maintain in the future 26 | 27 | *Better dispatching*: Adding a new engine/memory format is as simple as creating a new directory with its methods and registering them with the custom dispatcher based on the value of the engine or memory format 28 | 29 | *Custom engine/memory format classes*: Give more flexibility than config when it comes to interacting with the engine and managing its state (initialising, registering, get/setting...) 30 | 31 | __The bad__: 32 | 33 | *Managing state*: Adding a custom dispatcher means that we must maintain its state. For instance, unregistering methods when a user sets a different engine (e.g. moving from ray to dask at execution time) is currently unsupported 34 | 35 | *Detecting the engine*: Conditionals are simpler/easier when it comes to detecting an engine. With a custom dispatcher, the registration and dispatching process is more opaque/convoluted. For example, there is a higher risk of not realising that we are using a given engine vs another 36 | 37 | __The ugly__: 38 | 39 | *Unused arguments*: Each method registered with the dispatcher must accept the union of both non-distributed and distributed arguments, even though some would be unused. As the list of supported engines grows, so does the number of unused arguments.
It also means that we must maintain the same list of arguments across the different versions of the method. -------------------------------------------------------------------------------- /adr/0008-switching-between-pyarrow-and-pandas-based-datasources-for-csv-json-i-o.md: -------------------------------------------------------------------------------- 1 | # 8. Switching between PyArrow and Pandas based datasources for CSV/JSON I/O 2 | 3 | Date: 2023-03-16 4 | 5 | ## Status 6 | 7 | Accepted 8 | 9 | ## Context 10 | 11 | The reading and writing operations for CSV/JSON data in *AWS SDK for pandas* make use of the underlying functions in Pandas. For example, `wr.s3.read_csv` will open a stream of data from S3 and then invoke `pandas.read_csv`. This allows the library to fully support all the arguments which are supported by the underlying Pandas functions. Functions such as `wr.s3.read_csv` or `wr.s3.to_json` accept a `**kwargs` parameter which forwards all parameters to `pandas.read_csv` and `pandas.to_json` automatically. 12 | 13 | From version 3.0.0 onward, *AWS SDK for pandas* supports Ray and Modin. When those two libraries are installed, all aforementioned I/O functions will be distributed on a Ray cluster. In the background, this means that all the I/O functions for S3 are running as part of a [custom Ray data source](https://docs.ray.io/en/latest/_modules/ray/data/datasource/datasource.html). Data is then returned in blocks, which form the Modin DataFrame. 14 | 15 | The issue is that the Pandas I/O functions work very slowly in the Ray datasource compared with the equivalent I/O functions in PyArrow. Therefore, calling `pyarrow.csv.read_csv` is significantly faster than calling `pandas.read_csv` in the background. 16 | 17 | However, the PyArrow I/O functions do not support the same set of parameters as the ones in Pandas. As a consequence, whereas the PyArrow functions offer greater performance, they come at the cost of feature parity between the non-distributed mode and the distributed mode. 18 | 19 | For reference, loading 5 GiB of CSV data with the PyArrow functions took around 30 seconds, compared to 120 seconds with the Pandas functions in the same scenario. 20 | For writing back to S3, the speed-up is around 2x. 21 | 22 | ## Decision 23 | 24 | To maximize performance without losing feature parity, we implemented logic whereby if the user passes a set of parameters which are supported by PyArrow, the library uses PyArrow for reading/writing. If not, the library defaults to the slower Pandas functions, which support the full set of parameters. 25 | 26 | The following example illustrates the difference: 27 | 28 | ```python 29 | # This will be loaded by PyArrow, as `doublequote` is supported 30 | wr.s3.read_csv( 31 | path="s3://my-bucket/my-path/", 32 | dataset=True, 33 | doublequote=False, 34 | ) 35 | 36 | # This will be loaded using the Pandas I/O functions, as `comment` is not supported by PyArrow 37 | wr.s3.read_csv( 38 | path="s3://my-bucket/my-path/", 39 | dataset=True, 40 | comment="#", 41 | ) 42 | ``` 43 | 44 | This logic is applied to the following functions: 45 | 1. `wr.s3.read_csv` 46 | 2. `wr.s3.read_json` 47 | 3. `wr.s3.to_json` 48 | 4. `wr.s3.to_csv` 49 | 50 | ## Consequences 51 | 52 | The logic of switching between using PyArrow or Pandas functions in the background was implemented as part of [#1699](https://github.com/aws/aws-sdk-pandas/pull/1699).
It was later expanded to support more parameters in [#2008](https://github.com/aws/aws-sdk-pandas/pull/2008) and [#2019](https://github.com/aws/aws-sdk-pandas/pull/2019). 53 | -------------------------------------------------------------------------------- /adr/0009-lazy-engine-initialization.md: -------------------------------------------------------------------------------- 1 | # 9. Engine selection and lazy initialization 2 | 3 | Date: 2023-05-17 4 | 5 | ## Status 6 | 7 | Accepted 8 | 9 | ## Context 10 | 11 | In distributed mode, three approaches are possible when it comes to selecting and initializing a Ray engine: 12 | 1. Initialize the Ray runtime at import (current default). This option causes the least friction to the user but assumes that installing Ray as an optional dependency is enough to enable distributed mode. Moreover, the user cannot prevent/delay Ray initialization (as it's done at import) 13 | 2. Initialize the Ray runtime on the first distributed API call. The user can prevent Ray initialization by switching the engine/memory format with environment variables or between import and the first awswrangler distributed API call. However, by default this approach still assumes that installing Ray is equivalent to enabling distributed mode 14 | 3. Wait for the user to enable distributed mode, via environment variables and/or via `wr.engine.set`. This option makes no assumption on which mode to use (distributed vs non-distributed). Non-distributed would be the default and it's up to the user to switch the engine/memory format 15 | 16 | ## Decision 17 | 18 | Option #1 is inflexible and gives little control to the user, while option #3 introduces too much friction and puts the burden on the user. Option #2 on the other hand gives full flexibility to the user while providing a sane default. 19 | 20 | ## Consequences 21 | 22 | The only difference between the current default and the suggested approach is to delay engine initialization, which is not a breaking change. However, it means that in certain situations more than one Ray instance is initialized. For instance, when running tests across multiple threads, each thread runs its own Ray runtime. 23 | -------------------------------------------------------------------------------- /awswrangler/__init__.py: -------------------------------------------------------------------------------- 1 | """Initial Module. 
2 | 3 | Source repository: https://github.com/aws/aws-sdk-pandas 4 | Documentation: https://aws-sdk-pandas.readthedocs.io/ 5 | 6 | """ 7 | 8 | import logging as _logging 9 | 10 | from awswrangler import ( 11 | athena, 12 | catalog, 13 | chime, 14 | cleanrooms, 15 | cloudwatch, 16 | data_api, 17 | data_quality, 18 | dynamodb, 19 | emr, 20 | emr_serverless, 21 | exceptions, 22 | mysql, 23 | neptune, 24 | opensearch, 25 | oracle, 26 | postgresql, 27 | quicksight, 28 | redshift, 29 | s3, 30 | secretsmanager, 31 | sqlserver, 32 | sts, 33 | timestream, 34 | typing, 35 | ) 36 | from awswrangler.__metadata__ import __description__, __license__, __title__, __version__ 37 | from awswrangler._config import config 38 | from awswrangler._distributed import EngineEnum, MemoryFormatEnum, engine, memory_format 39 | 40 | engine.register() 41 | 42 | __all__ = [ 43 | "athena", 44 | "catalog", 45 | "chime", 46 | "cleanrooms", 47 | "cloudwatch", 48 | "emr", 49 | "emr_serverless", 50 | "data_api", 51 | "data_quality", 52 | "dynamodb", 53 | "exceptions", 54 | "opensearch", 55 | "oracle", 56 | "quicksight", 57 | "s3", 58 | "sts", 59 | "redshift", 60 | "mysql", 61 | "neptune", 62 | "postgresql", 63 | "secretsmanager", 64 | "sqlserver", 65 | "config", 66 | "engine", 67 | "memory_format", 68 | "timestream", 69 | "typing", 70 | "__description__", 71 | "__license__", 72 | "__title__", 73 | "__version__", 74 | "EngineEnum", 75 | "MemoryFormatEnum", 76 | ] 77 | 78 | 79 | _logging.getLogger("awswrangler").addHandler(_logging.NullHandler()) 80 | -------------------------------------------------------------------------------- /awswrangler/__metadata__.py: -------------------------------------------------------------------------------- 1 | """Metadata Module. 2 | 3 | Source repository: https://github.com/aws/aws-sdk-pandas 4 | Documentation: https://aws-sdk-pandas.readthedocs.io/ 5 | 6 | """ 7 | 8 | __title__: str = "awswrangler" 9 | __description__: str = "Pandas on AWS." 
10 | __version__: str = "3.12.0" 11 | __license__: str = "Apache License 2.0" 12 | -------------------------------------------------------------------------------- /awswrangler/_executor.py: -------------------------------------------------------------------------------- 1 | """Executor Module (PRIVATE).""" 2 | 3 | from __future__ import annotations 4 | 5 | import concurrent.futures 6 | import itertools 7 | import logging 8 | from abc import ABC, abstractmethod 9 | from typing import TYPE_CHECKING, Any, Callable, TypeVar 10 | 11 | from awswrangler import _utils 12 | from awswrangler._distributed import engine 13 | 14 | if TYPE_CHECKING: 15 | from botocore.client import BaseClient 16 | 17 | _logger: logging.Logger = logging.getLogger(__name__) 18 | 19 | 20 | MapOutputType = TypeVar("MapOutputType") 21 | 22 | 23 | class _BaseExecutor(ABC): 24 | def __init__(self) -> None: 25 | _logger.debug("Creating an %s executor: ", self.__class__) 26 | 27 | @abstractmethod 28 | def map( 29 | self, 30 | func: Callable[..., MapOutputType], 31 | boto3_client: "BaseClient" | None, 32 | *args: Any, 33 | ) -> list[MapOutputType]: 34 | pass 35 | 36 | 37 | class _ThreadPoolExecutor(_BaseExecutor): 38 | def __init__(self, use_threads: bool | int): 39 | super().__init__() 40 | self._exec: concurrent.futures.ThreadPoolExecutor | None = None 41 | self._cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) 42 | if self._cpus > 1: 43 | _logger.debug("Initializing ThreadPoolExecutor with %d workers", self._cpus) 44 | self._exec = concurrent.futures.ThreadPoolExecutor(max_workers=self._cpus) 45 | 46 | def map( 47 | self, func: Callable[..., MapOutputType], boto3_client: "BaseClient" | None, *args: Any 48 | ) -> list[MapOutputType]: 49 | """Map iterables to multi-threaded function.""" 50 | _logger.debug("Map: %s", func) 51 | if self._exec is not None: 52 | iterables = (itertools.repeat(boto3_client), *args) 53 | return list(self._exec.map(func, *iterables)) 54 | # Single-threaded 55 | return list(map(func, *(itertools.repeat(boto3_client), *args))) 56 | 57 | 58 | @engine.dispatch_on_engine 59 | def _get_executor(use_threads: bool | int, **kwargs: Any) -> _BaseExecutor: 60 | # kwargs allows for parameter that will be used by other variants of this function, 61 | # such as `parallelism` for _get_ray_executor 62 | return _ThreadPoolExecutor(use_threads) 63 | -------------------------------------------------------------------------------- /awswrangler/_sql_utils.py: -------------------------------------------------------------------------------- 1 | """SQL utilities.""" 2 | 3 | import re 4 | 5 | from awswrangler import exceptions 6 | 7 | 8 | def identifier(sql: str, sql_mode: str = "mysql") -> str: 9 | """ 10 | Turn the input into an escaped SQL identifier, such as the name of a table or column. 11 | 12 | sql: str 13 | Identifier to use in SQL. 14 | sql_mode: str 15 | "mysql" for default MySQL identifiers (backticks), "ansi" for ANSI-compatible identifiers (double quotes), or 16 | "mssql" for MSSQL identifiers (square brackets). 17 | 18 | Returns 19 | ------- 20 | str 21 | Escaped SQL identifier. 
22 | """ 23 | if not isinstance(sql, str): 24 | raise exceptions.InvalidArgumentValue("identifier must be a str") 25 | 26 | if len(sql) == 0: 27 | raise exceptions.InvalidArgumentValue("identifier must be > 0 characters in length") 28 | 29 | if re.search(r"[^a-zA-Z0-9-_ ]", sql): 30 | raise exceptions.InvalidArgumentValue( 31 | "identifier must contain only alphanumeric characters, spaces, underscores, or hyphens" 32 | ) 33 | 34 | if sql_mode == "mysql": 35 | return f"`{sql}`" 36 | elif sql_mode == "ansi": 37 | return f'"{sql}"' 38 | elif sql_mode == "mssql": 39 | return f"[{sql}]" 40 | 41 | raise ValueError(f"Unknown SQL MODE: {sql_mode}") 42 | -------------------------------------------------------------------------------- /awswrangler/annotations.py: -------------------------------------------------------------------------------- 1 | """Annotations Module.""" 2 | 3 | from __future__ import annotations 4 | 5 | import warnings 6 | from functools import wraps 7 | from typing import Any, Callable, TypeVar, cast 8 | 9 | from awswrangler._config import _insert_str, config 10 | 11 | FunctionType = TypeVar("FunctionType", bound=Callable[..., Any]) 12 | 13 | 14 | class SDKPandasDeprecatedWarning(Warning): 15 | """Deprecated Warning.""" 16 | 17 | 18 | class SDKPandasExperimentalWarning(Warning): 19 | """Experimental Warning.""" 20 | 21 | 22 | def _inject_note( 23 | doc: str | None, 24 | message: str, 25 | ) -> str | None: 26 | token: str = "\n Parameters" 27 | if not doc or token not in doc: 28 | return doc 29 | note: str = f"\n\n Warning\n ----\n {message}\n\n" 30 | return _insert_str(text=doc, token=token, insert=note) 31 | 32 | 33 | def warn_message( 34 | message: str, 35 | warning_class: type[Warning], 36 | stacklevel: int = 2, 37 | ) -> Callable[[FunctionType], FunctionType]: 38 | """Decorate functions with this to print warnings.""" 39 | 40 | def decorator(func: FunctionType) -> FunctionType: 41 | @wraps(func) 42 | def inner(*args: Any, **kwargs: Any) -> Any: 43 | if not config.suppress_warnings: 44 | warnings.warn(f"`{func.__name__}`: {message}", warning_class, stacklevel=stacklevel) 45 | 46 | return func(*args, **kwargs) 47 | 48 | inner.__doc__ = _inject_note( 49 | doc=func.__doc__, 50 | message=message, 51 | ) 52 | 53 | return cast(FunctionType, inner) 54 | 55 | return decorator 56 | 57 | 58 | Deprecated = warn_message( 59 | "This API is deprecated and will be removed in future AWS SDK for Pandas releases. ", 60 | SDKPandasDeprecatedWarning, 61 | ) 62 | 63 | 64 | Experimental = warn_message( 65 | "This API is experimental and may change in future AWS SDK for Pandas releases. 
", 66 | SDKPandasExperimentalWarning, 67 | ) 68 | -------------------------------------------------------------------------------- /awswrangler/athena/__init__.py: -------------------------------------------------------------------------------- 1 | """Amazon Athena Module.""" 2 | 3 | from awswrangler.athena._executions import ( # noqa 4 | get_query_execution, 5 | stop_query_execution, 6 | start_query_execution, 7 | wait_query, 8 | ) 9 | from awswrangler.athena._spark import create_spark_session, run_spark_calculation 10 | from awswrangler.athena._statements import ( 11 | create_prepared_statement, 12 | delete_prepared_statement, 13 | list_prepared_statements, 14 | ) 15 | from awswrangler.athena._read import ( 16 | get_query_results, 17 | read_sql_query, 18 | read_sql_table, 19 | unload, 20 | ) 21 | from awswrangler.athena._utils import ( 22 | create_athena_bucket, 23 | create_ctas_table, 24 | describe_table, 25 | generate_create_query, 26 | get_named_query_statement, 27 | get_query_columns_types, 28 | get_query_executions, 29 | get_work_group, 30 | list_query_executions, 31 | repair_table, 32 | show_create_table, 33 | ) 34 | from awswrangler.athena._write_iceberg import to_iceberg, delete_from_iceberg_table 35 | 36 | 37 | __all__ = [ 38 | "read_sql_query", 39 | "read_sql_table", 40 | "create_athena_bucket", 41 | "describe_table", 42 | "get_query_columns_types", 43 | "get_query_execution", 44 | "get_query_executions", 45 | "get_query_results", 46 | "get_named_query_statement", 47 | "get_work_group", 48 | "generate_create_query", 49 | "list_query_executions", 50 | "repair_table", 51 | "create_spark_session", 52 | "run_spark_calculation", 53 | "create_ctas_table", 54 | "show_create_table", 55 | "start_query_execution", 56 | "stop_query_execution", 57 | "unload", 58 | "wait_query", 59 | "create_prepared_statement", 60 | "list_prepared_statements", 61 | "delete_prepared_statement", 62 | "to_iceberg", 63 | "delete_from_iceberg_table", 64 | ] 65 | -------------------------------------------------------------------------------- /awswrangler/athena/_executions.pyi: -------------------------------------------------------------------------------- 1 | from typing import ( 2 | Any, 3 | Literal, 4 | overload, 5 | ) 6 | 7 | import boto3 8 | 9 | from awswrangler import typing 10 | 11 | @overload 12 | def start_query_execution( 13 | sql: str, 14 | database: str | None = ..., 15 | s3_output: str | None = ..., 16 | workgroup: str = ..., 17 | encryption: str | None = ..., 18 | kms_key: str | None = ..., 19 | params: dict[str, Any] | list[str] | None = ..., 20 | paramstyle: Literal["qmark", "named"] = ..., 21 | boto3_session: boto3.Session | None = ..., 22 | athena_cache_settings: typing.AthenaCacheSettings | None = ..., 23 | athena_query_wait_polling_delay: float = ..., 24 | data_source: str | None = ..., 25 | wait: Literal[False] = ..., 26 | ) -> str: ... 27 | @overload 28 | def start_query_execution( 29 | sql: str, 30 | *, 31 | database: str | None = ..., 32 | s3_output: str | None = ..., 33 | workgroup: str = ..., 34 | encryption: str | None = ..., 35 | kms_key: str | None = ..., 36 | params: dict[str, Any] | list[str] | None = ..., 37 | paramstyle: Literal["qmark", "named"] = ..., 38 | boto3_session: boto3.Session | None = ..., 39 | athena_cache_settings: typing.AthenaCacheSettings | None = ..., 40 | athena_query_wait_polling_delay: float = ..., 41 | data_source: str | None = ..., 42 | wait: Literal[True], 43 | ) -> dict[str, Any]: ... 
44 | @overload 45 | def start_query_execution( 46 | sql: str, 47 | *, 48 | database: str | None = ..., 49 | s3_output: str | None = ..., 50 | workgroup: str = ..., 51 | encryption: str | None = ..., 52 | kms_key: str | None = ..., 53 | params: dict[str, Any] | list[str] | None = ..., 54 | paramstyle: Literal["qmark", "named"] = ..., 55 | boto3_session: boto3.Session | None = ..., 56 | athena_cache_settings: typing.AthenaCacheSettings | None = ..., 57 | athena_query_wait_polling_delay: float = ..., 58 | data_source: str | None = ..., 59 | wait: bool, 60 | ) -> str | dict[str, Any]: ... 61 | def stop_query_execution(query_execution_id: str, boto3_session: boto3.Session | None = ...) -> None: ... 62 | def wait_query( 63 | query_execution_id: str, 64 | boto3_session: boto3.Session | None = None, 65 | athena_query_wait_polling_delay: float = ..., 66 | ) -> dict[str, Any]: ... 67 | def get_query_execution(query_execution_id: str, boto3_session: boto3.Session | None = ...) -> dict[str, Any]: ... 68 | -------------------------------------------------------------------------------- /awswrangler/catalog/__init__.py: -------------------------------------------------------------------------------- 1 | """Amazon Glue Catalog Module.""" 2 | 3 | from awswrangler.catalog._add import ( 4 | add_column, 5 | add_csv_partitions, 6 | add_json_partitions, 7 | add_orc_partitions, 8 | add_parquet_partitions, 9 | ) 10 | from awswrangler.catalog._create import ( 11 | _create_csv_table, 12 | _create_json_table, 13 | _create_parquet_table, 14 | create_csv_table, 15 | create_database, 16 | create_json_table, 17 | create_orc_table, 18 | create_parquet_table, 19 | overwrite_table_parameters, 20 | upsert_table_parameters, 21 | ) 22 | from awswrangler.catalog._delete import ( 23 | delete_all_partitions, 24 | delete_column, 25 | delete_database, 26 | delete_partitions, 27 | delete_table_if_exists, 28 | ) 29 | from awswrangler.catalog._get import ( 30 | _get_table_input, 31 | databases, 32 | get_columns_comments, 33 | get_columns_parameters, 34 | get_connection, 35 | get_csv_partitions, 36 | get_databases, 37 | get_parquet_partitions, 38 | get_partitions, 39 | get_table_description, 40 | get_table_location, 41 | get_table_number_of_versions, 42 | get_table_parameters, 43 | get_table_types, 44 | get_table_versions, 45 | get_tables, 46 | search_tables, 47 | table, 48 | tables, 49 | ) 50 | from awswrangler.catalog._utils import ( 51 | does_table_exist, 52 | drop_duplicated_columns, 53 | extract_athena_types, 54 | rename_duplicated_columns, 55 | sanitize_column_name, 56 | sanitize_dataframe_columns_names, 57 | sanitize_table_name, 58 | ) 59 | 60 | __all__ = [ 61 | "add_column", 62 | "add_csv_partitions", 63 | "add_json_partitions", 64 | "add_parquet_partitions", 65 | "add_orc_partitions", 66 | "does_table_exist", 67 | "delete_column", 68 | "drop_duplicated_columns", 69 | "extract_athena_types", 70 | "rename_duplicated_columns", 71 | "sanitize_column_name", 72 | "sanitize_dataframe_columns_names", 73 | "sanitize_table_name", 74 | "_create_csv_table", 75 | "_create_parquet_table", 76 | "_create_json_table", 77 | "create_csv_table", 78 | "create_database", 79 | "create_parquet_table", 80 | "create_orc_table", 81 | "create_json_table", 82 | "overwrite_table_parameters", 83 | "upsert_table_parameters", 84 | "_get_table_input", 85 | "databases", 86 | "get_columns_comments", 87 | "get_columns_parameters", 88 | "get_connection", 89 | "get_csv_partitions", 90 | "get_databases", 91 | "get_parquet_partitions", 92 | "get_partitions", 93 | 
"get_table_description", 94 | "get_table_location", 95 | "get_table_number_of_versions", 96 | "get_table_parameters", 97 | "get_table_types", 98 | "get_table_versions", 99 | "get_tables", 100 | "search_tables", 101 | "table", 102 | "tables", 103 | "delete_database", 104 | "delete_table_if_exists", 105 | "delete_partitions", 106 | "delete_all_partitions", 107 | ] 108 | -------------------------------------------------------------------------------- /awswrangler/chime.py: -------------------------------------------------------------------------------- 1 | """Chime Message/Notification module.""" 2 | 3 | from __future__ import annotations 4 | 5 | import json 6 | import logging 7 | from typing import Any 8 | from urllib.error import HTTPError, URLError 9 | from urllib.request import Request, urlopen 10 | 11 | _logger: logging.Logger = logging.getLogger(__name__) 12 | 13 | 14 | def post_message(webhook: str, message: str) -> Any | None: 15 | """Send message on an existing Chime Chat rooms. 16 | 17 | Parameters 18 | ---------- 19 | webhook 20 | Contains all the authentication information to send the message 21 | message 22 | The actual message which needs to be posted on Slack channel 23 | 24 | Returns 25 | ------- 26 | The response from Chime 27 | """ 28 | response = None 29 | chime_message = {"Content": f"Message: {message}"} 30 | req = Request(webhook, json.dumps(chime_message).encode("utf-8")) 31 | try: 32 | response = urlopen(req) 33 | _logger.info("Message posted on Chime. Got respone as %s", response.read()) 34 | except HTTPError as e: 35 | _logger.exception("Request failed: %d %s", e.code, e.reason) 36 | except URLError as e: 37 | _logger.exception("Server connection failed: %s", e.reason) 38 | return response 39 | -------------------------------------------------------------------------------- /awswrangler/cleanrooms/__init__.py: -------------------------------------------------------------------------------- 1 | """Amazon Clean Rooms Module.""" 2 | 3 | from awswrangler.cleanrooms._read import read_sql_query 4 | from awswrangler.cleanrooms._utils import wait_query 5 | 6 | __all__ = [ 7 | "read_sql_query", 8 | "wait_query", 9 | ] 10 | -------------------------------------------------------------------------------- /awswrangler/cleanrooms/_utils.py: -------------------------------------------------------------------------------- 1 | """Utilities Module for Amazon Clean Rooms.""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | import time 7 | from typing import TYPE_CHECKING 8 | 9 | import boto3 10 | 11 | from awswrangler import _utils, exceptions 12 | 13 | if TYPE_CHECKING: 14 | from mypy_boto3_cleanrooms.type_defs import GetProtectedQueryOutputTypeDef 15 | 16 | _QUERY_FINAL_STATES: list[str] = ["CANCELLED", "FAILED", "SUCCESS", "TIMED_OUT"] 17 | _QUERY_WAIT_POLLING_DELAY: float = 2 # SECONDS 18 | 19 | _logger: logging.Logger = logging.getLogger(__name__) 20 | 21 | 22 | def wait_query( 23 | membership_id: str, query_id: str, boto3_session: boto3.Session | None = None 24 | ) -> "GetProtectedQueryOutputTypeDef": 25 | """Wait for the Clean Rooms protected query to end. 26 | 27 | Parameters 28 | ---------- 29 | membership_id 30 | Membership ID 31 | query_id 32 | Protected query execution ID 33 | boto3_session 34 | The default boto3 session will be used if **boto3_session** is ``None``. 35 | 36 | Returns 37 | ------- 38 | ``Dict[str, Any]`` 39 | Dictionary with the get_protected_query response. 
40 | 41 | Raises 42 | ------ 43 | exceptions.QueryFailed 44 | Raises exception with error message if protected query is cancelled, times out or fails. 45 | 46 | Examples 47 | -------- 48 | >>> import awswrangler as wr 49 | >>> res = wr.cleanrooms.wait_query(membership_id='membership-id', query_id='query-id') 50 | """ 51 | client_cleanrooms = _utils.client(service_name="cleanrooms", session=boto3_session) 52 | state = "SUBMITTED" 53 | 54 | while state not in _QUERY_FINAL_STATES: 55 | time.sleep(_QUERY_WAIT_POLLING_DELAY) 56 | response = client_cleanrooms.get_protected_query( 57 | membershipIdentifier=membership_id, protectedQueryIdentifier=query_id 58 | ) 59 | state = response["protectedQuery"].get("status") # type: ignore[assignment] 60 | 61 | _logger.debug("state: %s", state) 62 | if state != "SUCCESS": 63 | raise exceptions.QueryFailed(response["protectedQuery"].get("Error")) 64 | return response 65 | -------------------------------------------------------------------------------- /awswrangler/data_api/__init__.py: -------------------------------------------------------------------------------- 1 | """Data API Service Module for RDS and Redshift.""" 2 | 3 | from awswrangler.data_api import rds, redshift 4 | 5 | __all__ = [ 6 | "redshift", 7 | "rds", 8 | ] 9 | -------------------------------------------------------------------------------- /awswrangler/data_quality/__init__.py: -------------------------------------------------------------------------------- 1 | """AWS Glue Data Quality package.""" 2 | 3 | from awswrangler.data_quality._create import ( 4 | create_recommendation_ruleset, 5 | create_ruleset, 6 | evaluate_ruleset, 7 | update_ruleset, 8 | ) 9 | from awswrangler.data_quality._get import get_ruleset 10 | 11 | __all__ = [ 12 | "create_recommendation_ruleset", 13 | "create_ruleset", 14 | "evaluate_ruleset", 15 | "get_ruleset", 16 | "update_ruleset", 17 | ] 18 | -------------------------------------------------------------------------------- /awswrangler/data_quality/_get.py: -------------------------------------------------------------------------------- 1 | """AWS Glue Data Quality Get Module.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import cast 6 | 7 | import boto3 8 | 9 | import awswrangler.pandas as pd 10 | from awswrangler.data_quality._utils import _get_ruleset, _rules_to_df 11 | 12 | 13 | def get_ruleset( 14 | name: str | list[str], 15 | boto3_session: boto3.Session | None = None, 16 | ) -> pd.DataFrame: 17 | """Get a Data Quality ruleset. 18 | 19 | Parameters 20 | ---------- 21 | name 22 | Ruleset name or list of names. 23 | boto3_session 24 | The default boto3 session will be used if **boto3_session** is ``None``. 25 | 26 | Returns 27 | ------- 28 | Data frame with ruleset(s) details. 29 | 30 | Examples 31 | -------- 32 | Get single ruleset 33 | >>> import awswrangler as wr 34 | >>> df_ruleset = wr.data_quality.get_ruleset(name="my_ruleset") 35 | 36 | Get multiple rulesets. 
A column with the ruleset name is added to the data frame 37 | >>> import awswrangler as wr 38 | >>> df_rulesets = wr.data_quality.get_ruleset(name=["ruleset_1", "ruleset_2"]) 39 | """ 40 | ruleset_names: list[str] = name if isinstance(name, list) else [name] 41 | dfs: list[pd.DataFrame] = [] 42 | for ruleset_name in ruleset_names: 43 | rules = cast(str, _get_ruleset(ruleset_name=ruleset_name, boto3_session=boto3_session)["Ruleset"]) 44 | df = _rules_to_df(rules=rules) 45 | if len(ruleset_names) > 1: 46 | df["ruleset"] = ruleset_name 47 | dfs.append(df) 48 | return pd.concat(dfs) 49 | -------------------------------------------------------------------------------- /awswrangler/distributed/__init__.py: -------------------------------------------------------------------------------- 1 | """Distributed Module.""" 2 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/__init__.py: -------------------------------------------------------------------------------- 1 | """Ray Module.""" 2 | 3 | from awswrangler.distributed.ray._core import RayLogger, initialize_ray, ray_get, ray_logger, ray_remote 4 | 5 | __all__ = [ 6 | "RayLogger", 7 | "initialize_ray", 8 | "ray_get", 9 | "ray_logger", 10 | "ray_remote", 11 | ] 12 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/_core.pyi: -------------------------------------------------------------------------------- 1 | """Ray Module.""" 2 | 3 | import logging 4 | from typing import Any, Callable 5 | 6 | class RayLogger: 7 | def __init__( 8 | self, 9 | log_level: int = logging.INFO, 10 | format: str = "%(asctime)s::%(levelname)-2s::%(name)s::%(message)s", 11 | datefmt: str = "%Y-%m-%d %H:%M:%S", 12 | ): ... 13 | def get_logger(self, name: str | Any = None) -> logging.Logger: ... 14 | 15 | def ray_logger(function: Callable[..., Any]) -> Callable[..., Any]: ... 16 | def ray_remote(**options: Any) -> Callable[..., Any]: ... 17 | def ray_get(futures: list[Any]) -> Any: ... 18 | def initialize_ray( 19 | address: str | None = None, 20 | redis_password: str | None = None, 21 | ignore_reinit_error: bool | None = True, 22 | include_dashboard: bool | None = False, 23 | log_to_driver: bool | None = True, 24 | object_store_memory: int | None = None, 25 | cpu_count: int | None = None, 26 | gpu_count: int | None = 0, 27 | ) -> None: ... 
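# Editor's note (illustrative sketch, an assumption rather than upstream documentation):
# `ray_remote()` wraps a plain function so that, with the Ray engine active, calling it
# yields a future, and `ray_get()` resolves futures back into concrete values. Mirroring
# the pattern used elsewhere in this package (`ray_remote()(func)` followed by `ray_get`):
#
#     remote_func = ray_remote()(some_pure_function)  # `some_pure_function` is hypothetical
#     futures = [remote_func(item) for item in work_items]  # `work_items` is hypothetical
#     results = ray_get(futures)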
28 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/_executor.py: -------------------------------------------------------------------------------- 1 | """Ray Executor Module (PRIVATE).""" 2 | 3 | from __future__ import annotations 4 | 5 | import itertools 6 | import logging 7 | from typing import TYPE_CHECKING, Any, Callable, TypeVar 8 | 9 | import ray 10 | import ray.actor 11 | 12 | from awswrangler import engine 13 | from awswrangler._executor import _BaseExecutor 14 | 15 | if TYPE_CHECKING: 16 | from botocore.client import BaseClient 17 | 18 | _logger: logging.Logger = logging.getLogger(__name__) 19 | 20 | MapOutputType = TypeVar("MapOutputType") 21 | 22 | 23 | class _RayExecutor(_BaseExecutor): 24 | def map(self, func: Callable[..., MapOutputType], _: "BaseClient" | None, *args: Any) -> list[MapOutputType]: 25 | """Map func and return ray futures.""" 26 | _logger.debug("Ray map: %s", func) 27 | # Discard boto3 client 28 | return list(func(*arg) for arg in zip(itertools.repeat(None), *args)) 29 | 30 | 31 | @ray.remote 32 | class AsyncActor: 33 | async def run_concurrent(self, func: Callable[..., MapOutputType], *args: Any) -> MapOutputType: 34 | return func(*args) 35 | 36 | 37 | class _RayMaxConcurrencyExecutor(_BaseExecutor): 38 | def __init__(self, max_concurrency: int) -> None: 39 | super().__init__() 40 | 41 | _logger.debug("Initializing Ray Actor with maximum concurrency %d", max_concurrency) 42 | self._actor: ray.actor.ActorHandle = AsyncActor.options(max_concurrency=max_concurrency).remote() # type: ignore[attr-defined] 43 | 44 | def map(self, func: Callable[..., MapOutputType], _: "BaseClient" | None, *args: Any) -> list[MapOutputType]: 45 | """Map func and return ray futures.""" 46 | _logger.debug("Ray map: %s", func) 47 | 48 | # Discard boto3 client 49 | iterables = (itertools.repeat(None), *args) 50 | func_python = engine.dispatch_func(func, "python") 51 | 52 | return [self._actor.run_concurrent.remote(func_python, *arg) for arg in zip(*iterables)] 53 | 54 | 55 | def _get_ray_executor(use_threads: bool | int, **kwargs: Any) -> _BaseExecutor: 56 | # We want the _RayMaxConcurrencyExecutor only to be used when the `parallelism` parameter is specified 57 | parallelism: int | None = kwargs.get("ray_parallelism") 58 | return _RayMaxConcurrencyExecutor(parallelism) if parallelism else _RayExecutor() 59 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/_utils.py: -------------------------------------------------------------------------------- 1 | """Ray utilities (PRIVATE).""" 2 | 3 | from __future__ import annotations 4 | 5 | import ray 6 | from ray.util.placement_group import PlacementGroup 7 | 8 | 9 | # https://github.com/ray-project/ray/blob/master/python/ray/data/_internal/util.py#L87 10 | def _estimate_avail_cpus(cur_pg: PlacementGroup | None) -> int: 11 | """ 12 | Estimates the available CPU parallelism for this Dataset in the cluster. 13 | 14 | If we aren't in a placement group, this is trivially the number of CPUs in the 15 | cluster. Otherwise, we try to calculate how large the placement group is relative 16 | to the size of the cluster. 17 | 18 | Args: 19 | cur_pg: The current placement group, if any. 
20 | """ 21 | cluster_cpus = int(ray.cluster_resources().get("CPU", 1)) 22 | cluster_gpus = int(ray.cluster_resources().get("GPU", 0)) 23 | 24 | # If we're in a placement group, we shouldn't assume the entire cluster's 25 | # resources are available for us to use. Estimate an upper bound on what's 26 | # reasonable to assume is available for datasets to use. 27 | if cur_pg: 28 | pg_cpus = 0 29 | for bundle in cur_pg.bundle_specs: 30 | # Calculate the proportion of the cluster this placement group "takes up". 31 | # Then scale our cluster_cpus proportionally to avoid over-parallelizing 32 | # if there are many parallel Tune trials using the cluster. 33 | cpu_fraction = bundle.get("CPU", 0) / max(1, cluster_cpus) 34 | gpu_fraction = bundle.get("GPU", 0) / max(1, cluster_gpus) 35 | max_fraction = max(cpu_fraction, gpu_fraction) 36 | # Over-parallelize by up to a factor of 2, but no more than that. It's 37 | # preferable to over-estimate than under-estimate. 38 | pg_cpus += 2 * int(max_fraction * cluster_cpus) 39 | 40 | return min(cluster_cpus, pg_cpus) 41 | 42 | return cluster_cpus 43 | 44 | 45 | def _estimate_available_parallelism() -> int: 46 | """ 47 | Estimates the available CPU parallelism for this Dataset in the cluster. 48 | 49 | If we are currently in a placement group, take that into account. 50 | """ 51 | cur_pg = ray.util.get_current_placement_group() 52 | return _estimate_avail_cpus(cur_pg) 53 | 54 | 55 | def ensure_worker_count(use_threads: bool | int = True) -> int: 56 | if type(use_threads) == int: # noqa: E721 57 | if use_threads < 1: 58 | return 1 59 | return use_threads 60 | 61 | if use_threads is False: 62 | return 1 63 | 64 | parallelism = _estimate_available_parallelism() 65 | return max(parallelism, 1) 66 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/datasources/__init__.py: -------------------------------------------------------------------------------- 1 | """Ray Datasources Module.""" 2 | 3 | from awswrangler.distributed.ray.datasources.arrow_csv_datasink import ArrowCSVDatasink 4 | from awswrangler.distributed.ray.datasources.arrow_csv_datasource import ArrowCSVDatasource 5 | from awswrangler.distributed.ray.datasources.arrow_json_datasource import ArrowJSONDatasource 6 | from awswrangler.distributed.ray.datasources.arrow_orc_datasink import ArrowORCDatasink 7 | from awswrangler.distributed.ray.datasources.arrow_orc_datasource import ArrowORCDatasource 8 | from awswrangler.distributed.ray.datasources.arrow_parquet_base_datasource import ArrowParquetBaseDatasource 9 | from awswrangler.distributed.ray.datasources.arrow_parquet_datasink import ArrowParquetDatasink 10 | from awswrangler.distributed.ray.datasources.arrow_parquet_datasource import ArrowParquetDatasource 11 | from awswrangler.distributed.ray.datasources.file_datasink import _BlockFileDatasink 12 | from awswrangler.distributed.ray.datasources.pandas_text_datasink import PandasCSVDatasink, PandasJSONDatasink 13 | from awswrangler.distributed.ray.datasources.pandas_text_datasource import ( 14 | PandasCSVDataSource, 15 | PandasFWFDataSource, 16 | PandasJSONDatasource, 17 | PandasTextDatasource, 18 | ) 19 | 20 | __all__ = [ 21 | "ArrowCSVDatasink", 22 | "ArrowORCDatasink", 23 | "ArrowParquetDatasink", 24 | "ArrowCSVDatasource", 25 | "ArrowJSONDatasource", 26 | "ArrowORCDatasource", 27 | "ArrowParquetBaseDatasource", 28 | "ArrowParquetDatasource", 29 | "PandasCSVDataSource", 30 | "PandasFWFDataSource", 31 | "PandasJSONDatasource", 32 | 
"PandasTextDatasource", 33 | "PandasCSVDatasink", 34 | "PandasJSONDatasink", 35 | "_BlockFileDatasink", 36 | ] 37 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/datasources/arrow_csv_datasink.py: -------------------------------------------------------------------------------- 1 | """Ray PandasTextDatasink Module.""" 2 | 3 | from __future__ import annotations 4 | 5 | import io 6 | import logging 7 | from typing import Any 8 | 9 | from pyarrow import csv 10 | from ray.data.block import BlockAccessor 11 | from ray.data.datasource.filename_provider import FilenameProvider 12 | 13 | from awswrangler.distributed.ray.datasources.file_datasink import _BlockFileDatasink 14 | 15 | _logger: logging.Logger = logging.getLogger(__name__) 16 | 17 | 18 | class ArrowCSVDatasink(_BlockFileDatasink): 19 | """A datasink that writes CSV files using Arrow.""" 20 | 21 | def __init__( 22 | self, 23 | path: str, 24 | *, 25 | filename_provider: FilenameProvider | None = None, 26 | dataset_uuid: str | None = None, 27 | open_s3_object_args: dict[str, Any] | None = None, 28 | pandas_kwargs: dict[str, Any] | None = None, 29 | write_options: dict[str, Any] | None = None, 30 | **write_args: Any, 31 | ): 32 | super().__init__( 33 | path, 34 | file_format="csv", 35 | filename_provider=filename_provider, 36 | dataset_uuid=dataset_uuid, 37 | open_s3_object_args=open_s3_object_args, 38 | pandas_kwargs=pandas_kwargs, 39 | **write_args, 40 | ) 41 | 42 | self.write_options = write_options or {} 43 | 44 | def write_block(self, file: io.TextIOWrapper, block: BlockAccessor) -> None: 45 | """ 46 | Write a block of data to a file. 47 | 48 | Parameters 49 | ---------- 50 | block : BlockAccessor 51 | file : io.TextIOWrapper 52 | """ 53 | csv.write_csv(block.to_arrow(), file, csv.WriteOptions(**self.write_options)) 54 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/datasources/arrow_csv_datasource.py: -------------------------------------------------------------------------------- 1 | """Ray ArrowCSVDatasource Module.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import Any, Iterator 6 | 7 | import pyarrow as pa 8 | from pyarrow import csv 9 | from ray.data.datasource.file_based_datasource import FileBasedDatasource 10 | 11 | from awswrangler._arrow import _add_table_partitions 12 | 13 | 14 | class ArrowCSVDatasource(FileBasedDatasource): 15 | """CSV datasource, for reading CSV files using PyArrow.""" 16 | 17 | _FILE_EXTENSIONS = ["csv"] 18 | 19 | def __init__( 20 | self, 21 | paths: str | list[str], 22 | dataset: bool, 23 | path_root: str, 24 | version_ids: dict[str, str] | None = None, 25 | s3_additional_kwargs: dict[str, str] | None = None, 26 | pandas_kwargs: dict[str, Any] | None = None, 27 | arrow_csv_args: dict[str, Any] | None = None, 28 | **file_based_datasource_kwargs: Any, 29 | ): 30 | from pyarrow import csv 31 | 32 | super().__init__(paths, **file_based_datasource_kwargs) 33 | 34 | self.dataset = dataset 35 | self.path_root = path_root 36 | 37 | if arrow_csv_args is None: 38 | arrow_csv_args = {} 39 | 40 | self.read_options = arrow_csv_args.pop("read_options", csv.ReadOptions(use_threads=False)) 41 | self.parse_options = arrow_csv_args.pop("parse_options", csv.ParseOptions()) 42 | self.convert_options = arrow_csv_args.get("convert_options", csv.ConvertOptions()) 43 | self.arrow_csv_args = arrow_csv_args 44 | 45 | def _read_stream(self, f: pa.NativeFile, path: str) -> 
Iterator[pa.Table]: 46 | reader = csv.open_csv( 47 | f, 48 | read_options=self.read_options, 49 | parse_options=self.parse_options, 50 | convert_options=self.convert_options, 51 | ) 52 | 53 | schema = None 54 | while True: 55 | try: 56 | batch = reader.read_next_batch() 57 | table = pa.Table.from_batches([batch], schema=schema) 58 | if schema is None: 59 | schema = table.schema 60 | 61 | if self.dataset: 62 | table = _add_table_partitions( 63 | table=table, 64 | path=f"s3://{path}", 65 | path_root=self.path_root, 66 | ) 67 | 68 | yield table 69 | 70 | except StopIteration: 71 | return 72 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/datasources/arrow_json_datasource.py: -------------------------------------------------------------------------------- 1 | """Ray ArrowJSONDatasource Module.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import Any, Iterator 6 | 7 | import pyarrow as pa 8 | from pyarrow import json 9 | from ray.data.datasource.file_based_datasource import FileBasedDatasource 10 | 11 | from awswrangler._arrow import _add_table_partitions 12 | 13 | 14 | class ArrowJSONDatasource(FileBasedDatasource): 15 | """JSON datasource, for reading JSON files using PyArrow.""" 16 | 17 | _FILE_EXTENSIONS = ["json"] 18 | 19 | def __init__( 20 | self, 21 | paths: str | list[str], 22 | dataset: bool, 23 | path_root: str, 24 | version_ids: dict[str, str] | None = None, 25 | s3_additional_kwargs: dict[str, str] | None = None, 26 | pandas_kwargs: dict[str, Any] | None = None, 27 | arrow_json_args: dict[str, Any] | None = None, 28 | **file_based_datasource_kwargs: Any, 29 | ): 30 | super().__init__(paths, **file_based_datasource_kwargs) 31 | 32 | self.dataset = dataset 33 | self.path_root = path_root 34 | 35 | if arrow_json_args is None: 36 | arrow_json_args = {} 37 | 38 | self.read_options = json.ReadOptions(arrow_json_args.pop("read_options", dict(use_threads=False))) 39 | self.parse_options = json.ParseOptions(arrow_json_args.pop("parse_options", {})) 40 | self.arrow_json_args = arrow_json_args 41 | 42 | def _read_stream(self, f: pa.NativeFile, path: str) -> Iterator[pa.Table]: 43 | table = json.read_json(f, read_options=self.read_options, parse_options=self.parse_options) 44 | 45 | if self.dataset: 46 | table = _add_table_partitions( 47 | table=table, 48 | path=f"s3://{path}", 49 | path_root=self.path_root, 50 | ) 51 | 52 | return [table] # type: ignore[return-value] 53 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/datasources/arrow_orc_datasink.py: -------------------------------------------------------------------------------- 1 | """Ray ArrowORCDatasink Module.""" 2 | 3 | from __future__ import annotations 4 | 5 | import io 6 | import logging 7 | from typing import Any 8 | 9 | import pyarrow as pa 10 | from ray.data.block import BlockAccessor 11 | from ray.data.datasource.filename_provider import FilenameProvider 12 | 13 | from awswrangler._arrow import _df_to_table 14 | from awswrangler.distributed.ray.datasources.file_datasink import _BlockFileDatasink 15 | 16 | _logger: logging.Logger = logging.getLogger(__name__) 17 | 18 | 19 | class ArrowORCDatasink(_BlockFileDatasink): 20 | """A datasink that writes ORC files using Arrow.""" 21 | 22 | def __init__( 23 | self, 24 | path: str, 25 | *, 26 | filename_provider: FilenameProvider | None = None, 27 | dataset_uuid: str | None = None, 28 | open_s3_object_args: dict[str, Any] | None = None, 29 |
pandas_kwargs: dict[str, Any] | None = None, 30 | schema: pa.Schema | None = None, 31 | index: bool = False, 32 | dtype: dict[str, str] | None = None, 33 | pyarrow_additional_kwargs: dict[str, Any] | None = None, 34 | **write_args: Any, 35 | ): 36 | super().__init__( 37 | path, 38 | file_format="orc", 39 | filename_provider=filename_provider, 40 | dataset_uuid=dataset_uuid, 41 | open_s3_object_args=open_s3_object_args, 42 | pandas_kwargs=pandas_kwargs, 43 | **write_args, 44 | ) 45 | 46 | self.pyarrow_additional_kwargs = pyarrow_additional_kwargs or {} 47 | self.schema = schema 48 | self.index = index 49 | self.dtype = dtype 50 | 51 | def write_block(self, file: io.TextIOWrapper, block: BlockAccessor) -> None: 52 | """ 53 | Write a block of data to a file. 54 | 55 | Parameters 56 | ---------- 57 | file : io.TextIOWrapper 58 | block : BlockAccessor 59 | """ 60 | from pyarrow import orc 61 | 62 | compression: str = self.write_args.get("compression", None) or "UNCOMPRESSED" 63 | 64 | orc.write_table( 65 | _df_to_table(block.to_pandas(), schema=self.schema, index=self.index, dtype=self.dtype), 66 | file, 67 | compression=compression, 68 | **self.pyarrow_additional_kwargs, 69 | ) 70 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/datasources/arrow_orc_datasource.py: -------------------------------------------------------------------------------- 1 | """Ray ArrowCSVDatasource Module.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import Any, Iterator 6 | 7 | import pyarrow as pa 8 | from ray.data.datasource.file_based_datasource import FileBasedDatasource 9 | 10 | from awswrangler._arrow import _add_table_partitions 11 | 12 | 13 | class ArrowORCDatasource(FileBasedDatasource): 14 | """ORC datasource, for reading and writing ORC files using PyArrow.""" 15 | 16 | _FILE_EXTENSIONS = ["orc"] 17 | 18 | def __init__( 19 | self, 20 | paths: str | list[str], 21 | dataset: bool, 22 | path_root: str | None, 23 | use_threads: bool | int, 24 | schema: pa.Schema, 25 | arrow_orc_args: dict[str, Any] | None = None, 26 | **file_based_datasource_kwargs: Any, 27 | ): 28 | super().__init__(paths, **file_based_datasource_kwargs) 29 | 30 | self.dataset = dataset 31 | self.path_root = path_root 32 | 33 | if arrow_orc_args is None: 34 | arrow_orc_args = {} 35 | 36 | self.columns: list[str] | None = arrow_orc_args.get("columns", None) 37 | self.arrow_orc_args = arrow_orc_args 38 | 39 | def _read_stream(self, f: pa.NativeFile, path: str) -> Iterator[pa.Table]: 40 | from pyarrow import orc 41 | 42 | table: pa.Table = orc.read_table(f, columns=self.columns) 43 | 44 | if self.dataset: 45 | table = _add_table_partitions( 46 | table=table, 47 | path=f"s3://{path}", 48 | path_root=self.path_root, 49 | ) 50 | 51 | return [table] # type: ignore[return-value] 52 | 53 | def _open_input_source( 54 | self, 55 | filesystem: pa.fs.FileSystem, 56 | path: str, 57 | **open_args: Any, 58 | ) -> pa.NativeFile: 59 | return filesystem.open_input_file(path, **open_args) 60 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/datasources/arrow_parquet_base_datasource.py: -------------------------------------------------------------------------------- 1 | """Ray ParquetBaseDatasource Module.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import Any, Iterator 6 | 7 | # fs required to implicitly trigger S3 subsystem initialization 8 | import pyarrow as pa 9 | import pyarrow.fs 10 | import 
pyarrow.parquet as pq 11 | from ray.data.datasource.file_based_datasource import FileBasedDatasource 12 | 13 | from awswrangler._arrow import _add_table_partitions 14 | 15 | 16 | class ArrowParquetBaseDatasource(FileBasedDatasource): 17 | """Parquet datasource, for reading Parquet files.""" 18 | 19 | _FILE_EXTENSIONS = ["parquet"] 20 | 21 | def __init__( 22 | self, 23 | paths: str | list[str], 24 | path_root: str, 25 | arrow_parquet_args: dict[str, Any] | None = None, 26 | **file_based_datasource_kwargs: Any, 27 | ): 28 | super().__init__(paths, **file_based_datasource_kwargs) 29 | 30 | if arrow_parquet_args is None: 31 | arrow_parquet_args = {} 32 | 33 | self.path_root = path_root 34 | self.arrow_parquet_args = arrow_parquet_args 35 | 36 | def _read_stream(self, f: pa.NativeFile, path: str) -> Iterator[pa.Table]: 37 | arrow_parquet_args = self.arrow_parquet_args 38 | 39 | use_threads: bool = arrow_parquet_args.get("use_threads", False) 40 | columns: list[str] | None = arrow_parquet_args.get("columns", None) 41 | 42 | dataset_kwargs = arrow_parquet_args.get("dataset_kwargs", {}) 43 | coerce_int96_timestamp_unit: str | None = dataset_kwargs.get("coerce_int96_timestamp_unit", None) 44 | decryption_properties = dataset_kwargs.get("decryption_properties", None) 45 | 46 | table = pq.read_table( 47 | f, 48 | use_threads=use_threads, 49 | columns=columns, 50 | coerce_int96_timestamp_unit=coerce_int96_timestamp_unit, 51 | decryption_properties=decryption_properties, 52 | ) 53 | 54 | table = _add_table_partitions( 55 | table=table, 56 | path=f"s3://{path}", 57 | path_root=self.path_root, 58 | ) 59 | 60 | return [table] # type: ignore[return-value] 61 | 62 | def _open_input_source( 63 | self, 64 | filesystem: pyarrow.fs.FileSystem, 65 | path: str, 66 | **open_args: Any, 67 | ) -> pa.NativeFile: 68 | # Parquet requires `open_input_file` due to random access reads 69 | return filesystem.open_input_file(path, **open_args) 70 | 71 | def get_name(self) -> str: 72 | """Return a human-readable name for this datasource.""" 73 | return "ParquetBulk" 74 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/datasources/arrow_parquet_datasink.py: -------------------------------------------------------------------------------- 1 | """Ray ArrowParquetDatasink Module.""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | from typing import Any 7 | 8 | import pyarrow as pa 9 | from ray.data.block import BlockAccessor 10 | from ray.data.datasource.filename_provider import FilenameProvider 11 | 12 | from awswrangler._arrow import _df_to_table 13 | from awswrangler.distributed.ray.datasources.file_datasink import _BlockFileDatasink 14 | from awswrangler.distributed.ray.datasources.filename_provider import _DefaultFilenameProvider 15 | from awswrangler.s3._write import _COMPRESSION_2_EXT 16 | 17 | _logger: logging.Logger = logging.getLogger(__name__) 18 | 19 | 20 | class _ParquetFilenameProvider(_DefaultFilenameProvider): 21 | """Parquet filename provider where compression comes before file format.""" 22 | 23 | def _generate_filename(self, file_id: str) -> str: 24 | filename = "" 25 | if self._dataset_uuid is not None: 26 | filename += f"{self._dataset_uuid}_" 27 | filename += f"{file_id}" 28 | if self._bucket_id is not None: 29 | filename += f"_bucket-{self._bucket_id:05d}" 30 | filename += f"{_COMPRESSION_2_EXT.get(self._compression)}.{self._file_format}" 31 | return filename 32 | 33 | 34 | class ArrowParquetDatasink(_BlockFileDatasink): 35 
| """A datasink that writes Parquet files.""" 36 | 37 | def __init__( 38 | self, 39 | path: str, 40 | *, 41 | filename_provider: FilenameProvider | None = None, 42 | dataset_uuid: str | None = None, 43 | open_s3_object_args: dict[str, Any] | None = None, 44 | pandas_kwargs: dict[str, Any] | None = None, 45 | schema: pa.Schema | None = None, 46 | index: bool = False, 47 | dtype: dict[str, str] | None = None, 48 | pyarrow_additional_kwargs: dict[str, Any] | None = None, 49 | compression: str | None = None, 50 | **write_args: Any, 51 | ): 52 | file_format = "parquet" 53 | write_args = write_args or {} 54 | 55 | if filename_provider is None: 56 | bucket_id = write_args.get("bucket_id", None) 57 | 58 | filename_provider = _ParquetFilenameProvider( 59 | dataset_uuid=dataset_uuid, 60 | file_format=file_format, 61 | compression=compression, 62 | bucket_id=bucket_id, 63 | ) 64 | 65 | super().__init__( 66 | path, 67 | file_format=file_format, 68 | filename_provider=filename_provider, 69 | dataset_uuid=dataset_uuid, 70 | open_s3_object_args=open_s3_object_args, 71 | pandas_kwargs=pandas_kwargs, 72 | **write_args, 73 | ) 74 | 75 | self.pyarrow_additional_kwargs = pyarrow_additional_kwargs or {} 76 | self.schema = schema 77 | self.index = index 78 | self.dtype = dtype 79 | 80 | def write_block(self, file: pa.NativeFile, block: BlockAccessor) -> None: 81 | """ 82 | Write a block of data to a file. 83 | 84 | Parameters 85 | ---------- 86 | file : pa.NativeFile 87 | block : BlockAccessor 88 | """ 89 | pa.parquet.write_table( 90 | _df_to_table(block.to_pandas(), schema=self.schema, index=self.index, dtype=self.dtype), 91 | file, 92 | **self.pyarrow_additional_kwargs, 93 | ) 94 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/datasources/filename_provider.py: -------------------------------------------------------------------------------- 1 | """Ray DefaultFilenameProvider Module.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import Any 6 | 7 | from ray.data.block import Block 8 | from ray.data.datasource.filename_provider import FilenameProvider 9 | 10 | from awswrangler.s3._write import _COMPRESSION_2_EXT 11 | 12 | 13 | class _DefaultFilenameProvider(FilenameProvider): 14 | def __init__( 15 | self, 16 | file_format: str, 17 | dataset_uuid: str | None = None, 18 | compression: str | None = None, 19 | bucket_id: int | None = None, 20 | ): 21 | self._dataset_uuid = dataset_uuid 22 | self._file_format = file_format 23 | self._compression = compression 24 | self._bucket_id = bucket_id 25 | 26 | def get_filename_for_block( 27 | self, 28 | block: Block, 29 | task_index: int, 30 | block_index: int, 31 | ) -> str: 32 | file_id = f"{task_index:06}_{block_index:06}" 33 | return self._generate_filename(file_id) 34 | 35 | def get_filename_for_row(self, row: dict[str, Any], task_index: int, block_index: int, row_index: int) -> str: 36 | file_id = f"{task_index:06}_{block_index:06}_{row_index:06}" 37 | return self._generate_filename(file_id) 38 | 39 | def _generate_filename(self, file_id: str) -> str: 40 | filename = "" 41 | if self._dataset_uuid is not None: 42 | filename += f"{self._dataset_uuid}_" 43 | filename += f"{file_id}" 44 | if self._bucket_id is not None: 45 | filename += f"_bucket-{self._bucket_id:05d}" 46 | filename += f".{self._file_format}{_COMPRESSION_2_EXT.get(self._compression)}" 47 | return filename 48 | -------------------------------------------------------------------------------- 
/awswrangler/distributed/ray/datasources/pandas_text_datasink.py: -------------------------------------------------------------------------------- 1 | """Ray PandasTextDatasink Module.""" 2 | 3 | from __future__ import annotations 4 | 5 | import io 6 | import logging 7 | from typing import Any, Callable 8 | 9 | import pandas as pd 10 | from ray.data.block import BlockAccessor 11 | from ray.data.datasource.filename_provider import FilenameProvider 12 | 13 | from awswrangler.distributed.ray.datasources.file_datasink import _BlockFileDatasink 14 | 15 | _logger: logging.Logger = logging.getLogger(__name__) 16 | 17 | 18 | class _PandasTextDatasink(_BlockFileDatasink): 19 | """A datasink that writes text files using Pandas IO.""" 20 | 21 | def __init__( 22 | self, 23 | path: str, 24 | file_format: str, 25 | write_text_func: Callable[..., None] | None, 26 | *, 27 | filename_provider: FilenameProvider | None = None, 28 | dataset_uuid: str | None = None, 29 | open_s3_object_args: dict[str, Any] | None = None, 30 | pandas_kwargs: dict[str, Any] | None = None, 31 | **write_args: Any, 32 | ): 33 | super().__init__( 34 | path, 35 | file_format=file_format, 36 | filename_provider=filename_provider, 37 | dataset_uuid=dataset_uuid, 38 | open_s3_object_args=open_s3_object_args, 39 | pandas_kwargs=pandas_kwargs, 40 | **write_args, 41 | ) 42 | 43 | self.write_text_func = write_text_func 44 | 45 | def write_block(self, file: io.TextIOWrapper, block: BlockAccessor) -> None: 46 | """ 47 | Write a block of data to a file. 48 | 49 | Parameters 50 | ---------- 51 | block : BlockAccessor 52 | file : pa.NativeFile 53 | """ 54 | write_text_func = self.write_text_func 55 | 56 | write_text_func(block.to_pandas(), file, **self.pandas_kwargs) # type: ignore[misc] 57 | 58 | 59 | class PandasCSVDatasink(_PandasTextDatasink): 60 | """A datasink that writes CSV files using Pandas IO.""" 61 | 62 | def __init__( 63 | self, 64 | path: str, 65 | *, 66 | filename_provider: FilenameProvider | None = None, 67 | dataset_uuid: str | None = None, 68 | open_s3_object_args: dict[str, Any] | None = None, 69 | pandas_kwargs: dict[str, Any] | None = None, 70 | **write_args: Any, 71 | ): 72 | super().__init__( 73 | path, 74 | "csv", 75 | pd.DataFrame.to_csv, 76 | filename_provider=filename_provider, 77 | dataset_uuid=dataset_uuid, 78 | open_s3_object_args=open_s3_object_args, 79 | pandas_kwargs=pandas_kwargs, 80 | **write_args, 81 | ) 82 | 83 | 84 | class PandasJSONDatasink(_PandasTextDatasink): 85 | """A datasink that writes CSV files using Pandas IO.""" 86 | 87 | def __init__( 88 | self, 89 | path: str, 90 | *, 91 | filename_provider: FilenameProvider | None = None, 92 | dataset_uuid: str | None = None, 93 | open_s3_object_args: dict[str, Any] | None = None, 94 | pandas_kwargs: dict[str, Any] | None = None, 95 | **write_args: Any, 96 | ): 97 | super().__init__( 98 | path, 99 | "json", 100 | pd.DataFrame.to_json, 101 | filename_provider=filename_provider, 102 | dataset_uuid=dataset_uuid, 103 | open_s3_object_args=open_s3_object_args, 104 | pandas_kwargs=pandas_kwargs, 105 | **write_args, 106 | ) 107 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/modin/__init__.py: -------------------------------------------------------------------------------- 1 | """Ray Modin Module.""" 2 | 3 | from awswrangler.distributed.ray.modin._core import modin_repartition 4 | 5 | __all__ = [ 6 | "modin_repartition", 7 | ] 8 | 
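# Editor's note (illustrative sketch, an assumption rather than upstream documentation):
# `modin_repartition` is used as a plain decorator, so a Modin DataFrame passed to the
# wrapped callable is repartitioned along the row axis before the callable runs, as
# implemented in `_core.py` below:
#
#     @modin_repartition
#     def _process_frame(df, *args, **kwargs):  # hypothetical consumer function
#         ...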
-------------------------------------------------------------------------------- /awswrangler/distributed/ray/modin/_core.py: -------------------------------------------------------------------------------- 1 | """Modin on Ray Core module (PRIVATE).""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | from functools import wraps 7 | from typing import Any, Callable, TypeVar 8 | 9 | import numpy as np 10 | import pandas as pd 11 | from modin.distributed.dataframe.pandas import from_partitions, unwrap_partitions 12 | from modin.pandas import DataFrame as ModinDataFrame 13 | 14 | _logger: logging.Logger = logging.getLogger(__name__) 15 | 16 | 17 | def _validate_partition_shape(df: pd.DataFrame) -> bool: 18 | """ 19 | Validate if partitions of the data frame are partitioned along row axis. 20 | 21 | Parameters 22 | ---------- 23 | df : pd.DataFrame 24 | Modin data frame 25 | 26 | Returns 27 | ------- 28 | bool 29 | """ 30 | # Unwrap partitions as they are currently stored (axis=None) 31 | partitions_shape = np.array(unwrap_partitions(df)).shape 32 | return partitions_shape[1] == 1 # type: ignore[no-any-return,unused-ignore] 33 | 34 | 35 | FunctionType = TypeVar("FunctionType", bound=Callable[..., Any]) 36 | 37 | 38 | def modin_repartition(function: FunctionType) -> FunctionType: 39 | """ 40 | Decorate callable to repartition Modin data frame. 41 | 42 | By default, repartition along row (axis=0) axis. 43 | This avoids a situation where columns are split along multiple blocks. 44 | 45 | Parameters 46 | ---------- 47 | function : Callable[..., Any] 48 | Callable as input to ray.remote 49 | 50 | Returns 51 | ------- 52 | Callable[..., Any] 53 | """ 54 | # Access the source function if it exists 55 | function = getattr(function, "_source_func", function) 56 | 57 | @wraps(function) 58 | def wrapper( 59 | df: pd.DataFrame, 60 | *args: Any, 61 | axis: int | None = None, 62 | row_lengths: int | None = None, 63 | validate_partitions: bool = True, 64 | **kwargs: Any, 65 | ) -> Any: 66 | # Validate partitions and repartition Modin data frame along row (axis=0) axis 67 | # to avoid a situation where columns are split along multiple blocks 68 | if isinstance(df, ModinDataFrame): 69 | if validate_partitions and not _validate_partition_shape(df): 70 | _logger.warning( 71 | "Partitions of this data frame are detected to be split along column axis. " 72 | "The DataFrame will be automatically repartitioned along row axis to ensure " 73 | "each partition can be processed independently." 
74 | ) 75 | axis = 0 76 | if axis is not None: 77 | df = from_partitions(unwrap_partitions(df, axis=axis), axis=axis, row_lengths=row_lengths) 78 | return function(df, *args, **kwargs) 79 | 80 | return wrapper # type: ignore[return-value] 81 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/modin/_data_types.py: -------------------------------------------------------------------------------- 1 | """Internal (private) Data Types Module.""" 2 | 3 | from __future__ import annotations 4 | 5 | import modin.pandas as pd 6 | import pyarrow as pa 7 | 8 | from awswrangler._data_types import pyarrow_types_from_pandas 9 | from awswrangler.distributed.ray import ray_get, ray_remote 10 | from awswrangler.distributed.ray.modin._utils import _ray_dataset_from_df 11 | 12 | 13 | def pyarrow_types_from_pandas_distributed( 14 | df: pd.DataFrame, index: bool, ignore_cols: list[str] | None = None, index_left: bool = False 15 | ) -> dict[str, pa.DataType]: 16 | """Extract the related Pyarrow data types from a pandas DataFrame.""" 17 | func = ray_remote()(pyarrow_types_from_pandas) 18 | first_block_object_ref = next(_ray_dataset_from_df(df).iter_internal_ref_bundles()).block_refs[0] 19 | return ray_get( # type: ignore[no-any-return] 20 | func( 21 | df=first_block_object_ref, 22 | index=index, 23 | ignore_cols=ignore_cols, 24 | index_left=index_left, 25 | ) 26 | ) 27 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/modin/s3/__init__.py: -------------------------------------------------------------------------------- 1 | """Ray Modin S3 Module.""" 2 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/modin/s3/_read_orc.py: -------------------------------------------------------------------------------- 1 | """Modin on Ray S3 read text module (PRIVATE).""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | from typing import TYPE_CHECKING, Any 7 | 8 | import modin.pandas as pd 9 | import pyarrow as pa 10 | from ray.data import read_datasource 11 | from ray.data.datasource import FastFileMetadataProvider 12 | 13 | from awswrangler import _data_types 14 | from awswrangler.distributed.ray.datasources import ArrowORCDatasource 15 | from awswrangler.distributed.ray.modin._utils import _to_modin 16 | 17 | if TYPE_CHECKING: 18 | from mypy_boto3_s3 import S3Client 19 | 20 | _logger: logging.Logger = logging.getLogger(__name__) 21 | 22 | 23 | def _read_orc_distributed( 24 | paths: list[str], 25 | path_root: str | None, 26 | schema: pa.schema | None, 27 | columns: list[str] | None, 28 | use_threads: bool | int, 29 | override_num_blocks: int, 30 | version_ids: dict[str, str] | None, 31 | s3_client: "S3Client" | None, 32 | s3_additional_kwargs: dict[str, Any] | None, 33 | arrow_kwargs: dict[str, Any], 34 | ) -> pd.DataFrame: 35 | datasource = ArrowORCDatasource( 36 | paths=paths, 37 | dataset=True, 38 | path_root=path_root, 39 | use_threads=use_threads, 40 | schema=schema, 41 | arrow_orc_args={"columns": columns}, 42 | meta_provider=FastFileMetadataProvider(), 43 | ) 44 | ray_dataset = read_datasource( 45 | datasource, 46 | override_num_blocks=override_num_blocks, 47 | ) 48 | to_pandas_kwargs = _data_types.pyarrow2pandas_defaults( 49 | use_threads=use_threads, 50 | kwargs=arrow_kwargs, 51 | ) 52 | return _to_modin(dataset=ray_dataset, to_pandas_kwargs=to_pandas_kwargs) 53 | 
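# Editor's note (illustrative sketch, an assumption about the public entry point rather
# than something stated in this file): with the Ray/Modin engine registered,
# `_read_orc_distributed` is presumably the dispatch target behind the public S3 ORC
# reader, so a call such as the following would return a Modin DataFrame built from the
# Ray Dataset blocks ("my-bucket"/"my-prefix" are hypothetical):
#
#     import awswrangler as wr
#     df = wr.s3.read_orc(path="s3://my-bucket/my-prefix/", dataset=True)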
-------------------------------------------------------------------------------- /awswrangler/distributed/ray/modin/s3/_read_parquet.py: -------------------------------------------------------------------------------- 1 | """Modin on Ray S3 read parquet module (PRIVATE).""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import TYPE_CHECKING, Any 6 | 7 | import modin.pandas as pd 8 | import pyarrow as pa 9 | from ray.data import read_datasource 10 | from ray.data.datasource import FastFileMetadataProvider 11 | 12 | from awswrangler.distributed.ray.datasources import ArrowParquetBaseDatasource, ArrowParquetDatasource 13 | from awswrangler.distributed.ray.modin._utils import _to_modin 14 | 15 | if TYPE_CHECKING: 16 | from mypy_boto3_s3 import S3Client 17 | 18 | 19 | def _resolve_datasource_parameters(bulk_read: bool, *args: Any, **kwargs: Any) -> dict[str, Any]: 20 | if bulk_read: 21 | return { 22 | "datasource": ArrowParquetBaseDatasource(*args, **kwargs), 23 | "meta_provider": FastFileMetadataProvider(), 24 | } 25 | return { 26 | "datasource": ArrowParquetDatasource(*args, **kwargs), 27 | } 28 | 29 | 30 | def _read_parquet_distributed( 31 | paths: list[str], 32 | path_root: str | None, 33 | schema: pa.schema | None, 34 | columns: list[str] | None, 35 | coerce_int96_timestamp_unit: str | None, 36 | use_threads: bool | int, 37 | override_num_blocks: int, 38 | version_ids: dict[str, str] | None, 39 | s3_client: "S3Client" | None, 40 | s3_additional_kwargs: dict[str, Any] | None, 41 | arrow_kwargs: dict[str, Any], 42 | bulk_read: bool, 43 | decryption_properties: pa.parquet.encryption.DecryptionConfiguration | None = None, 44 | ) -> pd.DataFrame: 45 | dataset_kwargs = {} 46 | if coerce_int96_timestamp_unit: 47 | dataset_kwargs["coerce_int96_timestamp_unit"] = coerce_int96_timestamp_unit 48 | if decryption_properties: 49 | dataset_kwargs["decryption_properties"] = decryption_properties 50 | 51 | dataset = read_datasource( 52 | **_resolve_datasource_parameters( 53 | bulk_read, 54 | paths=paths, 55 | path_root=path_root, 56 | arrow_parquet_args={ 57 | "use_threads": use_threads, 58 | "schema": schema, 59 | "columns": columns, 60 | "dataset_kwargs": dataset_kwargs, 61 | }, 62 | ), 63 | override_num_blocks=override_num_blocks, 64 | ) 65 | return _to_modin( 66 | dataset=dataset, 67 | to_pandas_kwargs=arrow_kwargs, 68 | ignore_index=arrow_kwargs.get("ignore_metadata"), 69 | ) 70 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/modin/s3/_write_orc.py: -------------------------------------------------------------------------------- 1 | """Modin on Ray S3 write parquet module (PRIVATE).""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | import math 7 | from typing import TYPE_CHECKING, Any, cast 8 | 9 | import modin.pandas as pd 10 | import pyarrow as pa 11 | 12 | from awswrangler import exceptions 13 | from awswrangler.distributed.ray.datasources import ArrowORCDatasink 14 | from awswrangler.distributed.ray.modin._utils import _ray_dataset_from_df 15 | from awswrangler.typing import ArrowEncryptionConfiguration 16 | 17 | if TYPE_CHECKING: 18 | from mypy_boto3_s3 import S3Client 19 | 20 | _logger: logging.Logger = logging.getLogger(__name__) 21 | 22 | 23 | def _to_orc_distributed( 24 | df: pd.DataFrame, 25 | schema: pa.Schema, 26 | index: bool, 27 | compression: str | None, 28 | compression_ext: str, 29 | pyarrow_additional_kwargs: dict[str, Any], 30 | cpus: int, 31 | dtype: dict[str, str], 32 | 
s3_client: "S3Client" | None, 33 | s3_additional_kwargs: dict[str, str] | None, 34 | use_threads: bool | int, 35 | path: str | None = None, 36 | path_root: str | None = None, 37 | filename_prefix: str | None = None, 38 | max_rows_by_file: int | None = 0, 39 | bucketing: bool = False, 40 | encryption_configuration: ArrowEncryptionConfiguration | None = None, 41 | ) -> list[str]: 42 | # Create Ray Dataset 43 | ds = _ray_dataset_from_df(df) 44 | 45 | if df.index.name is not None: 46 | raise exceptions.InvalidArgumentCombination("Orc does not serialize index metadata on a default index.") 47 | 48 | # Repartition into a single block if or writing into a single key or if bucketing is enabled 49 | if ds.count() > 0 and (path or bucketing) and not max_rows_by_file: 50 | _logger.warning( 51 | "Repartitioning frame to single partition as a strict path was defined: %s. " 52 | "This operation is inefficient for large datasets.", 53 | path, 54 | ) 55 | ds = ds.repartition(1) 56 | 57 | # Repartition by max_rows_by_file 58 | elif max_rows_by_file and (max_rows_by_file > 0): 59 | ds = ds.repartition(math.ceil(ds.count() / max_rows_by_file)) 60 | 61 | if path and not path.endswith("/"): 62 | path = f"{path}/" 63 | 64 | datasink = ArrowORCDatasink( 65 | path=cast(str, path or path_root), 66 | dataset_uuid=filename_prefix, 67 | open_s3_object_args={ 68 | "s3_additional_kwargs": s3_additional_kwargs, 69 | }, 70 | index=index, 71 | dtype=dtype, 72 | compression=compression, 73 | pyarrow_additional_kwargs=pyarrow_additional_kwargs, 74 | schema=schema, 75 | bucket_id=df.name if bucketing else None, 76 | ) 77 | ds.write_datasink(datasink) 78 | return datasink.get_write_paths() 79 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/modin/s3/_write_parquet.py: -------------------------------------------------------------------------------- 1 | """Modin on Ray S3 write parquet module (PRIVATE).""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | import math 7 | from typing import TYPE_CHECKING, Any, cast 8 | 9 | import modin.pandas as pd 10 | import pyarrow as pa 11 | 12 | from awswrangler import exceptions 13 | from awswrangler.distributed.ray.datasources import ArrowParquetDatasink 14 | from awswrangler.distributed.ray.modin._utils import _ray_dataset_from_df 15 | from awswrangler.typing import ArrowEncryptionConfiguration 16 | 17 | if TYPE_CHECKING: 18 | from mypy_boto3_s3 import S3Client 19 | 20 | _logger: logging.Logger = logging.getLogger(__name__) 21 | 22 | 23 | def _to_parquet_distributed( 24 | df: pd.DataFrame, 25 | schema: "pa.Schema", 26 | index: bool, 27 | compression: str | None, 28 | compression_ext: str, 29 | pyarrow_additional_kwargs: dict[str, Any] | None, 30 | cpus: int, 31 | dtype: dict[str, str], 32 | s3_client: "S3Client" | None, 33 | s3_additional_kwargs: dict[str, str] | None, 34 | use_threads: bool | int, 35 | path: str | None = None, 36 | path_root: str | None = None, 37 | filename_prefix: str | None = "", 38 | max_rows_by_file: int | None = 0, 39 | bucketing: bool = False, 40 | encryption_configuration: ArrowEncryptionConfiguration | None = None, 41 | ) -> list[str]: 42 | # Create Ray Dataset 43 | ds = _ray_dataset_from_df(df) 44 | # Repartition into a single block if or writing into a single key or if bucketing is enabled 45 | if ds.count() > 0 and (path or bucketing) and not max_rows_by_file: 46 | _logger.warning( 47 | "Repartitioning frame to single partition as a strict path was defined: %s. 
" 48 | "This operation is inefficient for large datasets.", 49 | path, 50 | ) 51 | 52 | if index and df.index.name: 53 | raise exceptions.InvalidArgumentCombination( 54 | "Cannot write a named index when repartitioning to a single file" 55 | ) 56 | 57 | ds = ds.repartition(1) 58 | # Repartition by max_rows_by_file 59 | elif max_rows_by_file and (max_rows_by_file > 0): 60 | if index: 61 | raise exceptions.InvalidArgumentCombination( 62 | "Cannot write indexed file when `max_rows_by_file` is specified" 63 | ) 64 | ds = ds.repartition(math.ceil(ds.count() / max_rows_by_file)) 65 | 66 | if path and not path.endswith("/"): 67 | path = f"{path}/" 68 | 69 | datasink = ArrowParquetDatasink( 70 | path=cast(str, path or path_root), 71 | dataset_uuid=filename_prefix, 72 | index=index, 73 | dtype=dtype, 74 | compression=compression, 75 | pyarrow_additional_kwargs=pyarrow_additional_kwargs, 76 | schema=schema, 77 | bucket_id=df.name if bucketing else None, 78 | ) 79 | ds.write_datasink(datasink) 80 | return datasink.get_write_paths() 81 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/s3/__init__.py: -------------------------------------------------------------------------------- 1 | """Ray S3 Module.""" 2 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/s3/_list.py: -------------------------------------------------------------------------------- 1 | """Ray S3 List module (PRIVATE).""" 2 | 3 | from __future__ import annotations 4 | 5 | import datetime 6 | import fnmatch 7 | import logging 8 | from typing import TYPE_CHECKING, Any, Iterator 9 | 10 | from pyarrow.fs import FileSelector, FileType, _resolve_filesystem_and_path 11 | 12 | if TYPE_CHECKING: 13 | from mypy_boto3_s3 import S3Client 14 | 15 | _logger: logging.Logger = logging.getLogger(__name__) 16 | 17 | 18 | def _list_objects_s3fs( 19 | bucket: str, 20 | pattern: str, 21 | prefix: str, 22 | s3_client: "S3Client", 23 | delimiter: str | None, 24 | s3_additional_kwargs: dict[str, Any] | None, 25 | suffix: list[str] | None, 26 | ignore_suffix: list[str] | None, 27 | last_modified_begin: datetime.datetime | None, 28 | last_modified_end: datetime.datetime | None, 29 | ignore_empty: bool, 30 | ) -> Iterator[list[str]]: 31 | """Expand the provided S3 directory path to a list of object paths.""" 32 | resolved_filesystem, resolved_path = _resolve_filesystem_and_path(f"s3://{bucket}/{prefix}", None) 33 | paths: list[str] = [] 34 | 35 | path_info = resolved_filesystem.get_file_info(resolved_path) 36 | 37 | if path_info.type in (FileType.File, FileType.Directory): 38 | if path_info.type == FileType.File: 39 | files = [path_info] 40 | base_path = resolved_path 41 | else: 42 | selector = FileSelector(resolved_path, recursive=True) 43 | files = resolved_filesystem.get_file_info(selector) 44 | base_path = selector.base_dir 45 | 46 | for file_ in files: 47 | if not file_.is_file: 48 | continue 49 | if ignore_empty and file_.size == 0: 50 | continue 51 | file_path = file_.path 52 | if not file_path.startswith(base_path): 53 | continue 54 | if (ignore_suffix is not None) and file_path.endswith(tuple(ignore_suffix)): 55 | continue 56 | if (suffix is None) or file_path.endswith(tuple(suffix)): 57 | if last_modified_begin is not None: 58 | if file_.mtime < last_modified_begin: 59 | continue 60 | if last_modified_end is not None: 61 | if file_.mtime > last_modified_end: 62 | continue 63 | paths.append(f"s3://{file_path}") 64 | 65 | if prefix 
!= pattern: 66 | paths = fnmatch.filter(paths, f"s3://{bucket}/{pattern}") 67 | 68 | if paths: 69 | yield paths 70 | paths = [] 71 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/s3/_read_orc.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING 4 | 5 | import pyarrow as pa 6 | from pyarrow.fs import _resolve_filesystem_and_path 7 | 8 | from awswrangler import _utils 9 | from awswrangler.s3._read_orc import _pyarrow_orc_file_wrapper 10 | 11 | if TYPE_CHECKING: 12 | from mypy_boto3_s3 import S3Client 13 | 14 | 15 | @_utils.retry(ex=OSError) 16 | def _read_orc_metadata_file_distributed( 17 | s3_client: "S3Client" | None, 18 | path: str, 19 | s3_additional_kwargs: dict[str, str] | None, 20 | use_threads: bool | int, 21 | version_id: str | None = None, 22 | ) -> pa.schema | None: 23 | resolved_filesystem, resolved_path = _resolve_filesystem_and_path(path) 24 | 25 | with resolved_filesystem.open_input_file(resolved_path) as f: 26 | orc_file = _pyarrow_orc_file_wrapper( 27 | source=f, 28 | ) 29 | 30 | if orc_file: 31 | return orc_file.schema 32 | 33 | return None 34 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/s3/_read_parquet.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING 4 | 5 | import pyarrow as pa 6 | from pyarrow.fs import _resolve_filesystem_and_path 7 | 8 | from awswrangler import _utils 9 | from awswrangler.s3._read_parquet import _pyarrow_parquet_file_wrapper 10 | 11 | if TYPE_CHECKING: 12 | from mypy_boto3_s3 import S3Client 13 | 14 | 15 | @_utils.retry(ex=OSError) 16 | def _read_parquet_metadata_file_distributed( 17 | s3_client: "S3Client" | None, 18 | path: str, 19 | s3_additional_kwargs: dict[str, str] | None, 20 | use_threads: bool | int, 21 | version_id: str | None = None, 22 | coerce_int96_timestamp_unit: str | None = None, 23 | ) -> pa.schema | None: 24 | resolved_filesystem, resolved_path = _resolve_filesystem_and_path(path) 25 | 26 | with resolved_filesystem.open_input_file(resolved_path) as f: 27 | pq_file = _pyarrow_parquet_file_wrapper( 28 | source=f, 29 | coerce_int96_timestamp_unit=coerce_int96_timestamp_unit, 30 | ) 31 | 32 | if pq_file: 33 | return pq_file.schema.to_arrow_schema() 34 | 35 | return None 36 | -------------------------------------------------------------------------------- /awswrangler/dynamodb/__init__.py: -------------------------------------------------------------------------------- 1 | """Amazon DynamoDB Module.""" 2 | 3 | from awswrangler.dynamodb._delete import delete_items 4 | from awswrangler.dynamodb._read import read_items, read_partiql_query 5 | from awswrangler.dynamodb._utils import execute_statement, get_table 6 | from awswrangler.dynamodb._write import put_csv, put_df, put_items, put_json 7 | 8 | __all__ = [ 9 | "delete_items", 10 | "execute_statement", 11 | "get_table", 12 | "put_csv", 13 | "put_df", 14 | "put_items", 15 | "put_json", 16 | "read_partiql_query", 17 | "read_items", 18 | ] 19 | -------------------------------------------------------------------------------- /awswrangler/dynamodb/_delete.py: -------------------------------------------------------------------------------- 1 | """Amazon DynamoDB Delete Module (PRIVATE).""" 2 | 3 | from __future__ import annotations 4 | 5 | 
import logging 6 | from typing import Any 7 | 8 | import boto3 9 | from boto3.dynamodb.types import TypeSerializer 10 | 11 | from awswrangler import _utils 12 | from awswrangler._config import apply_configs 13 | 14 | from ._utils import _TableBatchWriter, _validate_items 15 | 16 | _logger: logging.Logger = logging.getLogger(__name__) 17 | 18 | 19 | @apply_configs 20 | def delete_items( 21 | items: list[dict[str, Any]], 22 | table_name: str, 23 | boto3_session: boto3.Session | None = None, 24 | ) -> None: 25 | """Delete all items in the specified DynamoDB table. 26 | 27 | Parameters 28 | ---------- 29 | items 30 | List which contains the items that will be deleted. 31 | table_name 32 | Name of the Amazon DynamoDB table. 33 | boto3_session 34 | The default boto3 session will be used if **boto3_session** is ``None``. 35 | 36 | Examples 37 | -------- 38 | Writing rows of DataFrame 39 | 40 | >>> import awswrangler as wr 41 | >>> wr.dynamodb.delete_items( 42 | ... items=[{'key': 1}, {'key': 2, 'value': 'Hello'}], 43 | ... table_name='table' 44 | ... ) 45 | """ 46 | _logger.debug("Deleting items from DynamoDB table %s", table_name) 47 | 48 | dynamodb_client = _utils.client(service_name="dynamodb", session=boto3_session) 49 | serializer = TypeSerializer() 50 | 51 | key_schema = dynamodb_client.describe_table(TableName=table_name)["Table"]["KeySchema"] 52 | _validate_items(items=items, key_schema=key_schema) 53 | 54 | table_keys = [schema["AttributeName"] for schema in key_schema] 55 | 56 | with _TableBatchWriter(table_name, dynamodb_client) as writer: 57 | for item in items: 58 | writer.delete_item( 59 | key={key: serializer.serialize(item[key]) for key in table_keys}, 60 | ) 61 | -------------------------------------------------------------------------------- /awswrangler/neptune/__init__.py: -------------------------------------------------------------------------------- 1 | """Utilities Module for Amazon Neptune.""" 2 | 3 | from awswrangler.neptune._client import BulkLoadParserConfiguration 4 | from awswrangler.neptune._gremlin_parser import GremlinParser 5 | from awswrangler.neptune._neptune import ( 6 | bulk_load, 7 | bulk_load_from_files, 8 | connect, 9 | execute_gremlin, 10 | execute_opencypher, 11 | execute_sparql, 12 | flatten_nested_df, 13 | to_property_graph, 14 | to_rdf_graph, 15 | ) 16 | 17 | __all__ = [ 18 | "execute_gremlin", 19 | "execute_opencypher", 20 | "execute_sparql", 21 | "to_property_graph", 22 | "to_rdf_graph", 23 | "connect", 24 | "bulk_load", 25 | "bulk_load_from_files", 26 | "GremlinParser", 27 | "flatten_nested_df", 28 | "BulkLoadParserConfiguration", 29 | ] 30 | -------------------------------------------------------------------------------- /awswrangler/neptune/_gremlin_init.py: -------------------------------------------------------------------------------- 1 | """Gremlin Init Module.""" 2 | # Required because `gremlin_python` does not initialize its modules in __init__.py 3 | 4 | from awswrangler._utils import import_optional_dependency 5 | 6 | if import_optional_dependency("gremlin_python"): 7 | from gremlin_python.driver.client import Client 8 | from gremlin_python.process.anonymous_traversal import traversal 9 | from gremlin_python.process.graph_traversal import GraphTraversalSource, __ 10 | from gremlin_python.process.translator import Translator 11 | from gremlin_python.process.traversal import Cardinality, T 12 | from gremlin_python.structure.graph import Edge, Graph, Path, Property, Vertex, VertexProperty 13 | 14 | __all__ = [ 15 | "__", 16 | "Cardinality", 
17 | "Client", 18 | "Edge", 19 | "Graph", 20 | "GraphTraversalSource", 21 | "Path", 22 | "Property", 23 | "T", 24 | "Translator", 25 | "traversal", 26 | "Vertex", 27 | "VertexProperty", 28 | ] 29 | -------------------------------------------------------------------------------- /awswrangler/neptune/_gremlin_parser.py: -------------------------------------------------------------------------------- 1 | # mypy: disable-error-code=name-defined 2 | """Amazon Neptune GremlinParser Module (PRIVATE).""" 3 | 4 | from __future__ import annotations 5 | 6 | from typing import Any 7 | 8 | import awswrangler.neptune._gremlin_init as gremlin 9 | 10 | 11 | class GremlinParser: 12 | """Class representing a parser for returning Gremlin results as a dictionary.""" 13 | 14 | @staticmethod 15 | def gremlin_results_to_dict(result: Any) -> list[dict[str, Any]]: 16 | """Take a Gremlin ResultSet and return a dictionary. 17 | 18 | Parameters 19 | ---------- 20 | result : Any 21 | The Gremlin result set to convert 22 | 23 | Returns 24 | ------- 25 | List[Dict[str, Any]] 26 | A list of dictionary results 27 | """ 28 | res = [] 29 | 30 | # For lists or paths unwind them 31 | if isinstance(result, (list, gremlin.Path)): 32 | for x in result: 33 | res.append(GremlinParser._parse_dict(x)) 34 | 35 | # For dictionaries just add them 36 | elif isinstance(result, dict): 37 | res.append(result) 38 | 39 | # For everything else parse them 40 | else: 41 | res.append(GremlinParser._parse_dict(result)) 42 | return res 43 | 44 | @staticmethod 45 | def _parse_dict(data: Any) -> Any: 46 | d: dict[str, Any] = {} 47 | 48 | # If this is a list or Path then unwind it 49 | if isinstance(data, (list, gremlin.Path)): 50 | res = [] 51 | for x in data: 52 | res.append(GremlinParser._parse_dict(x)) 53 | return res 54 | 55 | # If this is an element then make it a dictionary 56 | if isinstance( 57 | data, 58 | ( 59 | gremlin.Vertex, 60 | gremlin.Edge, 61 | gremlin.VertexProperty, 62 | gremlin.Property, 63 | ), 64 | ): 65 | data = data.__dict__ 66 | 67 | # If this is a scalar then create a Map with it 68 | elif not hasattr(data, "__len__") or isinstance(data, str): 69 | data = {0: data} 70 | 71 | for k, v in data.items(): 72 | # If the key is a Vertex or an Edge do special processing 73 | if isinstance(k, (gremlin.Vertex, gremlin.Edge)): 74 | k = k.id # noqa: PLW2901 75 | 76 | # If the value is a list do special processing to make it a scalar if the list is of length 1 77 | if isinstance(v, list) and len(v) == 1: 78 | d[k] = v[0] 79 | else: 80 | d[k] = v 81 | 82 | # If the value is a Vertex or Edge do special processing 83 | if isinstance( 84 | data, 85 | ( 86 | gremlin.Vertex, 87 | gremlin.Edge, 88 | gremlin.VertexProperty, 89 | gremlin.Property, 90 | ), 91 | ): 92 | d[k] = d[k].__dict__ 93 | return d 94 | -------------------------------------------------------------------------------- /awswrangler/opensearch/__init__.py: -------------------------------------------------------------------------------- 1 | """Utilities Module for Amazon OpenSearch.""" 2 | 3 | from awswrangler.opensearch._read import search, search_by_sql 4 | from awswrangler.opensearch._utils import connect, create_collection 5 | from awswrangler.opensearch._write import create_index, delete_index, index_csv, index_df, index_documents, index_json 6 | 7 | __all__ = [ 8 | "connect", 9 | "create_collection", 10 | "create_index", 11 | "delete_index", 12 | "index_csv", 13 | "index_documents", 14 | "index_df", 15 | "index_json", 16 | "search", 17 | "search_by_sql", 18 | ] 19 | 
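Editor's note — a minimal, hedged usage sketch for the OpenSearch helpers exported above. The endpoint, index name, and DataFrame contents are hypothetical placeholders, and the parameter names (host, df, index, search_body) are assumed from awswrangler's public documentation rather than taken from this listing:
>>> import awswrangler as wr  # sketch only; the values below are made-up examples
>>> import pandas as pd
>>> client = wr.opensearch.connect(host="my-domain.us-east-1.es.amazonaws.com")  # hypothetical endpoint
>>> wr.opensearch.index_df(client, df=pd.DataFrame({"title": ["Titanic"]}), index="movies")  # hypothetical index
>>> wr.opensearch.search(client, index="movies", search_body={"query": {"match": {"title": "titanic"}}})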
-------------------------------------------------------------------------------- /awswrangler/pandas/__init__.py: -------------------------------------------------------------------------------- 1 | """Pandas "proxy" package.""" 2 | 3 | import logging 4 | from typing import TYPE_CHECKING 5 | 6 | from awswrangler._distributed import MemoryFormatEnum, memory_format 7 | 8 | if TYPE_CHECKING or memory_format.get() == MemoryFormatEnum.PANDAS: 9 | from pandas import * # noqa: F403 10 | 11 | # Explicit import because mypy doesn't support forward references to a star import 12 | from pandas import ( 13 | DataFrame, 14 | Series, 15 | concat, 16 | isna, 17 | isnull, 18 | json_normalize, 19 | notna, 20 | read_csv, 21 | read_excel, 22 | to_datetime, 23 | ) 24 | elif memory_format.get() == MemoryFormatEnum.MODIN: 25 | from modin.pandas import * # noqa: F403 26 | 27 | # Explicit import because mypy doesn't support forward references to a star import 28 | from modin.pandas import ( 29 | DataFrame, 30 | Series, 31 | concat, 32 | isna, 33 | isnull, 34 | json_normalize, 35 | notna, 36 | read_csv, 37 | read_excel, 38 | to_datetime, 39 | ) 40 | else: 41 | raise ImportError(f"Unknown memory format {memory_format}") 42 | 43 | _logger: logging.Logger = logging.getLogger(__name__) 44 | 45 | __all__ = [ 46 | "DataFrame", 47 | "Series", 48 | "concat", 49 | "isna", 50 | "isnull", 51 | "json_normalize", 52 | "notna", 53 | "read_csv", 54 | "read_excel", 55 | "to_datetime", 56 | ] 57 | -------------------------------------------------------------------------------- /awswrangler/py.typed: -------------------------------------------------------------------------------- 1 | # Marker file for PEP 561. 2 | -------------------------------------------------------------------------------- /awswrangler/quicksight/__init__.py: -------------------------------------------------------------------------------- 1 | """Amazon QuickSight Module.""" 2 | 3 | from awswrangler.quicksight._cancel import cancel_ingestion 4 | from awswrangler.quicksight._create import create_athena_data_source, create_athena_dataset, create_ingestion 5 | from awswrangler.quicksight._delete import ( 6 | delete_all_dashboards, 7 | delete_all_data_sources, 8 | delete_all_datasets, 9 | delete_all_templates, 10 | delete_dashboard, 11 | delete_data_source, 12 | delete_dataset, 13 | delete_template, 14 | ) 15 | from awswrangler.quicksight._describe import ( 16 | describe_dashboard, 17 | describe_data_source, 18 | describe_data_source_permissions, 19 | describe_dataset, 20 | describe_ingestion, 21 | ) 22 | from awswrangler.quicksight._get_list import ( 23 | get_dashboard_id, 24 | get_dashboard_ids, 25 | get_data_source_arn, 26 | get_data_source_arns, 27 | get_data_source_id, 28 | get_data_source_ids, 29 | get_dataset_id, 30 | get_dataset_ids, 31 | get_template_id, 32 | get_template_ids, 33 | list_dashboards, 34 | list_data_sources, 35 | list_datasets, 36 | list_group_memberships, 37 | list_groups, 38 | list_iam_policy_assignments, 39 | list_iam_policy_assignments_for_user, 40 | list_ingestions, 41 | list_templates, 42 | list_user_groups, 43 | list_users, 44 | ) 45 | 46 | __all__ = [ 47 | "get_dashboard_id", 48 | "get_dashboard_ids", 49 | "get_data_source_arn", 50 | "get_data_source_arns", 51 | "get_data_source_id", 52 | "get_data_source_ids", 53 | "get_dataset_id", 54 | "get_dataset_ids", 55 | "get_template_id", 56 | "get_template_ids", 57 | "list_dashboards", 58 | "list_data_sources", 59 | "list_datasets", 60 | "list_group_memberships", 61 | "list_groups", 62 | 
"list_iam_policy_assignments", 63 | "list_iam_policy_assignments_for_user", 64 | "list_ingestions", 65 | "list_templates", 66 | "list_user_groups", 67 | "list_users", 68 | "describe_dashboard", 69 | "describe_data_source", 70 | "describe_data_source_permissions", 71 | "describe_dataset", 72 | "describe_ingestion", 73 | "delete_all_dashboards", 74 | "delete_all_data_sources", 75 | "delete_all_datasets", 76 | "delete_all_templates", 77 | "delete_dashboard", 78 | "delete_data_source", 79 | "delete_dataset", 80 | "delete_template", 81 | "cancel_ingestion", 82 | "create_athena_data_source", 83 | "create_athena_dataset", 84 | "create_ingestion", 85 | ] 86 | -------------------------------------------------------------------------------- /awswrangler/quicksight/_cancel.py: -------------------------------------------------------------------------------- 1 | """Amazon QuickSight Cancel Module.""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | from typing import cast 7 | 8 | import boto3 9 | 10 | from awswrangler import _utils, exceptions, sts 11 | from awswrangler.quicksight._get_list import get_dataset_id 12 | 13 | _logger: logging.Logger = logging.getLogger(__name__) 14 | 15 | 16 | def cancel_ingestion( 17 | ingestion_id: str, 18 | dataset_name: str | None = None, 19 | dataset_id: str | None = None, 20 | account_id: str | None = None, 21 | boto3_session: boto3.Session | None = None, 22 | ) -> None: 23 | """Cancel an ongoing ingestion of data into SPICE. 24 | 25 | Note 26 | ---- 27 | You must pass a not None value for ``dataset_name`` or ``dataset_id`` argument. 28 | 29 | Parameters 30 | ---------- 31 | ingestion_id 32 | Ingestion ID. 33 | dataset_name 34 | Dataset name. 35 | dataset_id 36 | Dataset ID. 37 | account_id 38 | If None, the account ID will be inferred from your boto3 session. 39 | boto3_session 40 | The default boto3 session will be used if **boto3_session** is ``None``. 
41 | 42 | Examples 43 | -------- 44 | >>> import awswrangler as wr 45 | >>> wr.quicksight.cancel_ingestion(ingestion_id="...", dataset_name="...") 46 | """ 47 | if (dataset_name is None) and (dataset_id is None): 48 | raise exceptions.InvalidArgument("You must pass a not None name or dataset_id argument.") 49 | if account_id is None: 50 | account_id = sts.get_account_id(boto3_session=boto3_session) 51 | if (dataset_id is None) and (dataset_name is not None): 52 | dataset_id = get_dataset_id(name=dataset_name, account_id=account_id, boto3_session=boto3_session) 53 | client = _utils.client(service_name="quicksight", session=boto3_session) 54 | dataset_id = cast(str, dataset_id) 55 | client.cancel_ingestion(IngestionId=ingestion_id, AwsAccountId=account_id, DataSetId=dataset_id) 56 | -------------------------------------------------------------------------------- /awswrangler/quicksight/_utils.py: -------------------------------------------------------------------------------- 1 | """Internal (private) Amazon QuickSight Utilities Module.""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | from typing import Any, TypedDict 7 | 8 | import boto3 9 | from typing_extensions import NotRequired 10 | 11 | from awswrangler import _data_types, athena, catalog, exceptions 12 | from awswrangler.quicksight._get_list import list_data_sources 13 | 14 | _logger: logging.Logger = logging.getLogger(__name__) 15 | 16 | 17 | class _QuicksightPrincipalList(TypedDict): 18 | users: NotRequired[list[str]] 19 | groups: NotRequired[list[str]] 20 | 21 | 22 | def extract_athena_table_columns( 23 | database: str, table: str, boto3_session: boto3.Session | None 24 | ) -> list[dict[str, str]]: 25 | """Extract athena columns data types from table and raising an exception if not exist.""" 26 | dtypes: dict[str, str] | None = catalog.get_table_types(database=database, table=table, boto3_session=boto3_session) 27 | if dtypes is None: 28 | raise exceptions.InvalidArgument(f"{database}.{table} does not exist on Athena.") 29 | return [{"Name": name, "Type": _data_types.athena2quicksight(dtype=dtype)} for name, dtype in dtypes.items()] 30 | 31 | 32 | def extract_athena_query_columns( 33 | sql: str, data_source_arn: str, account_id: str, boto3_session: boto3.Session | None 34 | ) -> list[dict[str, str]]: 35 | """Extract athena columns data types from a SQL query.""" 36 | data_sources: list[dict[str, Any]] = list_data_sources(account_id=account_id, boto3_session=boto3_session) 37 | data_source: dict[str, Any] = [x for x in data_sources if x["Arn"] == data_source_arn][0] 38 | workgroup: str = data_source["DataSourceParameters"]["AthenaParameters"]["WorkGroup"] 39 | sql_wrapped: str = f"/* QuickSight */\nSELECT ds.* FROM ( {sql} ) ds LIMIT 0" 40 | query_id = athena.start_query_execution(sql=sql_wrapped, workgroup=workgroup, boto3_session=boto3_session) 41 | athena.wait_query(query_execution_id=query_id, boto3_session=boto3_session) 42 | dtypes: dict[str, str] = athena.get_query_columns_types(query_execution_id=query_id, boto3_session=boto3_session) 43 | return [{"Name": name, "Type": _data_types.athena2quicksight(dtype=dtype)} for name, dtype in dtypes.items()] 44 | -------------------------------------------------------------------------------- /awswrangler/redshift/__init__.py: -------------------------------------------------------------------------------- 1 | """Amazon Redshift Module.""" 2 | 3 | from awswrangler.redshift._connect import connect, connect_temp 4 | from awswrangler.redshift._read import 
read_sql_query, read_sql_table, unload, unload_to_files 5 | from awswrangler.redshift._write import copy, copy_from_files, to_sql 6 | 7 | __all__ = [ 8 | "connect", 9 | "connect_temp", 10 | "copy", 11 | "copy_from_files", 12 | "read_sql_query", 13 | "read_sql_table", 14 | "to_sql", 15 | "unload", 16 | "unload_to_files", 17 | ] 18 | -------------------------------------------------------------------------------- /awswrangler/s3/__init__.py: -------------------------------------------------------------------------------- 1 | """Amazon S3 Read Module.""" 2 | 3 | from awswrangler.s3._copy import copy_objects, merge_datasets 4 | from awswrangler.s3._delete import delete_objects 5 | from awswrangler.s3._describe import describe_objects, get_bucket_region, size_objects 6 | from awswrangler.s3._download import download 7 | from awswrangler.s3._list import does_object_exist, list_buckets, list_directories, list_objects 8 | from awswrangler.s3._read_deltalake import read_deltalake 9 | from awswrangler.s3._read_excel import read_excel 10 | from awswrangler.s3._read_orc import read_orc, read_orc_metadata, read_orc_table 11 | from awswrangler.s3._read_parquet import read_parquet, read_parquet_metadata, read_parquet_table 12 | from awswrangler.s3._read_text import read_csv, read_fwf, read_json 13 | from awswrangler.s3._select import select_query 14 | from awswrangler.s3._upload import upload 15 | from awswrangler.s3._wait import wait_objects_exist, wait_objects_not_exist 16 | from awswrangler.s3._write_deltalake import to_deltalake 17 | from awswrangler.s3._write_excel import to_excel 18 | from awswrangler.s3._write_orc import to_orc 19 | from awswrangler.s3._write_parquet import store_parquet_metadata, to_parquet 20 | from awswrangler.s3._write_text import to_csv, to_json 21 | 22 | __all__ = [ 23 | "copy_objects", 24 | "merge_datasets", 25 | "delete_objects", 26 | "describe_objects", 27 | "get_bucket_region", 28 | "size_objects", 29 | "does_object_exist", 30 | "list_buckets", 31 | "list_directories", 32 | "list_objects", 33 | "read_deltalake", 34 | "read_parquet", 35 | "read_parquet_metadata", 36 | "read_parquet_table", 37 | "read_orc", 38 | "read_orc_metadata", 39 | "read_orc_table", 40 | "read_csv", 41 | "read_fwf", 42 | "read_json", 43 | "wait_objects_exist", 44 | "wait_objects_not_exist", 45 | "select_query", 46 | "store_parquet_metadata", 47 | "to_parquet", 48 | "to_orc", 49 | "to_csv", 50 | "to_json", 51 | "to_deltalake", 52 | "to_excel", 53 | "read_excel", 54 | "download", 55 | "upload", 56 | ] 57 | -------------------------------------------------------------------------------- /awswrangler/s3/_download.py: -------------------------------------------------------------------------------- 1 | """Amazon S3 Download Module (PRIVATE).""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | from typing import Any, cast 7 | 8 | import boto3 9 | 10 | from awswrangler.s3._fs import open_s3_object 11 | 12 | _logger: logging.Logger = logging.getLogger(__name__) 13 | 14 | 15 | def download( 16 | path: str, 17 | local_file: str | Any, 18 | version_id: str | None = None, 19 | use_threads: bool | int = True, 20 | boto3_session: boto3.Session | None = None, 21 | s3_additional_kwargs: dict[str, Any] | None = None, 22 | ) -> None: 23 | """Download file from a received S3 path to local file. 24 | 25 | Note 26 | ---- 27 | In case of `use_threads=True` the number of threads 28 | that will be spawned will be gotten from os.cpu_count(). 
29 | 30 | Parameters 31 | ---------- 32 | path 33 | S3 path (e.g. ``s3://bucket/key0``). 34 | local_file 35 | A file-like object in binary mode or a path to local file (e.g. ``./local/path/to/key0``). 36 | version_id 37 | Version id of the object. 38 | use_threads 39 | True to enable concurrent requests, False to disable multiple threads. 40 | If enabled os.cpu_count() will be used as the max number of threads. 41 | If integer is provided, specified number is used. 42 | boto3_session 43 | Boto3 Session. The default boto3 session will be used if boto3_session receive None. 44 | s3_additional_kwargs 45 | Forward to botocore requests, only "SSECustomerAlgorithm", "SSECustomerKey" and "RequestPayer" 46 | arguments will be considered. 47 | 48 | Returns 49 | ------- 50 | None 51 | 52 | Examples 53 | -------- 54 | Downloading a file using a path to local file 55 | 56 | >>> import awswrangler as wr 57 | >>> wr.s3.download(path='s3://bucket/key', local_file='./key') 58 | 59 | Downloading a file using a file-like object 60 | 61 | >>> import awswrangler as wr 62 | >>> with open(file='./key', mode='wb') as local_f: 63 | >>> wr.s3.download(path='s3://bucket/key', local_file=local_f) 64 | 65 | """ 66 | _logger.debug("path: %s", path) 67 | with open_s3_object( 68 | path=path, 69 | mode="rb", 70 | use_threads=use_threads, 71 | version_id=version_id, 72 | s3_block_size=-1, # One shot download 73 | s3_additional_kwargs=s3_additional_kwargs, 74 | boto3_session=boto3_session, 75 | ) as s3_f: 76 | if isinstance(local_file, str): 77 | _logger.debug("Downloading local_file: %s", local_file) 78 | with open(file=local_file, mode="wb") as local_f: 79 | local_f.write(cast(bytes, s3_f.read())) 80 | else: 81 | _logger.debug("Downloading file-like object.") 82 | local_file.write(s3_f.read()) 83 | -------------------------------------------------------------------------------- /awswrangler/s3/_read_excel.py: -------------------------------------------------------------------------------- 1 | """Amazon S3 Excel Read Module (PRIVATE).""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | from typing import Any 7 | 8 | import boto3 9 | 10 | import awswrangler.pandas as pd 11 | from awswrangler import exceptions 12 | from awswrangler.s3._fs import open_s3_object 13 | 14 | _logger: logging.Logger = logging.getLogger(__name__) 15 | 16 | 17 | def read_excel( 18 | path: str, 19 | version_id: str | None = None, 20 | use_threads: bool | int = True, 21 | boto3_session: boto3.Session | None = None, 22 | s3_additional_kwargs: dict[str, Any] | None = None, 23 | **pandas_kwargs: Any, 24 | ) -> pd.DataFrame: 25 | """Read EXCEL file(s) from a received S3 path. 26 | 27 | Note 28 | ---- 29 | This function accepts any Pandas's read_excel() argument. 30 | https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_excel.html 31 | 32 | Note 33 | ---- 34 | Depending on the file extension ('xlsx', 'xls', 'odf'...), an additional library 35 | might have to be installed first. 36 | 37 | Note 38 | ---- 39 | In case of `use_threads=True` the number of threads 40 | that will be spawned will be gotten from os.cpu_count(). 41 | 42 | Parameters 43 | ---------- 44 | path 45 | S3 path (e.g. ``s3://bucket/key.xlsx``). 46 | version_id 47 | Version id of the object. 48 | use_threads 49 | True to enable concurrent requests, False to disable multiple threads. 50 | If enabled os.cpu_count() will be used as the max number of threads. 51 | If given an int will use the given amount of threads. 
52 | If integer is provided, specified number is used. 53 | boto3_session 54 | Boto3 Session. The default boto3 session will be used if boto3_session receive None. 55 | s3_additional_kwargs 56 | Forward to botocore requests, only "SSECustomerAlgorithm" and "SSECustomerKey" arguments will be considered. 57 | pandas_kwargs: 58 | KEYWORD arguments forwarded to pandas.read_excel(). You can NOT pass `pandas_kwargs` explicit, just add valid 59 | Pandas arguments in the function call and awswrangler will accept it. 60 | e.g. wr.s3.read_excel("s3://bucket/key.xlsx", na_rep="", verbose=True) 61 | https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_excel.html 62 | 63 | Returns 64 | ------- 65 | Pandas DataFrame. 66 | 67 | Examples 68 | -------- 69 | Reading an EXCEL file 70 | 71 | >>> import awswrangler as wr 72 | >>> df = wr.s3.read_excel('s3://bucket/key.xlsx') 73 | 74 | """ 75 | if "pandas_kwargs" in pandas_kwargs: 76 | raise exceptions.InvalidArgument( 77 | "You can NOT pass `pandas_kwargs` explicit, just add valid " 78 | "Pandas arguments in the function call and awswrangler will accept it." 79 | "e.g. wr.s3.read_excel('s3://bucket/key.xlsx', na_rep='', verbose=True)" 80 | ) 81 | with open_s3_object( 82 | path=path, 83 | mode="rb", 84 | version_id=version_id, 85 | use_threads=use_threads, 86 | s3_block_size=-1, # One shot download 87 | s3_additional_kwargs=s3_additional_kwargs, 88 | boto3_session=boto3_session, 89 | ) as f: 90 | _logger.debug("pandas_kwargs: %s", pandas_kwargs) 91 | return pd.read_excel(f, **pandas_kwargs) 92 | -------------------------------------------------------------------------------- /awswrangler/s3/_upload.py: -------------------------------------------------------------------------------- 1 | """Amazon S3 Upload Module (PRIVATE).""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | from typing import Any 7 | 8 | import boto3 9 | 10 | from awswrangler.s3._fs import open_s3_object 11 | 12 | _logger: logging.Logger = logging.getLogger(__name__) 13 | 14 | 15 | def upload( 16 | local_file: str | Any, 17 | path: str, 18 | use_threads: bool | int = True, 19 | boto3_session: boto3.Session | None = None, 20 | s3_additional_kwargs: dict[str, Any] | None = None, 21 | ) -> None: 22 | """Upload file from a local file to received S3 path. 23 | 24 | Note 25 | ---- 26 | In case of `use_threads=True` the number of threads 27 | that will be spawned will be gotten from os.cpu_count(). 28 | 29 | Parameters 30 | ---------- 31 | local_file 32 | A file-like object in binary mode or a path to local file (e.g. ``./local/path/to/key0``). 33 | path 34 | S3 path (e.g. ``s3://bucket/key0``). 35 | use_threads 36 | True to enable concurrent requests, False to disable multiple threads. 37 | If enabled os.cpu_count() will be used as the max number of threads. 38 | If integer is provided, specified number is used. 39 | boto3_session 40 | The default boto3 session will be used if boto3_session receive None. 41 | s3_additional_kwargs 42 | Forward to botocore requests, only "SSECustomerAlgorithm" and "SSECustomerKey" arguments will be considered. 
43 | 44 | Returns 45 | ------- 46 | None 47 | 48 | Examples 49 | -------- 50 | Uploading a file using a path to local file 51 | 52 | >>> import awswrangler as wr 53 | >>> wr.s3.upload(local_file='./key', path='s3://bucket/key') 54 | 55 | Uploading a file using a file-like object 56 | 57 | >>> import awswrangler as wr 58 | >>> with open(file='./key', mode='wb') as local_f: 59 | >>> wr.s3.upload(local_file=local_f, path='s3://bucket/key') 60 | 61 | """ 62 | _logger.debug("path: %s", path) 63 | with open_s3_object( 64 | path=path, 65 | mode="wb", 66 | use_threads=use_threads, 67 | s3_block_size=-1, # One shot download 68 | s3_additional_kwargs=s3_additional_kwargs, 69 | boto3_session=boto3_session, 70 | ) as s3_f: 71 | if isinstance(local_file, str): 72 | _logger.debug("Uploading local_file: %s", local_file) 73 | with open(file=local_file, mode="rb") as local_f: 74 | s3_f.write(local_f.read()) # type: ignore[arg-type] 75 | else: 76 | _logger.debug("Uploading file-like object.") 77 | s3_f.write(local_file.read()) 78 | -------------------------------------------------------------------------------- /awswrangler/s3/_write_concurrent.py: -------------------------------------------------------------------------------- 1 | """Amazon S3 Concurrent Write Module (PRIVATE).""" 2 | 3 | from __future__ import annotations 4 | 5 | import concurrent.futures 6 | import logging 7 | from typing import Any, Callable 8 | 9 | import pandas as pd 10 | 11 | from awswrangler import _utils 12 | 13 | _logger: logging.Logger = logging.getLogger(__name__) 14 | 15 | 16 | class _WriteProxy: 17 | def __init__(self, use_threads: bool | int): 18 | self._exec: concurrent.futures.ThreadPoolExecutor | None 19 | self._results: list[str] = [] 20 | self._cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) 21 | if self._cpus > 1: 22 | self._exec = concurrent.futures.ThreadPoolExecutor(max_workers=self._cpus) 23 | self._futures: list[Any] = [] 24 | else: 25 | self._exec = None 26 | 27 | @staticmethod 28 | def _caller(func: Callable[..., pd.DataFrame], *args: Any, func_kwargs: dict[str, Any]) -> pd.DataFrame: 29 | _logger.debug("Calling: %s", func) 30 | return func(*args, **func_kwargs) 31 | 32 | def write(self, func: Callable[..., list[str]], *args: Any, **func_kwargs: Any) -> None: 33 | """Write File.""" 34 | if self._exec is not None: 35 | _utils.block_waiting_available_thread(seq=self._futures, max_workers=self._cpus) 36 | _logger.debug("Submitting: %s", func) 37 | future = self._exec.submit( 38 | _WriteProxy._caller, 39 | func, 40 | *args, 41 | func_kwargs=func_kwargs, 42 | ) 43 | self._futures.append(future) 44 | else: 45 | self._results += func(*args, **func_kwargs) 46 | 47 | def close(self) -> list[str]: 48 | """Close the proxy.""" 49 | if self._exec is not None: 50 | for future in concurrent.futures.as_completed(self._futures): 51 | self._results += future.result() 52 | self._exec.shutdown(wait=True) 53 | return self._results 54 | -------------------------------------------------------------------------------- /awswrangler/s3/_write_excel.py: -------------------------------------------------------------------------------- 1 | """Amazon S3 Excel Write Module (PRIVATE).""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | from typing import Any 7 | 8 | import boto3 9 | import pandas as pd 10 | 11 | from awswrangler import exceptions 12 | from awswrangler.s3._fs import open_s3_object 13 | 14 | _logger: logging.Logger = logging.getLogger(__name__) 15 | 16 | 17 | def to_excel( 18 | df: pd.DataFrame, 
19 | path: str, 20 | boto3_session: boto3.Session | None = None, 21 | s3_additional_kwargs: dict[str, Any] | None = None, 22 | use_threads: bool | int = True, 23 | **pandas_kwargs: Any, 24 | ) -> str: 25 | """Write EXCEL file on Amazon S3. 26 | 27 | Note 28 | ---- 29 | This function accepts any Pandas's DataFrame.to_excel() argument. 30 | https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_excel.html 31 | 32 | Note 33 | ---- 34 | Depending on the file extension ('xlsx', 'xls', 'odf'...), an additional library 35 | might have to be installed first. 36 | 37 | Note 38 | ---- 39 | In case of `use_threads=True` the number of threads 40 | that will be spawned will be gotten from os.cpu_count(). 41 | 42 | Parameters 43 | ---------- 44 | df 45 | Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html 46 | path 47 | Amazon S3 path (e.g. s3://bucket/filename.xlsx). 48 | boto3_session 49 | Boto3 Session. The default boto3 Session will be used if boto3_session receive None. 50 | s3_additional_kwargs 51 | Forwarded to botocore requests. 52 | e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'} 53 | use_threads 54 | True to enable concurrent requests, False to disable multiple threads. 55 | If enabled os.cpu_count() will be used as the max number of threads. 56 | If integer is provided, specified number is used. 57 | pandas_kwargs 58 | KEYWORD arguments forwarded to pandas.DataFrame.to_excel(). You can NOT pass `pandas_kwargs` explicit, just add 59 | valid Pandas arguments in the function call and awswrangler will accept it. 60 | e.g. wr.s3.to_excel(df, path, na_rep="", index=False) 61 | https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_excel.html 62 | 63 | Returns 64 | ------- 65 | Written S3 path. 66 | 67 | Examples 68 | -------- 69 | Writing EXCEL file 70 | 71 | >>> import awswrangler as wr 72 | >>> import pandas as pd 73 | >>> wr.s3.to_excel(df, 's3://bucket/filename.xlsx') 74 | 75 | """ 76 | if "pandas_kwargs" in pandas_kwargs: 77 | raise exceptions.InvalidArgument( 78 | "You can NOT pass `pandas_kwargs` explicit, just add valid " 79 | "Pandas arguments in the function call and awswrangler will accept it." 80 | "e.g. wr.s3.to_excel(df, path, na_rep=" 81 | ", index=False)" 82 | ) 83 | with open_s3_object( 84 | path=path, 85 | mode="wb", 86 | use_threads=use_threads, 87 | s3_additional_kwargs=s3_additional_kwargs, 88 | boto3_session=boto3_session, 89 | ) as f: 90 | _logger.debug("pandas_kwargs: %s", pandas_kwargs) 91 | df.to_excel(f, **pandas_kwargs) 92 | return path 93 | -------------------------------------------------------------------------------- /awswrangler/secretsmanager.py: -------------------------------------------------------------------------------- 1 | """Secrets Manager module.""" 2 | 3 | from __future__ import annotations 4 | 5 | import base64 6 | import json 7 | import logging 8 | from typing import Any, Dict, cast 9 | 10 | import boto3 11 | 12 | from awswrangler import _utils 13 | 14 | _logger: logging.Logger = logging.getLogger(__name__) 15 | 16 | 17 | def get_secret(name: str, boto3_session: boto3.Session | None = None) -> str | bytes: 18 | """Get secret value. 19 | 20 | Parameters 21 | ---------- 22 | name 23 | Specifies the secret containing the version that you want to retrieve. 24 | You can specify either the Amazon Resource Name (ARN) or the friendly name of the secret. 
25 | boto3_session 26 | The default boto3 session will be used if **boto3_session** is ``None``. 27 | 28 | Returns 29 | ------- 30 | Secret value. 31 | 32 | Examples 33 | -------- 34 | >>> import awswrangler as wr 35 | >>> value = wr.secretsmanager.get_secret("my-secret") 36 | 37 | """ 38 | client = _utils.client(service_name="secretsmanager", session=boto3_session) 39 | response = client.get_secret_value(SecretId=name) 40 | if "SecretString" in response: 41 | return response["SecretString"] 42 | return base64.b64decode(response["SecretBinary"]) 43 | 44 | 45 | def get_secret_json(name: str, boto3_session: boto3.Session | None = None) -> dict[str, Any]: 46 | """Get JSON secret value. 47 | 48 | Parameters 49 | ---------- 50 | name 51 | Specifies the secret containing the version that you want to retrieve. 52 | You can specify either the Amazon Resource Name (ARN) or the friendly name of the secret. 53 | boto3_session 54 | The default boto3 session will be used if **boto3_session** is ``None``. 55 | 56 | Returns 57 | ------- 58 | Secret JSON value parsed as a dictionary. 59 | 60 | Examples 61 | -------- 62 | >>> import awswrangler as wr 63 | >>> value = wr.secretsmanager.get_secret_json("my-secret-with-json-content") 64 | 65 | """ 66 | value = get_secret(name=name, boto3_session=boto3_session) 67 | return cast(Dict[str, Any], json.loads(value)) 68 | -------------------------------------------------------------------------------- /awswrangler/sts.py: -------------------------------------------------------------------------------- 1 | """STS module.""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | 7 | import boto3 8 | 9 | from awswrangler import _utils 10 | 11 | _logger: logging.Logger = logging.getLogger(__name__) 12 | 13 | 14 | def get_account_id(boto3_session: boto3.Session | None = None) -> str: 15 | """Get Account ID. 16 | 17 | Parameters 18 | ---------- 19 | boto3_session 20 | The default boto3 session will be used if **boto3_session** is ``None``. 21 | 22 | Returns 23 | ------- 24 | Account ID. 25 | 26 | Examples 27 | -------- 28 | >>> import awswrangler as wr 29 | >>> account_id = wr.sts.get_account_id() 30 | 31 | """ 32 | return _utils.client(service_name="sts", session=boto3_session).get_caller_identity()["Account"] 33 | 34 | 35 | def get_current_identity_arn(boto3_session: boto3.Session | None = None) -> str: 36 | """Get current user/role ARN. 37 | 38 | Parameters 39 | ---------- 40 | boto3_session 41 | The default boto3 session will be used if **boto3_session** is ``None``. 42 | 43 | Returns 44 | ------- 45 | User/role ARN. 46 | 47 | Examples 48 | -------- 49 | >>> import awswrangler as wr 50 | >>> arn = wr.sts.get_current_identity_arn() 51 | 52 | """ 53 | return _utils.client(service_name="sts", session=boto3_session).get_caller_identity()["Arn"] 54 | 55 | 56 | def get_current_identity_name(boto3_session: boto3.Session | None = None) -> str: 57 | """Get current user/role name. 58 | 59 | Parameters 60 | ---------- 61 | boto3_session 62 | The default boto3 session will be used if **boto3_session** is ``None``. 63 | 64 | Returns 65 | ------- 66 | User/role name. 
67 | 68 | Examples 69 | -------- 70 | >>> import awswrangler as wr 71 | >>> name = wr.sts.get_current_identity_name() 72 | 73 | """ 74 | arn: str = get_current_identity_arn(boto3_session=boto3_session) 75 | name: str = arn.rpartition("/")[-1] 76 | return name 77 | -------------------------------------------------------------------------------- /awswrangler/timestream/__init__.py: -------------------------------------------------------------------------------- 1 | """Amazon Timestream Module.""" 2 | 3 | from awswrangler.timestream._create import create_database, create_table 4 | from awswrangler.timestream._delete import delete_database, delete_table 5 | from awswrangler.timestream._list import list_databases, list_tables 6 | from awswrangler.timestream._read import query, unload, unload_to_files 7 | from awswrangler.timestream._write import ( 8 | batch_load, 9 | batch_load_from_files, 10 | wait_batch_load_task, 11 | write, 12 | ) 13 | 14 | __all__ = [ 15 | "create_database", 16 | "create_table", 17 | "delete_database", 18 | "delete_table", 19 | "list_databases", 20 | "list_tables", 21 | "query", 22 | "write", 23 | "batch_load", 24 | "batch_load_from_files", 25 | "wait_batch_load_task", 26 | "unload_to_files", 27 | "unload", 28 | ] 29 | -------------------------------------------------------------------------------- /awswrangler/timestream/_delete.py: -------------------------------------------------------------------------------- 1 | """Amazon Timestream Delete Module.""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | 7 | import boto3 8 | 9 | from awswrangler import _utils 10 | 11 | _logger: logging.Logger = logging.getLogger(__name__) 12 | 13 | 14 | def delete_database( 15 | database: str, 16 | boto3_session: boto3.Session | None = None, 17 | ) -> None: 18 | """Delete a given Timestream database. This is an irreversible operation. 19 | 20 | After a database is deleted, the time series data from its tables cannot be recovered. 21 | 22 | All tables in the database must be deleted first, or a ValidationException error will be thrown. 23 | 24 | Due to the nature of distributed retries, 25 | the operation can return either success or a ResourceNotFoundException. 26 | Clients should consider them equivalent. 27 | 28 | Parameters 29 | ---------- 30 | database 31 | Database name. 32 | boto3_session 33 | The default boto3 session will be used if **boto3_session** is ``None``. 34 | 35 | Examples 36 | -------- 37 | Deleting a database 38 | 39 | >>> import awswrangler as wr 40 | >>> arn = wr.timestream.delete_database("MyDatabase") 41 | 42 | """ 43 | _logger.info("Deleting Timestream database %s", database) 44 | client = _utils.client(service_name="timestream-write", session=boto3_session) 45 | client.delete_database(DatabaseName=database) 46 | 47 | 48 | def delete_table( 49 | database: str, 50 | table: str, 51 | boto3_session: boto3.Session | None = None, 52 | ) -> None: 53 | """Delete a given Timestream table. 54 | 55 | This is an irreversible operation. 56 | 57 | After a Timestream database table is deleted, the time series data stored in the table cannot be recovered. 58 | 59 | Due to the nature of distributed retries, 60 | the operation can return either success or a ResourceNotFoundException. 61 | Clients should consider them equivalent. 62 | 63 | Parameters 64 | ---------- 65 | database 66 | Database name. 67 | table 68 | Table name. 69 | boto3_session 70 | The default boto3 session will be used if **boto3_session** is ``None``. 
71 | 72 | Examples 73 | -------- 74 | Deleting a table 75 | 76 | >>> import awswrangler as wr 77 | >>> arn = wr.timestream.delete_table("MyDatabase", "MyTable") 78 | 79 | """ 80 | _logger.info("Deleting Timestream table %s in database %s", table, database) 81 | client = _utils.client(service_name="timestream-write", session=boto3_session) 82 | client.delete_table(DatabaseName=database, TableName=table) 83 | -------------------------------------------------------------------------------- /awswrangler/timestream/_list.py: -------------------------------------------------------------------------------- 1 | """Amazon Timestream List Module.""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | 7 | import boto3 8 | 9 | from awswrangler import _utils 10 | 11 | _logger: logging.Logger = logging.getLogger(__name__) 12 | 13 | 14 | def list_databases( 15 | boto3_session: boto3.Session | None = None, 16 | ) -> list[str]: 17 | """ 18 | List all databases in timestream. 19 | 20 | Parameters 21 | ---------- 22 | boto3_session 23 | The default boto3 session will be used if **boto3_session** is ``None``. 24 | 25 | Returns 26 | ------- 27 | a list of available timestream databases. 28 | 29 | Examples 30 | -------- 31 | Querying the list of all available databases 32 | 33 | >>> import awswrangler as wr 34 | >>> wr.timestream.list_databases() 35 | ["database1", "database2"] 36 | 37 | """ 38 | client = _utils.client(service_name="timestream-write", session=boto3_session) 39 | 40 | response = client.list_databases() 41 | dbs: list[str] = [db["DatabaseName"] for db in response["Databases"]] 42 | while "NextToken" in response: 43 | response = client.list_databases(NextToken=response["NextToken"]) 44 | dbs += [db["DatabaseName"] for db in response["Databases"]] 45 | 46 | return dbs 47 | 48 | 49 | def list_tables(database: str | None = None, boto3_session: boto3.Session | None = None) -> list[str]: 50 | """ 51 | List tables in timestream. 52 | 53 | Parameters 54 | ---------- 55 | database 56 | Database name. If None, all tables in Timestream will be returned. Otherwise, only the tables inside the 57 | given database are returned. 58 | boto3_session 59 | The default boto3 session will be used if **boto3_session** is ``None``. 60 | 61 | Returns 62 | ------- 63 | A list of table names. 
64 | 65 | Examples 66 | -------- 67 | Listing all tables in timestream across databases 68 | 69 | >>> import awswrangler as wr 70 | >>> wr.timestream.list_tables() 71 | ["table1", "table2"] 72 | 73 | Listing all tables in timestream in a specific database 74 | 75 | >>> import awswrangler as wr 76 | >>> wr.timestream.list_tables(DatabaseName="database1") 77 | ["table1"] 78 | 79 | """ 80 | client = _utils.client(service_name="timestream-write", session=boto3_session) 81 | args = {} if database is None else {"DatabaseName": database} 82 | response = client.list_tables(**args) # type: ignore[arg-type] 83 | tables: list[str] = [tbl["TableName"] for tbl in response["Tables"]] 84 | while "NextToken" in response: 85 | response = client.list_tables(**args, NextToken=response["NextToken"]) # type: ignore[arg-type] 86 | tables += [tbl["TableName"] for tbl in response["Tables"]] 87 | 88 | return tables 89 | -------------------------------------------------------------------------------- /awswrangler/timestream/_read.pyi: -------------------------------------------------------------------------------- 1 | from typing import Any, Iterator, Literal, overload 2 | 3 | import boto3 4 | import pandas as pd 5 | 6 | @overload 7 | def query( 8 | sql: str, 9 | chunked: Literal[False] = ..., 10 | pagination_config: dict[str, Any] | None = ..., 11 | boto3_session: boto3.Session | None = ..., 12 | ) -> pd.DataFrame: ... 13 | @overload 14 | def query( 15 | sql: str, 16 | chunked: Literal[True], 17 | pagination_config: dict[str, Any] | None = ..., 18 | boto3_session: boto3.Session | None = ..., 19 | ) -> Iterator[pd.DataFrame]: ... 20 | @overload 21 | def query( 22 | sql: str, 23 | chunked: bool, 24 | pagination_config: dict[str, Any] | None = ..., 25 | boto3_session: boto3.Session | None = ..., 26 | ) -> pd.DataFrame | Iterator[pd.DataFrame]: ... 27 | def unload( 28 | sql: str, 29 | path: str, 30 | unload_format: Literal["CSV", "PARQUET"] | None = ..., 31 | compression: Literal["GZIP", "..."] | None = ..., 32 | partition_cols: list[str] | None = ..., 33 | encryption: Literal["SSE_KMS", "SSE_S3"] | None = ..., 34 | kms_key_id: str | None = ..., 35 | field_delimiter: str | None = ",", 36 | escaped_by: str | None = "\\", 37 | chunked: bool | int = False, 38 | keep_files: bool = False, 39 | use_threads: bool | int = True, 40 | boto3_session: boto3.Session | None = ..., 41 | s3_additional_kwargs: dict[str, str] | None = ..., 42 | pyarrow_additional_kwargs: dict[str, Any] | None = ..., 43 | ) -> pd.DataFrame | Iterator[pd.DataFrame]: ... 44 | def unload_to_files( 45 | sql: str, 46 | path: str, 47 | unload_format: Literal["CSV", "PARQUET"] | None = ..., 48 | compression: Literal["GZIP", "NONE"] | None = ..., 49 | partition_cols: list[str] | None = ..., 50 | encryption: Literal["SSE_KMS", "SSE_S3"] | None = ..., 51 | kms_key_id: str | None = ..., 52 | field_delimiter: str | None = ..., 53 | escaped_by: str | None = ..., 54 | boto3_session: boto3.Session | None = ..., 55 | ) -> None: ... 56 | -------------------------------------------------------------------------------- /building/build-docs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | pushd .. 
5 | rm -rf docs/build docs/source/stubs 6 | make -C docs/ html 7 | doc8 --ignore-path docs/source/stubs --max-line-length 120 docs/source 8 | -------------------------------------------------------------------------------- /building/build-lambda-layers.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | VERSION=$(poetry version --short) 5 | DIR_NAME=$(dirname "$PWD") 6 | 7 | PYTHON_VERSION=${1:-ALL} 8 | 9 | ARCH=$(arch) 10 | [ "${ARCH}" = "aarch64" ] && ARCH_SUFFIX="-arm64" # AWS Lambda, the name arm64 is used instead of aarch64 11 | 12 | if [[ $PYTHON_VERSION == "ALL" ]] 13 | then 14 | echo "Building Lambda Layers for AWS SDK for pandas ${VERSION} (ALL supported Python versions)" 15 | else 16 | echo "Building Lambda Layers for AWS SDK for pandas ${VERSION} (ONLY Python $PYTHON_VERSION)" 17 | fi 18 | 19 | pushd lambda 20 | 21 | # Building all related docker images 22 | ./build-docker-images.sh $PYTHON_VERSION 23 | 24 | # Python 3.9 25 | if [[ $PYTHON_VERSION == "ALL" || $PYTHON_VERSION == "3.9" ]] 26 | then 27 | docker run \ 28 | --volume "$DIR_NAME":/aws-sdk-pandas/ \ 29 | --workdir /aws-sdk-pandas/building/lambda \ 30 | --rm \ 31 | awswrangler-build-py39 \ 32 | build-lambda-layer.sh "${VERSION}-py3.9${ARCH_SUFFIX}" "ninja-build" 33 | fi 34 | 35 | # Python 3.10 36 | if [[ $PYTHON_VERSION == "ALL" || $PYTHON_VERSION == "3.10" ]] 37 | then 38 | docker run \ 39 | --volume "$DIR_NAME":/aws-sdk-pandas/ \ 40 | --workdir /aws-sdk-pandas/building/lambda \ 41 | --rm \ 42 | awswrangler-build-py310 \ 43 | build-lambda-layer.sh "${VERSION}-py3.10${ARCH_SUFFIX}" "ninja-build" 44 | fi 45 | 46 | # Python 3.11 47 | if [[ $PYTHON_VERSION == "ALL" || $PYTHON_VERSION == "3.11" ]] 48 | then 49 | docker run \ 50 | --volume "$DIR_NAME":/aws-sdk-pandas/ \ 51 | --workdir /aws-sdk-pandas/building/lambda \ 52 | --rm \ 53 | awswrangler-build-py311 \ 54 | build-lambda-layer.sh "${VERSION}-py3.11${ARCH_SUFFIX}" "ninja-build" 55 | fi 56 | 57 | # Python 3.12 58 | if [[ $PYTHON_VERSION == "ALL" || $PYTHON_VERSION == "3.12" ]] 59 | then 60 | docker run \ 61 | --volume "$DIR_NAME":/aws-sdk-pandas/ \ 62 | --workdir /aws-sdk-pandas/building/lambda \ 63 | --rm \ 64 | awswrangler-build-py312 \ 65 | build-lambda-layer.sh "${VERSION}-py3.12${ARCH_SUFFIX}" "ninja-build" 66 | fi 67 | 68 | # Python 3.13 69 | if [[ $PYTHON_VERSION == "ALL" || $PYTHON_VERSION == "3.13" ]] 70 | then 71 | docker run \ 72 | --volume "$DIR_NAME":/aws-sdk-pandas/ \ 73 | --workdir /aws-sdk-pandas/building/lambda \ 74 | --rm \ 75 | awswrangler-build-py313 \ 76 | build-lambda-layer.sh "${VERSION}-py3.13${ARCH_SUFFIX}" "ninja-build" 77 | fi 78 | -------------------------------------------------------------------------------- /building/build-wheel.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | pushd .. 
5 | rm -rf dist/*.whl 6 | poetry build -f wheel 7 | -------------------------------------------------------------------------------- /building/lambda/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG base_image 2 | ARG python_version=base 3 | 4 | FROM ${base_image} AS base 5 | 6 | RUN yum install -y \ 7 | boost-devel \ 8 | jemalloc-devel \ 9 | libxml2-devel \ 10 | libxslt-devel \ 11 | bison \ 12 | make \ 13 | gcc10 \ 14 | gcc10-c++ \ 15 | flex \ 16 | autoconf \ 17 | zip \ 18 | git \ 19 | ninja-build 20 | 21 | WORKDIR /root 22 | 23 | ENV CC=/usr/bin/gcc10-cc 24 | ENV CXX=/usr/bin/gcc10-c++ 25 | ENV LD=/usr/bin/gcc10-gcc 26 | 27 | RUN ln -s /usr/bin/gcc10-gcc /usr/bin/gcc 28 | RUN ln -s /usr/bin/gcc10-g++ /usr/bin/g++ 29 | RUN ln -s /usr/bin/gcc10-nm /usr/bin/nm 30 | RUN ln -s /usr/bin/gcc10-ar /usr/bin/ar 31 | RUN ln -s /usr/bin/gcc10-mpn /usr/bin/mpn 32 | RUN ln -s /usr/bin/gcc10-ld /usr/bin/ld 33 | 34 | FROM ${python_version} 35 | COPY pyproject.toml poetry.lock ./ 36 | 37 | # Setuptools is a build dependency of arrow and runtime dependency of some of our dependencies (mainly redshift-connector). 38 | # Remove when arrow version shipped with lambda layers and dependencies are updated. 39 | RUN pip3 install --upgrade pip wheel setuptools>=78.1.1 setuptools_scm>=8 40 | RUN pip3 install --upgrade urllib3==1.26.16 # temporary to avoid https://github.com/urllib3/urllib3/issues/2168 (TODO remove when the AL2 image updates to support OpenSSL 1.1.1+) 41 | # In new CMake 4, compatibility with CMake < 3.5 has been removed. 42 | # Unpin CMake when arrow version shipped with lambda layers is updated. 43 | RUN pip3 install --upgrade cmake==3.31.6 44 | RUN pip3 install --upgrade six cython hypothesis poetry 45 | ENV PIP_NO_BINARY="numpy,pandas" 46 | RUN poetry config virtualenvs.create false --local && poetry install --no-root --only main 47 | 48 | RUN rm -f pyproject.toml poetry.lock 49 | 50 | ENTRYPOINT ["/bin/sh"] 51 | -------------------------------------------------------------------------------- /building/lambda/Dockerfile.al2023: -------------------------------------------------------------------------------- 1 | ARG base_image 2 | ARG python_version=base 3 | 4 | FROM ${base_image} AS base 5 | 6 | RUN dnf install -y \ 7 | boost-devel \ 8 | jemalloc-devel \ 9 | libxml2-devel \ 10 | libxslt-devel \ 11 | bison \ 12 | make \ 13 | gcc \ 14 | gcc-c++ \ 15 | flex \ 16 | autoconf \ 17 | zip \ 18 | git \ 19 | ninja-build 20 | 21 | WORKDIR /root 22 | 23 | FROM ${python_version} 24 | COPY pyproject.toml poetry.lock ./ 25 | 26 | # Setuptools is a build dependency of arrow and runtime dependency of some of our dependencies (mainly redshift-connector). 27 | # Remove when arrow version shipped with lambda layers and dependencies are updated. 28 | RUN pip3 install --upgrade pip wheel setuptools>=78.1.1 setuptools_scm>=8 29 | # In new CMake 4, compatibility with CMake < 3.5 has been removed. 30 | # Unpin CMake when arrow version shipped with lambda layers is updated. 
31 | RUN pip3 install --upgrade cmake==3.31.6 32 | RUN pip3 install --upgrade six cython hypothesis poetry 33 | 34 | ENV PIP_NO_BINARY="numpy,pandas" 35 | RUN poetry config virtualenvs.create false --local && poetry install --no-root --only main 36 | 37 | RUN rm -f pyproject.toml poetry.lock 38 | 39 | ENTRYPOINT ["/bin/sh"] 40 | -------------------------------------------------------------------------------- /building/lambda/build-docker-images.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | cp ../../pyproject.toml . 5 | cp ../../poetry.lock . 6 | 7 | export DOCKER_BUILDKIT=1 8 | 9 | PYTHON_VERSION=${1:-ALL} 10 | 11 | # Python 3.9 12 | if [[ $PYTHON_VERSION == "ALL" || $PYTHON_VERSION == "3.9" ]] 13 | then 14 | docker build \ 15 | --pull \ 16 | --tag awswrangler-build-py39 \ 17 | --build-arg base_image=public.ecr.aws/lambda/python:3.9 \ 18 | . 19 | fi 20 | 21 | # Python 3.10 22 | if [[ $PYTHON_VERSION == "ALL" || $PYTHON_VERSION == "3.10" ]] 23 | then 24 | docker build \ 25 | --pull \ 26 | --tag awswrangler-build-py310 \ 27 | --build-arg base_image=public.ecr.aws/lambda/python:3.10 \ 28 | . 29 | fi 30 | 31 | # Python 3.11 32 | if [[ $PYTHON_VERSION == "ALL" || $PYTHON_VERSION == "3.11" ]] 33 | then 34 | docker build \ 35 | --pull \ 36 | --tag awswrangler-build-py311 \ 37 | --build-arg base_image=public.ecr.aws/lambda/python:3.11 \ 38 | . 39 | fi 40 | 41 | # Python 3.12 42 | if [[ $PYTHON_VERSION == "ALL" || $PYTHON_VERSION == "3.12" ]] 43 | then 44 | docker build \ 45 | --pull \ 46 | --tag awswrangler-build-py312 \ 47 | --build-arg base_image=public.ecr.aws/lambda/python:3.12 \ 48 | --file Dockerfile.al2023 \ 49 | . 50 | fi 51 | 52 | # Python 3.13 53 | if [[ $PYTHON_VERSION == "ALL" || $PYTHON_VERSION == "3.13" ]] 54 | then 55 | docker build \ 56 | --pull \ 57 | --tag awswrangler-build-py313 \ 58 | --build-arg base_image=public.ecr.aws/lambda/python:3.13 \ 59 | --file Dockerfile.al2023 \ 60 | . 
61 | fi 62 | 63 | rm -rf pyproject.toml poetry.lock 64 | -------------------------------------------------------------------------------- /building/lambda/build-lambda-layer.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | FILENAME="awswrangler-layer-${1}.zip" 5 | NINJA=${2} 6 | 7 | pushd /aws-sdk-pandas 8 | rm -rf python dist/pyarrow_files "dist/${FILENAME}" "${FILENAME}" 9 | popd 10 | 11 | rm -rf dist arrow 12 | 13 | export ARROW_HOME=$(pwd)/dist 14 | export ARROW_VERSION=20.0.0 15 | export LD_LIBRARY_PATH=$(pwd)/dist/lib:$LD_LIBRARY_PATH 16 | export CMAKE_PREFIX_PATH=$ARROW_HOME:$CMAKE_PREFIX_PATH 17 | export SETUPTOOLS_SCM_PRETEND_VERSION=$ARROW_VERSION 18 | 19 | git clone \ 20 | --depth 1 \ 21 | --branch "apache-arrow-${ARROW_VERSION}" \ 22 | --single-branch \ 23 | https://github.com/apache/arrow.git 24 | 25 | mkdir $ARROW_HOME 26 | mkdir arrow/cpp/build 27 | pushd arrow/cpp/build 28 | 29 | cmake \ 30 | -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ 31 | -DCMAKE_INSTALL_LIBDIR=lib \ 32 | -DARROW_PYTHON=ON \ 33 | -DARROW_PARQUET=ON \ 34 | -DARROW_DATASET=ON \ 35 | -DARROW_WITH_SNAPPY=ON \ 36 | -DARROW_WITH_ZLIB=ON \ 37 | -DARROW_FLIGHT=OFF \ 38 | -DARROW_GANDIVA=OFF \ 39 | -DARROW_ORC=OFF \ 40 | -DARROW_CSV=ON \ 41 | -DARROW_JSON=ON \ 42 | -DARROW_COMPUTE=ON \ 43 | -DARROW_FILESYSTEM=ON \ 44 | -DARROW_PLASMA=OFF \ 45 | -DARROW_WITH_BZ2=OFF \ 46 | -DARROW_WITH_ZSTD=OFF \ 47 | -DARROW_WITH_LZ4=OFF \ 48 | -DARROW_WITH_BROTLI=OFF \ 49 | -DARROW_BUILD_TESTS=OFF \ 50 | -GNinja \ 51 | .. 52 | 53 | eval $NINJA 54 | eval "${NINJA} install" 55 | 56 | popd 57 | 58 | pushd arrow/python 59 | 60 | export CMAKE_PREFIX_PATH=${ARROW_HOME}${CMAKE_PREFIX_PATH:+:${CMAKE_PREFIX_PATH}} 61 | export ARROW_PRE_0_15_IPC_FORMAT=0 62 | export PYARROW_WITH_HDFS=0 63 | export PYARROW_WITH_FLIGHT=0 64 | export PYARROW_WITH_GANDIVA=0 65 | export PYARROW_WITH_ORC=0 66 | export PYARROW_WITH_CUDA=0 67 | export PYARROW_WITH_PLASMA=0 68 | export PYARROW_WITH_PARQUET=1 69 | export PYARROW_WITH_DATASET=1 70 | export PYARROW_WITH_FILESYSTEM=1 71 | export PYARROW_WITH_CSV=1 72 | export PYARROW_WITH_JSON=1 73 | export PYARROW_WITH_COMPUTE=1 74 | 75 | python3 setup.py build_ext \ 76 | --build-type=release \ 77 | --bundle-arrow-cpp \ 78 | bdist_wheel 79 | 80 | pip3 install dist/pyarrow-*.whl -t /aws-sdk-pandas/dist/pyarrow_files 81 | 82 | popd 83 | 84 | pushd /aws-sdk-pandas 85 | 86 | pip3 install . --no-binary numpy,pandas -t ./python ".[redshift,mysql,postgres,gremlin,opensearch,openpyxl]" 87 | 88 | rm -rf python/pyarrow* 89 | rm -rf python/boto* 90 | rm -rf python/urllib3* 91 | rm -rf python/s3transfer* 92 | 93 | cp -r /aws-sdk-pandas/dist/pyarrow_files/pyarrow* python/ 94 | 95 | # Removing nonessential files 96 | find python -name '*.so' -type f -exec strip "{}" \; 97 | find python -wholename "*/tests/*" -type f -delete 98 | find python -regex '^.*\(__pycache__\|\.py[co]\)$' -delete 99 | 100 | zip -r9 "${FILENAME}" ./python 101 | mv "${FILENAME}" dist/ 102 | 103 | rm -rf python dist/pyarrow_files "${FILENAME}" 104 | 105 | popd 106 | 107 | rm -rf dist arrow 108 | -------------------------------------------------------------------------------- /building/publish.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | pushd .. 
5 | rm -fr dist 6 | poetry publish --build 7 | rm -fr dist 8 | -------------------------------------------------------------------------------- /building/update-glue-lib.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | pushd .. 5 | rm -fr awswrangler.zip 6 | 7 | # Zip the library 8 | git archive HEAD:awswrangler --format zip --prefix awswrangler/awswrangler/ --output awswrangler.zip 9 | 10 | # Upload the Zip file 11 | s3_location=$(aws cloudformation describe-stacks --stack-name aws-sdk-pandas-glueray --query "Stacks[0].Outputs[?OutputKey=='AWSSDKforpandasZIPLocation'].OutputValue" --output text) 12 | aws s3 cp awswrangler.zip $s3_location 13 | 14 | rm -fr awswrangler.zip 15 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/environment.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | dependencies: 4 | - python>=3 5 | - pandoc 6 | - ipykernel 7 | - pip 8 | - pip: 9 | - myst_parser 10 | - nbsphinx 11 | - nbsphinx-link 12 | - sphinx==7.1.2 13 | - sphinx-autodoc-typehints 14 | - sphinx_bootstrap_theme 15 | - sphinx-copybutton 16 | - IPython 17 | - .. 
18 | -------------------------------------------------------------------------------- /docs/source/_ext/copy_adr.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | from pathlib import Path 3 | 4 | 5 | def setup(app): 6 | file_dir = Path(__file__).parent 7 | 8 | source_dir = file_dir.joinpath("../../../adr").resolve() 9 | destination_dir = file_dir.joinpath("../adr/").resolve() 10 | 11 | for file in source_dir.glob("*.md"): 12 | shutil.copy(file, destination_dir) 13 | -------------------------------------------------------------------------------- /docs/source/_ext/copy_tutorials.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | 4 | 5 | def setup(app): 6 | file_dir = Path(__file__).parent 7 | for f in file_dir.joinpath("../../../tutorials").glob("*.ipynb"): 8 | with open(file_dir.joinpath(f"../tutorials/{f.stem}.nblink"), "w") as output_file: 9 | nb_link = {"path": f"../../../tutorials/{f.name}", "extra-media": ["../../../tutorials/_static"]} 10 | json.dump(nb_link, output_file) 11 | -------------------------------------------------------------------------------- /docs/source/_static/aws_lambda_managed_layer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/docs/source/_static/aws_lambda_managed_layer.png -------------------------------------------------------------------------------- /docs/source/_static/css/max_width.css: -------------------------------------------------------------------------------- 1 | div.body { 2 | max-width: 90%; 3 | } 4 | -------------------------------------------------------------------------------- /docs/source/_static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/docs/source/_static/favicon.ico -------------------------------------------------------------------------------- /docs/source/_static/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/docs/source/_static/logo.png -------------------------------------------------------------------------------- /docs/source/_static/logo2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/docs/source/_static/logo2.png -------------------------------------------------------------------------------- /docs/source/_static/logo_transparent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/docs/source/_static/logo_transparent.png -------------------------------------------------------------------------------- /docs/source/_static/logo_transparent_small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/docs/source/_static/logo_transparent_small.png -------------------------------------------------------------------------------- /docs/source/_static/ssm_public_parameters.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/docs/source/_static/ssm_public_parameters.png -------------------------------------------------------------------------------- /docs/source/_templates/globaltoc.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/docs/source/_templates/globaltoc.html -------------------------------------------------------------------------------- /docs/source/_templates/typed-dict-template.rst: -------------------------------------------------------------------------------- 1 | {{ objname }} 2 | {{ underline }} 3 | 4 | .. currentmodule:: {{ module }} 5 | 6 | .. autoclass:: {{ objname }} 7 | :show-inheritance: 8 | 9 | {% block attributes_summary %} 10 | {% if attributes %} 11 | 12 | .. rubric:: Attributes 13 | 14 | .. autosummary:: 15 | {% for item in attributes %} 16 | ~{{ name }}.{{ item }} 17 | {%- endfor %} 18 | 19 | {% endif %} 20 | {% endblock %} 21 | 22 | {% block methods_documentation %} 23 | {% if methods %} 24 | 25 | .. rubric:: Attributes Documentation 26 | 27 | {% for item in attributes %} 28 | .. autoattribute:: {{ item }} 29 | {%- endfor %} 30 | 31 | {% endif %} 32 | {% endblock %} 33 | -------------------------------------------------------------------------------- /docs/source/about.rst: -------------------------------------------------------------------------------- 1 | What is AWS SDK for pandas? 2 | ============================ 3 | 4 | An `AWS Professional Service `_ `open source `_ Python initiative that extends the power of the `pandas `_ library to AWS, connecting **DataFrames** and AWS data & analytics services. 5 | 6 | Easy integration with Athena, Glue, Redshift, Timestream, OpenSearch, Neptune, QuickSight, Chime, CloudWatch Logs, 7 | DynamoDB, EMR, Secrets Manager, PostgreSQL, MySQL, SQL Server and S3 (Parquet, CSV, JSON and Excel). 8 | 9 | Built on top of other open-source projects like `Pandas `_, `Apache Arrow `_ and `Boto3 `_, it offers abstracted functions to execute your usual ETL tasks like loading/unloading data from **Data Lakes**, **Data Warehouses** and **Databases**, even `at scale `_. 10 | 11 | Check our `tutorials `_ or the `list of functionalities `_. 12 | -------------------------------------------------------------------------------- /docs/source/adr.rst: -------------------------------------------------------------------------------- 1 | Architectural Decision Records 2 | ============================== 3 | 4 | A collection of records for "architecturally significant" decisions: 5 | those that affect the structure, non-functional characteristics, dependencies, interfaces, or construction techniques. 6 | 7 | These decisions are made by the team that maintains *AWS SDK for pandas*. 8 | However, suggestions can be submitted by any contributor via issues or pull requests. 9 | 10 | .. note:: You can also find all ADRs on `GitHub `_. 11 | 12 | ..
toctree:: 13 | :maxdepth: 1 14 | :glob: 15 | 16 | adr/* 17 | -------------------------------------------------------------------------------- /docs/source/adr/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | An `AWS Professional Service `_ open source initiative | aws-proserve-opensource@amazon.com 2 | 3 | Quick Start 4 | ----------- 5 | 6 | >>> pip install awswrangler 7 | 8 | >>> # Optional modules are installed with: 9 | >>> pip install 'awswrangler[redshift]' 10 | 11 | .. code-block:: py3 12 | 13 | import awswrangler as wr 14 | import pandas as pd 15 | from datetime import datetime 16 | 17 | df = pd.DataFrame({"id": [1, 2], "value": ["foo", "boo"]}) 18 | 19 | # Storing data on the Data Lake 20 | wr.s3.to_parquet( 21 | df=df, 22 | path="s3://bucket/dataset/", 23 | dataset=True, 24 | database="my_db", 25 | table="my_table" 26 | ) 27 | 28 | # Retrieving the data directly from Amazon S3 29 | df = wr.s3.read_parquet("s3://bucket/dataset/", dataset=True) 30 | 31 | # Retrieving the data from Amazon Athena 32 | df = wr.athena.read_sql_query("SELECT * FROM my_table", database="my_db") 33 | 34 | # Get a Redshift connection from the Glue Catalog and retrieve data from Redshift Spectrum 35 | con = wr.redshift.connect("my-glue-connection") 36 | df = wr.redshift.read_sql_query("SELECT * FROM external_schema.my_table", con=con) 37 | con.close() 38 | 39 | # Amazon Timestream Write 40 | df = pd.DataFrame({ 41 | "time": [datetime.now(), datetime.now()], 42 | "my_dimension": ["foo", "boo"], 43 | "measure": [1.0, 1.1], 44 | }) 45 | rejected_records = wr.timestream.write(df, 46 | database="sampleDB", 47 | table="sampleTable", 48 | time_col="time", 49 | measure_col="measure", 50 | dimensions_cols=["my_dimension"], 51 | ) 52 | 53 | # Amazon Timestream Query 54 | wr.timestream.query(""" 55 | SELECT time, measure_value::double, my_dimension 56 | FROM "sampleDB"."sampleTable" ORDER BY time DESC LIMIT 3 57 | """) 58 | 59 | Read The Docs 60 | ------------- 61 | 62 | .. toctree:: 63 | :maxdepth: 2 64 | 65 | about 66 | install 67 | scale 68 | tutorials 69 | adr 70 | api 71 | Community Resources 72 | Logging 73 | Who uses AWS SDK for pandas? 74 | License 75 | Contributing 76 | 77 | .. image:: https://d3tiqpr4kkkomd.cloudfront.net/img/pixel.png?asset=RIXAH6KDSYAI1HHEBLTY 78 | :align: left 79 | -------------------------------------------------------------------------------- /docs/source/tutorials.rst: -------------------------------------------------------------------------------- 1 | Tutorials 2 | ========= 3 | 4 | .. note:: You can also find all Tutorial Notebooks on `GitHub `_. 5 | 6 | .. toctree:: 7 | :maxdepth: 1 8 | :glob: 9 | 10 | tutorials/* 11 | -------------------------------------------------------------------------------- /docs/source/tutorials/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /fix.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | ruff format . 5 | ruff check --fix .
-------------------------------------------------------------------------------- /test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | versions=${1:-ALL} 3 | posargs=${2:-32} 4 | SECONDS=0 5 | 6 | set -e 7 | 8 | mkdir -p test-reports 9 | tox -e ${versions} -- ${posargs} 10 | if [ $versions = "ALL" ]; then 11 | coverage html --directory coverage 12 | rm -rf .coverage* Running 2> /dev/null 13 | fi 14 | 15 | duration=$SECONDS 16 | echo "$(($duration / 60)) minutes and $(($duration % 60)) seconds elapsed." 17 | -------------------------------------------------------------------------------- /test_infra/app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | 4 | from aws_cdk import App, Environment 5 | from stacks.base_stack import BaseStack 6 | from stacks.cleanrooms_stack import CleanRoomsStack 7 | from stacks.databases_stack import DatabasesStack 8 | from stacks.glueray_stack import GlueRayStack 9 | from stacks.opensearch_stack import OpenSearchStack 10 | 11 | app = App() 12 | 13 | env = {"env": Environment(account=os.environ["CDK_DEFAULT_ACCOUNT"], region=os.environ["CDK_DEFAULT_REGION"])} 14 | 15 | base = BaseStack( 16 | app, 17 | "aws-sdk-pandas-base", 18 | **env, 19 | ) 20 | 21 | DatabasesStack( 22 | app, 23 | "aws-sdk-pandas-databases", 24 | base.get_vpc, 25 | base.get_bucket, 26 | base.get_key, 27 | **env, 28 | ) 29 | 30 | OpenSearchStack( 31 | app, 32 | "aws-sdk-pandas-opensearch", 33 | base.get_vpc, 34 | base.get_bucket, 35 | base.get_key, 36 | **env, 37 | ) 38 | 39 | GlueRayStack( 40 | app, 41 | "aws-sdk-pandas-glueray", 42 | base.get_bucket, 43 | **env, 44 | ) 45 | 46 | CleanRoomsStack( 47 | app, 48 | "aws-sdk-pandas-cleanrooms", 49 | base.get_bucket, 50 | **env, 51 | ) 52 | 53 | app.synth() 54 | -------------------------------------------------------------------------------- /test_infra/cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "python3 app.py", 3 | "watch": { 4 | "include": [ 5 | "**" 6 | ], 7 | "exclude": [ 8 | "README.md", 9 | "cdk*.json", 10 | "requirements*.txt", 11 | "source.bat", 12 | "**/__init__.py", 13 | "python/__pycache__", 14 | "tests" 15 | ] 16 | }, 17 | "context": { 18 | "@aws-cdk/aws-apigateway:usagePlanKeyOrderInsensitiveId": true, 19 | "@aws-cdk/core:stackRelativeExports": true, 20 | "@aws-cdk/aws-rds:lowercaseDbIdentifier": true, 21 | "@aws-cdk/aws-lambda:recognizeVersionProps": true, 22 | "@aws-cdk/aws-cloudfront:defaultSecurityPolicyTLSv1.2_2021": true, 23 | "@aws-cdk-containers/ecs-service-extensions:enableDefaultLogDriver": true, 24 | "@aws-cdk/aws-ec2:uniqueImdsv2TemplateName": true, 25 | "@aws-cdk/aws-iam:minimizePolicies": true, 26 | "@aws-cdk/core:target-partitions": [ 27 | "aws", 28 | "aws-cn" 29 | ], 30 | "databases": { 31 | "redshift": true, 32 | "postgresql": true, 33 | "mysql": true, 34 | "sqlserver": false, 35 | "oracle": false, 36 | "neptune": false 37 | }, 38 | "network": "public" 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /test_infra/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "awswrangler - test infrastructure" 3 | version = "3.12.0" 4 | description = "CDK test infrastructure for AWS SDK for pandas" 5 | authors = ["Amazon Web Services"] 6 | license = "Apache License 2.0" 7 | 8 | 
[tool.poetry.dependencies] 9 | python = ">=3.9, <4.0" 10 | "aws-cdk-lib" = "^2.188.0" 11 | "aws-cdk.aws-glue-alpha" = "^2.188.0a0" 12 | "aws-cdk.aws-neptune-alpha" = "^2.188.0a0" 13 | "aws-cdk.aws-redshift-alpha" = "^2.188.0a0" 14 | -------------------------------------------------------------------------------- /test_infra/scripts/delete-stack.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | STACK=${1} 4 | 5 | pushd .. 6 | cdk destroy aws-sdk-pandas-${STACK} 7 | popd -------------------------------------------------------------------------------- /test_infra/scripts/deploy-stack.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | STACK=${1} 4 | 5 | pushd .. 6 | cdk bootstrap 7 | cdk deploy aws-sdk-pandas-${STACK} 8 | popd -------------------------------------------------------------------------------- /test_infra/scripts/security-group-databases-add-local-ip.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | # Get my current IP address 5 | LOCALIP=`host myip.opendns.com resolver1.opendns.com | grep myip | awk '{print $4}'` 6 | 7 | # Get security group ID 8 | SGID=`aws cloudformation describe-stacks --stack-name aws-sdk-pandas-databases --query "Stacks[0].Outputs[?OutputKey=='DatabaseSecurityGroupId'].OutputValue" --output text` 9 | 10 | # Update Security Group with local ip 11 | aws ec2 authorize-security-group-ingress \ 12 | --group-id ${SGID} \ 13 | --protocol all \ 14 | --port -1 \ 15 | --cidr ${LOCALIP}/32 16 | -------------------------------------------------------------------------------- /test_infra/scripts/security-group-databases-check.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | # Get security group ID 5 | SGID=`aws cloudformation describe-stacks --stack-name aws-sdk-pandas-databases --query "Stacks[0].Outputs[?OutputKey=='DatabaseSecurityGroupId'].OutputValue" --output text` 6 | 7 | # Check to see current setting 8 | aws ec2 describe-security-groups --group-id ${SGID} 9 | -------------------------------------------------------------------------------- /test_infra/source.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem The sole purpose of this script is to make the command 4 | rem 5 | rem source .venv/bin/activate 6 | rem 7 | rem (which activates a Python virtualenv on Linux or Mac OS X) work on Windows. 8 | rem On Windows, this command just runs this batch file (the argument is ignored). 9 | rem 10 | rem Now we don't need to document a Windows command for activating a virtualenv. 
11 | 12 | echo Executing .venv\Scripts\activate.bat for you 13 | .venv\Scripts\activate.bat 14 | -------------------------------------------------------------------------------- /test_infra/stacks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/test_infra/stacks/__init__.py -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/tests/__init__.py -------------------------------------------------------------------------------- /tests/benchmark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/tests/benchmark/__init__.py -------------------------------------------------------------------------------- /tests/glue_scripts/ray_read_small_parquet.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import ray 4 | 5 | import awswrangler as wr 6 | 7 | paths = wr.s3.list_objects(f"s3://{os.environ['data-gen-bucket']}/parquet/small/partitioned/") 8 | ray.data.read_parquet_bulk(paths=paths, override_num_blocks=1000).to_modin() 9 | -------------------------------------------------------------------------------- /tests/glue_scripts/wrangler_blog_simple.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import awswrangler as wr 4 | 5 | workgroup_name = os.environ["athena-workgroup"] 6 | output_path = os.environ["output-path"] 7 | glue_database = os.environ["glue-database"] 8 | glue_table = os.environ["glue-table"] 9 | 10 | # Read 1.5 Gb Parquet data 11 | df = wr.s3.read_parquet(path="s3://ursa-labs-taxi-data/2017/") 12 | 13 | # Drop vendor_id column 14 | df.drop("vendor_id", axis=1, inplace=True) 15 | 16 | # Filter trips over 1 mile 17 | df1 = df[df["trip_distance"] > 1] 18 | 19 | # Write partitioned trips to S3 in Parquet format 20 | wr.s3.to_parquet( 21 | df1, 22 | path=f"{output_path}output/{glue_table}/", 23 | partition_cols=["passenger_count", "payment_type"], 24 | dataset=True, 25 | database=glue_database, 26 | table=glue_table, 27 | ) 28 | 29 | # Read the data back to a modin df via Athena 30 | df1_athena = wr.athena.read_sql_query( 31 | f"SELECT * FROM {glue_table}", 32 | database=glue_database, 33 | ctas_approach=False, 34 | unload_approach=True, 35 | workgroup=workgroup_name, 36 | s3_output=f"{output_path}unload/{glue_table}/", 37 | ) 38 | 39 | # Delete table (required due to LF) 40 | wr.catalog.delete_table_if_exists(database=glue_database, table=glue_table) 41 | -------------------------------------------------------------------------------- /tests/glue_scripts/wrangler_read_small_parquet.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import awswrangler as wr 4 | 5 | wr.s3.read_parquet( 6 | path=f"s3://{os.environ['data-gen-bucket']}/parquet/small/partitioned/", 7 | ray_args={"override_num_blocks": 1000, "bulk_read": True}, 8 | ) 9 | -------------------------------------------------------------------------------- /tests/glue_scripts/wrangler_write_partitioned_parquet.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | 3 | import awswrangler as wr 4 | 5 | df = wr.s3.read_parquet( 6 | path=f"s3://{os.environ['data-gen-bucket']}/parquet/medium/partitioned/", 7 | ray_args={"override_num_blocks": 1000}, 8 | ) 9 | 10 | wr.s3.to_parquet( 11 | df=df, 12 | path=os.environ["output-path"], 13 | dataset=True, 14 | partition_cols=["payment_type", "passenger_count"], 15 | ) 16 | -------------------------------------------------------------------------------- /tests/load/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/tests/load/__init__.py -------------------------------------------------------------------------------- /tests/load/conftest.py: -------------------------------------------------------------------------------- 1 | import modin.pandas as pd 2 | import pytest 3 | import ray 4 | from pyarrow import csv 5 | 6 | import awswrangler as wr 7 | 8 | 9 | @pytest.fixture(scope="function") 10 | def df_timestream() -> pd.DataFrame: 11 | # Data frame with 126_000 rows 12 | return ( 13 | ray.data.read_csv( 14 | "https://raw.githubusercontent.com/awslabs/amazon-timestream-tools/mainline/sample_apps/data/sample.csv", 15 | **{ 16 | "read_options": csv.ReadOptions( 17 | column_names=[ 18 | "ignore0", 19 | "region", 20 | "ignore1", 21 | "az", 22 | "ignore2", 23 | "hostname", 24 | "measure_kind", 25 | "measure", 26 | "ignore3", 27 | "ignore4", 28 | "ignore5", 29 | ] 30 | ) 31 | }, 32 | ) 33 | .to_modin() 34 | .loc[:, ["region", "az", "hostname", "measure_kind", "measure"]] 35 | ) 36 | 37 | 38 | @pytest.fixture(scope="function") 39 | def df_s() -> pd.DataFrame: 40 | # Data frame with 100000 rows 41 | return wr.s3.read_parquet(path="s3://ursa-labs-taxi-data/2010/02/data.parquet") 42 | 43 | 44 | @pytest.fixture(scope="function") 45 | def df_xl() -> pd.DataFrame: 46 | # Data frame with 8759874 rows 47 | return wr.s3.read_parquet(path="s3://ursa-labs-taxi-data/2018/01/data.parquet") 48 | 49 | 50 | @pytest.fixture(scope="function") 51 | def big_modin_df() -> pd.DataFrame: 52 | pandas_refs = ray.data.range(100_000).to_pandas_refs() 53 | dataset = ray.data.from_pandas_refs(pandas_refs) 54 | 55 | frame = dataset.to_modin() 56 | frame["foo"] = frame.id * 2 57 | frame["bar"] = frame.id % 2 58 | 59 | return frame 60 | -------------------------------------------------------------------------------- /tests/load/test_dynamodb.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import random 4 | from typing import Any 5 | 6 | import boto3 7 | import modin.pandas as pd 8 | import pytest 9 | import ray 10 | 11 | import awswrangler as wr 12 | 13 | from .._utils import ExecutionTimer 14 | 15 | 16 | def _generate_item(id: int) -> dict[str, Any]: 17 | return { 18 | "id": str(id), 19 | "year": random.randint(1923, 2023), 20 | "title": f"{random.randrange(16**6):06x}", 21 | } 22 | 23 | 24 | def _fill_dynamodb_table(table_name: str, num_objects: int) -> None: 25 | dynamodb_resource = boto3.resource("dynamodb") 26 | table = dynamodb_resource.Table(table_name) 27 | 28 | with table.batch_writer() as writer: 29 | for i in range(num_objects): 30 | item = _generate_item(i) 31 | writer.put_item(Item=item) 32 | 33 | 34 | def create_big_modin_df(table_size: int, num_blocks: int | None) -> pd.DataFrame: 35 | pandas_refs = 
ray.data.range(table_size).to_pandas_refs() 36 | dataset = ray.data.from_pandas_refs(pandas_refs) 37 | 38 | if num_blocks: 39 | dataset = dataset.repartition(num_blocks=num_blocks) 40 | 41 | frame = dataset.to_modin() 42 | frame["foo"] = frame.id * 2 43 | frame["bar"] = frame.id % 2 44 | 45 | return frame 46 | 47 | 48 | @pytest.mark.parametrize( 49 | "params", 50 | [ 51 | { 52 | "KeySchema": [{"AttributeName": "id", "KeyType": "HASH"}, {"AttributeName": "year", "KeyType": "RANGE"}], 53 | "AttributeDefinitions": [ 54 | {"AttributeName": "id", "AttributeType": "S"}, 55 | {"AttributeName": "year", "AttributeType": "N"}, 56 | ], 57 | } 58 | ], 59 | ) 60 | def test_dynamodb_read(params: dict[str, Any], dynamodb_table: str, request: pytest.FixtureRequest) -> None: 61 | benchmark_time = 30 62 | num_objects = 50_000 63 | 64 | _fill_dynamodb_table(dynamodb_table, num_objects) 65 | 66 | with ExecutionTimer(request) as timer: 67 | frame = wr.dynamodb.read_items(table_name=dynamodb_table, allow_full_scan=True) 68 | 69 | assert len(frame) == num_objects 70 | assert timer.elapsed_time < benchmark_time 71 | 72 | 73 | @pytest.mark.parametrize( 74 | "params", 75 | [ 76 | { 77 | "KeySchema": [{"AttributeName": "id", "KeyType": "HASH"}], 78 | "AttributeDefinitions": [ 79 | {"AttributeName": "id", "AttributeType": "N"}, 80 | ], 81 | } 82 | ], 83 | ) 84 | @pytest.mark.parametrize("num_blocks", [2, 4, 8, None]) 85 | def test_dynamodb_write( 86 | params: dict[str, Any], 87 | num_blocks: int, 88 | dynamodb_table: str, 89 | request: pytest.FixtureRequest, 90 | ) -> None: 91 | benchmark_time = 30 92 | big_modin_df = create_big_modin_df(25_000, num_blocks) 93 | 94 | with ExecutionTimer(request) as timer: 95 | wr.dynamodb.put_df(df=big_modin_df, table_name=dynamodb_table, use_threads=4) 96 | 97 | assert timer.elapsed_time < benchmark_time 98 | 99 | df_out = wr.dynamodb.read_items(dynamodb_table, allow_full_scan=True) 100 | assert len(df_out) == len(big_modin_df) 101 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/tests/unit/__init__.py -------------------------------------------------------------------------------- /tests/unit/test_athena_geospatial.py: -------------------------------------------------------------------------------- 1 | import geopandas 2 | import pandas as pd 3 | import shapely 4 | 5 | import awswrangler as wr 6 | 7 | 8 | def test_athena_geospatial(path, glue_table, glue_database): 9 | df = wr.athena.read_sql_query( 10 | """ 11 | SELECT 12 | 1 AS value 13 | , ST_Point(-121.7602, 46.8527) AS point 14 | , ST_LineFromText('LINESTRING(1 2, 3 4)') AS line 15 | , ST_Polygon('POLYGON ((1 1, 1 4, 4 4, 4 1))') AS polygon 16 | , ST_Polygon('POLYGON EMPTY') AS polygon_empty 17 | """, 18 | database=glue_database, 19 | ctas_approach=False, 20 | ) 21 | 22 | assert isinstance(df, geopandas.GeoDataFrame) 23 | 24 | assert isinstance(df["value"], pd.Series) 25 | assert isinstance(df["point"], geopandas.GeoSeries) 26 | assert isinstance(df["line"], geopandas.GeoSeries) 27 | assert isinstance(df["polygon"], geopandas.GeoSeries) 28 | assert isinstance(df["polygon_empty"], geopandas.GeoSeries) 29 | 30 | assert isinstance(df["point"][0], shapely.geometry.point.Point) 31 | assert isinstance(df["line"][0], shapely.geometry.linestring.LineString) 32 | assert isinstance(df["polygon"][0], 
shapely.geometry.polygon.Polygon) 33 | assert isinstance(df["polygon_empty"][0], shapely.geometry.polygon.Polygon) 34 | -------------------------------------------------------------------------------- /tests/unit/test_athena_spark.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import awswrangler as wr 4 | from tests._utils import create_workgroup 5 | 6 | 7 | @pytest.fixture(scope="session") 8 | def athena_spark_execution_role_arn(cloudformation_outputs): 9 | return cloudformation_outputs["AthenaSparkExecutionRoleArn"] 10 | 11 | 12 | @pytest.fixture(scope="session") 13 | def workgroup_spark(bucket, kms_key, athena_spark_execution_role_arn): 14 | return create_workgroup( 15 | wkg_name="aws_sdk_pandas_spark", 16 | config={ 17 | "EngineVersion": { 18 | "SelectedEngineVersion": "PySpark engine version 3", 19 | }, 20 | "ExecutionRole": athena_spark_execution_role_arn, 21 | "ResultConfiguration": {"OutputLocation": f"s3://{bucket}/athena_workgroup_spark/"}, 22 | }, 23 | ) 24 | 25 | 26 | @pytest.mark.parametrize( 27 | "code", 28 | [ 29 | "print(spark)", 30 | """ 31 | input_path = "s3://athena-examples-us-east-1/notebooks/yellow_tripdata_2016-01.parquet" 32 | output_path = "$PATH" 33 | 34 | taxi_df = spark.read.format("parquet").load(input_path) 35 | 36 | taxi_passenger_counts = taxi_df.groupBy("VendorID", "passenger_count").count() 37 | taxi_passenger_counts.coalesce(1).write.mode('overwrite').csv(output_path) 38 | """, 39 | ], 40 | ) 41 | def test_athena_spark_calculation(code, path, workgroup_spark): 42 | code = code.replace("$PATH", path) 43 | 44 | result = wr.athena.run_spark_calculation( 45 | code=code, 46 | workgroup=workgroup_spark, 47 | ) 48 | 49 | assert result["Status"]["State"] == "COMPLETED" 50 | 51 | 52 | @pytest.mark.parametrize( 53 | "code", 54 | [ 55 | """ 56 | output_path = "$PATH" 57 | 58 | data = spark.range(0, 5) 59 | data.write.format("delta").save(output_path) 60 | """, 61 | ], 62 | ) 63 | def test_athena_spark_calculation_with_spark_properties(code, path, workgroup_spark): 64 | code = code.replace("$PATH", path) 65 | 66 | result = wr.athena.run_spark_calculation( 67 | code=code, 68 | workgroup=workgroup_spark, 69 | spark_properties={ 70 | "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog", 71 | "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension", 72 | }, 73 | ) 74 | assert result["Status"]["State"] == "COMPLETED" 75 | -------------------------------------------------------------------------------- /tests/unit/test_chime.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import pytest 4 | 5 | import awswrangler as wr 6 | 7 | logging.getLogger("awswrangler").setLevel(logging.DEBUG) 8 | 9 | 10 | def test_chime_bad_input(): 11 | with pytest.raises(ValueError): 12 | result = wr.chime.post_message(message=None, webhook=None) 13 | assert result is None 14 | -------------------------------------------------------------------------------- /tests/unit/test_cleanrooms.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import awswrangler as wr 4 | import awswrangler.pandas as pd 5 | 6 | from .._utils import is_ray_modin 7 | 8 | pytestmark = pytest.mark.distributed 9 | 10 | 11 | @pytest.fixture() 12 | def data(bucket: str, cleanrooms_glue_database_name: str) -> None: 13 | df_purchases = pd.DataFrame( 14 | { 15 | "purchase_id": list(range(100, 109)), 16 | 
"user_id": [1, 2, 3, 1, 2, 3, 4, 5, 6], 17 | "sale_value": [2.2, 1.1, 6.2, 2.3, 7.8, 9.9, 7.3, 9.7, 0.7], 18 | } 19 | ) 20 | wr.s3.to_parquet( 21 | df_purchases, 22 | f"s3://{bucket}/purchases/", 23 | dataset=True, 24 | database=cleanrooms_glue_database_name, 25 | table="purchases", 26 | mode="overwrite", 27 | ) 28 | 29 | df_users = pd.DataFrame( 30 | { 31 | "user_id": list(range(1, 9)), 32 | "city": ["LA", "NYC", "Chicago", "NYC", "NYC", "LA", "Seattle", "Seattle"], 33 | } 34 | ) 35 | wr.s3.to_parquet( 36 | df_users, 37 | f"s3://{bucket}/users/", 38 | dataset=True, 39 | database=cleanrooms_glue_database_name, 40 | table="users", 41 | mode="overwrite", 42 | ) 43 | 44 | df_custom = pd.DataFrame( 45 | { 46 | "a": list(range(1, 9)), 47 | "b": ["A", "A", "B", "C", "C", "C", "D", "E"], 48 | } 49 | ) 50 | wr.s3.to_parquet( 51 | df_custom, 52 | f"s3://{bucket}/custom/", 53 | dataset=True, 54 | database=cleanrooms_glue_database_name, 55 | table="custom", 56 | mode="overwrite", 57 | ) 58 | 59 | 60 | @pytest.mark.xfail( 61 | is_ray_modin, raises=AssertionError, reason="Upgrade from pyarrow 16.1 to 17 causes AssertionError in Modin" 62 | ) 63 | def test_read_sql_query( 64 | data: None, 65 | cleanrooms_membership_id: str, 66 | cleanrooms_analysis_template_arn: str, 67 | bucket: str, 68 | ): 69 | sql = """SELECT city, AVG(p.sale_value) 70 | FROM users u 71 | INNER JOIN purchases p ON u.user_id = p.user_id 72 | GROUP BY city 73 | """ 74 | chunksize = 2 75 | df_chunked = wr.cleanrooms.read_sql_query( 76 | sql=sql, 77 | membership_id=cleanrooms_membership_id, 78 | output_bucket=bucket, 79 | output_prefix="results", 80 | chunksize=chunksize, 81 | keep_files=False, 82 | ) 83 | for df in df_chunked: 84 | assert df.shape == (chunksize, 2) 85 | 86 | sql = """SELECT COUNT(p.purchase_id), SUM(p.sale_value), city 87 | FROM users u 88 | INNER JOIN purchases p ON u.user_id = p.user_id 89 | GROUP BY city 90 | """ 91 | df = wr.cleanrooms.read_sql_query( 92 | sql=sql, 93 | membership_id=cleanrooms_membership_id, 94 | output_bucket=bucket, 95 | output_prefix="results", 96 | keep_files=False, 97 | ) 98 | assert df.shape == (2, 3) 99 | 100 | df = wr.cleanrooms.read_sql_query( 101 | analysis_template_arn=cleanrooms_analysis_template_arn, 102 | params={"param1": "C"}, 103 | membership_id=cleanrooms_membership_id, 104 | output_bucket=bucket, 105 | output_prefix="results", 106 | keep_files=False, 107 | ) 108 | assert df.shape == (3, 1) 109 | -------------------------------------------------------------------------------- /tests/unit/test_distributed.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from importlib import reload 3 | from types import ModuleType 4 | from typing import Iterator 5 | 6 | import pytest 7 | 8 | from .._utils import is_ray_modin 9 | 10 | logging.getLogger("awswrangler").setLevel(logging.DEBUG) 11 | 12 | pytestmark = pytest.mark.distributed 13 | 14 | 15 | @pytest.fixture(scope="function") 16 | def wr() -> Iterator[ModuleType]: 17 | import awswrangler 18 | 19 | awswrangler.engine.__class__._engine = None 20 | awswrangler.engine.__class__._initialized_engine = None 21 | awswrangler.engine.__class__._registry.clear() 22 | 23 | yield reload(awswrangler) 24 | 25 | # Reset for future tests 26 | awswrangler.engine.set(awswrangler.engine.get_installed().value) 27 | awswrangler.memory_format.set(awswrangler.memory_format.get_installed().value) 28 | 29 | 30 | @pytest.mark.skipif(condition=not is_ray_modin, reason="ray not available") 31 | def 
test_engine_initialization(wr: ModuleType, path: str) -> None: 32 | assert wr.engine.is_initialized() 33 | 34 | 35 | @pytest.mark.skipif(condition=not is_ray_modin, reason="ray not available") 36 | def test_engine_python(wr: ModuleType) -> None: 37 | from awswrangler._distributed import EngineEnum 38 | from awswrangler.s3._write_parquet import _to_parquet 39 | 40 | assert wr.engine.get_installed() == EngineEnum.RAY 41 | assert wr.engine.get() == EngineEnum.RAY 42 | 43 | wr.engine.set(EngineEnum.PYTHON.value) 44 | 45 | assert wr.engine.get() == EngineEnum.PYTHON 46 | 47 | assert not wr.engine.dispatch_func(_to_parquet).__name__.endswith("distributed") 48 | 49 | 50 | @pytest.mark.skipif(condition=not is_ray_modin, reason="ray not available") 51 | def test_engine_ray(wr: ModuleType) -> None: 52 | from awswrangler._distributed import EngineEnum 53 | from awswrangler.s3._write_parquet import _to_parquet 54 | 55 | assert wr.engine.get_installed() == EngineEnum.RAY 56 | assert wr.engine.get() == EngineEnum.RAY 57 | 58 | assert wr.engine._registry 59 | assert wr.engine.dispatch_func(_to_parquet).__name__.endswith("distributed") 60 | assert not wr.engine.dispatch_func(_to_parquet, "python").__name__.endswith("distributed") 61 | 62 | 63 | @pytest.mark.skipif(condition=is_ray_modin, reason="ray is installed") 64 | def test_engine_python_without_ray_installed(wr: ModuleType) -> None: 65 | from awswrangler._distributed import EngineEnum 66 | from awswrangler.s3._write_parquet import _to_parquet 67 | 68 | assert wr.engine.get_installed() == EngineEnum.PYTHON 69 | assert wr.engine.get() == EngineEnum.PYTHON 70 | 71 | assert not wr.engine.dispatch_func(_to_parquet).__name__.endswith("distributed") 72 | 73 | 74 | @pytest.mark.skipif(condition=not is_ray_modin, reason="ray not available") 75 | def test_engine_switch(wr: ModuleType) -> None: 76 | from modin.pandas import DataFrame as ModinDataFrame 77 | from pandas import DataFrame as PandasDataFrame 78 | 79 | assert wr.engine.get_installed() == wr.EngineEnum.RAY 80 | assert wr.memory_format.get_installed() == wr.MemoryFormatEnum.MODIN 81 | 82 | assert wr.engine.get() == wr.EngineEnum.RAY 83 | assert wr.memory_format.get() == wr.MemoryFormatEnum.MODIN 84 | assert wr.pandas.DataFrame == ModinDataFrame 85 | 86 | wr.engine.set("python") 87 | wr.memory_format.set("pandas") 88 | 89 | assert wr.engine.get() == wr.EngineEnum.PYTHON 90 | assert wr.memory_format.get() == wr.MemoryFormatEnum.PANDAS 91 | assert wr.pandas.DataFrame == PandasDataFrame 92 | -------------------------------------------------------------------------------- /tests/unit/test_glue.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import pytest 4 | 5 | import awswrangler as wr 6 | import awswrangler.pandas as pd 7 | 8 | logging.getLogger("awswrangler").setLevel(logging.DEBUG) 9 | 10 | pytestmark = pytest.mark.distributed 11 | 12 | 13 | def test_parquet_crawler_columns(path): 14 | df = pd.DataFrame({"c0": [0, 1], "c1": [2, 3]}) 15 | wr.s3.to_parquet(df, path, dataset=True, mode="overwrite") 16 | df = pd.DataFrame({"c1": [2, 3], "c0": [0, 1]}) 17 | wr.s3.to_parquet(df, path, dataset=True, mode="append") 18 | first_schema = wr.s3.read_parquet_metadata(path=path)[0] 19 | for _ in range(10): 20 | schema = wr.s3.read_parquet_metadata(path=path)[0] 21 | assert list(schema.keys()) == list(first_schema.keys()) 22 | -------------------------------------------------------------------------------- /tests/unit/test_metadata.py: 
-------------------------------------------------------------------------------- 1 | import awswrangler as wr 2 | 3 | 4 | def test_metadata(): 5 | assert wr.__version__ == "3.12.0" 6 | assert wr.__title__ == "awswrangler" 7 | assert wr.__description__ == "Pandas on AWS." 8 | assert wr.__license__ == "Apache License 2.0" 9 | -------------------------------------------------------------------------------- /tests/unit/test_s3_excel.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import pytest 4 | 5 | import awswrangler as wr 6 | import awswrangler.pandas as pd 7 | 8 | logging.getLogger("awswrangler").setLevel(logging.DEBUG) 9 | 10 | 11 | @pytest.mark.parametrize("ext", ["xlsx", "xlsm", "xls", "odf"]) 12 | @pytest.mark.parametrize("use_threads", [True, False, 2]) 13 | def test_excel(path, ext, use_threads): 14 | df = pd.DataFrame({"c0": [1, 2, 3], "c1": ["foo", "boo", "bar"]}) 15 | file_path = f"{path}0.{ext}" 16 | pandas_kwargs = {} 17 | 18 | with pytest.raises(wr.exceptions.InvalidArgument): 19 | wr.s3.to_excel(df, file_path, use_threads=use_threads, index=False, pandas_kwargs=pandas_kwargs) 20 | 21 | wr.s3.to_excel(df, file_path, use_threads=use_threads, index=False, **pandas_kwargs) 22 | 23 | with pytest.raises(wr.exceptions.InvalidArgument): 24 | wr.s3.read_excel(file_path, use_threads=use_threads, pandas_kwargs=pandas_kwargs) 25 | 26 | df2 = wr.s3.read_excel(file_path, use_threads=use_threads, **pandas_kwargs) 27 | assert df.equals(df2) 28 | 29 | 30 | def test_read_xlsx_versioned(path) -> None: 31 | path_file = f"{path}0.xlsx" 32 | dfs = [pd.DataFrame({"c0": [0, 1, 2], "c1": [3, 4, 5]}), pd.DataFrame({"c0": [3, 4, 5], "c1": [6, 7, 8]})] 33 | pandas_kwargs = {} 34 | for df in dfs: 35 | wr.s3.to_excel(df=df, path=path_file, index=False, **pandas_kwargs) 36 | version_id = wr.s3.describe_objects(path=path_file)[path_file]["VersionId"] 37 | df_temp = wr.s3.read_excel(path_file, version_id=version_id, **pandas_kwargs) 38 | assert df_temp.equals(df) 39 | assert version_id == wr.s3.describe_objects(path=path_file, version_id=version_id)[path_file]["VersionId"] 40 | -------------------------------------------------------------------------------- /tests/unit/test_s3_wait.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import pytest 4 | 5 | import awswrangler as wr 6 | import awswrangler.pandas as pd 7 | 8 | logging.getLogger("awswrangler").setLevel(logging.DEBUG) 9 | 10 | pytestmark = pytest.mark.distributed 11 | 12 | 13 | @pytest.mark.parametrize("use_threads", [True, False]) 14 | def test_wait_object_exists_single_file(path: str, use_threads: bool) -> None: 15 | df = pd.DataFrame({"FooBoo": [1, 2, 3]}) 16 | file_path = f"{path}data.csv" 17 | 18 | wr.s3.to_csv(df, file_path) 19 | 20 | wr.s3.wait_objects_exist(paths=[file_path], use_threads=use_threads) 21 | 22 | 23 | @pytest.mark.parametrize("use_threads", [True, False]) 24 | def test_wait_object_exists_multiple_files(path: str, use_threads: bool) -> None: 25 | df = pd.DataFrame({"FooBoo": [1, 2, 3]}) 26 | 27 | file_paths = [f"{path}data.csv", f"{path}data2.csv", f"{path}data3.csv"] 28 | for file_path in file_paths: 29 | wr.s3.to_csv(df, file_path) 30 | 31 | wr.s3.wait_objects_exist(paths=file_paths, use_threads=use_threads) 32 | 33 | 34 | @pytest.mark.parametrize("use_threads", [True, False]) 35 | def test_wait_object_not_exists(path: str, use_threads: bool) -> None: 36 | wr.s3.wait_objects_not_exist(paths=[path], 
use_threads=use_threads) 37 | 38 | 39 | @pytest.mark.parametrize("use_threads", [True, False]) 40 | @pytest.mark.timeout(30) 41 | def test_wait_object_timeout(path: str, use_threads: bool) -> None: 42 | with pytest.raises(wr.exceptions.NoFilesFound): 43 | wr.s3.wait_objects_exist( 44 | paths=[path], 45 | use_threads=use_threads, 46 | delay=0.5, 47 | max_attempts=3, 48 | ) 49 | 50 | 51 | @pytest.mark.parametrize("use_threads", [True, False]) 52 | def test_wait_object_exists_empty_list(use_threads: bool) -> None: 53 | wr.s3.wait_objects_exist(paths=[]) 54 | -------------------------------------------------------------------------------- /tests/unit/test_session.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import boto3 5 | 6 | import awswrangler as wr 7 | 8 | logging.getLogger("awswrangler").setLevel(logging.DEBUG) 9 | 10 | 11 | def test_default_session(): 12 | boto3.setup_default_session(region_name="us-east-1") 13 | assert wr._utils.ensure_session().region_name == "us-east-1" 14 | boto3.setup_default_session(region_name="us-east-2") 15 | assert wr._utils.ensure_session().region_name == "us-east-2" 16 | boto3.setup_default_session(region_name="us-west-1") 17 | assert wr._utils.ensure_session().region_name == "us-west-1" 18 | boto3.setup_default_session(region_name=os.environ.get("AWS_DEFAULT_REGION", "us-west-2")) 19 | assert wr._utils.ensure_session().region_name == os.environ.get("AWS_DEFAULT_REGION", "us-west-2") 20 | -------------------------------------------------------------------------------- /tests/unit/test_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import pytest 5 | 6 | from awswrangler._utils import ensure_cpu_count, get_even_chunks_sizes 7 | 8 | logging.getLogger("awswrangler").setLevel(logging.DEBUG) 9 | 10 | 11 | @pytest.mark.parametrize( 12 | "total_size,chunk_size,upper_bound,result", 13 | [ 14 | (10, 4, True, (4, 3, 3)), 15 | (2, 3, True, (2,)), 16 | (1, 1, True, (1,)), 17 | (2, 1, True, (1, 1)), 18 | (11, 4, True, (4, 4, 3)), 19 | (1_001, 500, True, (334, 334, 333)), 20 | (1_002, 500, True, (334, 334, 334)), 21 | (10, 4, False, (5, 5)), 22 | (1, 1, False, (1,)), 23 | (2, 1, False, (1, 1)), 24 | (11, 4, False, (6, 5)), 25 | (1_001, 500, False, (501, 500)), 26 | (1_002, 500, False, (501, 501)), 27 | ], 28 | ) 29 | def test_get_even_chunks_sizes(total_size, chunk_size, upper_bound, result): 30 | assert get_even_chunks_sizes(total_size, chunk_size, upper_bound) == result 31 | 32 | 33 | @pytest.mark.parametrize("use_threads,result", [(True, os.cpu_count()), (False, 1), (-1, 1), (1, 1), (5, 5)]) 34 | def test_ensure_cpu_count(use_threads, result): 35 | assert ensure_cpu_count(use_threads=use_threads) == result 36 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py{39,310,311,312,313} 3 | isolated_build = True 4 | 5 | [testenv] 6 | passenv = 7 | AWS_PROFILE 8 | AWS_DEFAULT_REGION 9 | AWS_ACCESS_KEY_ID 10 | AWS_SECRET_ACCESS_KEY 11 | AWS_SESSION_TOKEN 12 | setenv = 13 | COV_FAIL_UNDER = 87.00 14 | allowlist_externals = 15 | pytest 16 | poetry 17 | commands_pre = 18 | poetry install --no-root --sync --extras "deltalake gremlin mysql opencypher opensearch oracle postgres redshift sparql sqlserver geopandas" 19 | commands = 20 | pytest -n {posargs} -s -v --timeout=300 
--reruns=2 --reruns-delay=15 \ 21 | --cov=awswrangler --cov-report=xml --cov-report term-missing --cov-branch \ 22 | --cov-fail-under={env:COV_FAIL_UNDER} \ 23 | --dist load --maxschedchunk 2 \ 24 | --junitxml=test-reports/junit.xml --log-file=test-reports/logs.txt tests/unit 25 | 26 | [testenv:py{39,310,311,312,313}-distributed] 27 | passenv = 28 | AWS_PROFILE 29 | AWS_DEFAULT_REGION 30 | AWS_ACCESS_KEY_ID 31 | AWS_SECRET_ACCESS_KEY 32 | AWS_SESSION_TOKEN 33 | setenv = 34 | COV_FAIL_UNDER = 74.00 35 | WR_CPU_COUNT = 16 36 | allowlist_externals = poetry 37 | commands_pre = 38 | poetry install --no-root --sync --all-extras 39 | commands = 40 | {[testenv]commands} 41 | -------------------------------------------------------------------------------- /tutorials/020 - Spark Table Interoperability.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "[![AWS SDK for pandas](_static/logo.png \"AWS SDK for pandas\")](https://github.com/aws/aws-sdk-pandas)\n", 8 | "\n", 9 | "# 20 - Spark Table Interoperability\n", 10 | "\n", 11 | "[awswrangler](https://github.com/aws/aws-sdk-pandas) has no difficulty inserting into, overwriting, or otherwise interacting with a table created by Apache Spark.\n", 12 | "\n", 13 | "But if you want to do the opposite (Spark interacting with a table created by awswrangler), be aware that awswrangler follows the Hive format, so you must be explicit when using Spark's `saveAsTable` method:" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "spark_df.write.format(\"hive\").saveAsTable(\"database.table\")" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "Or just move forward using the `insertInto` alternative:" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "spark_df.write.insertInto(\"database.table\")" 39 | ] 40 | } 41 | ], 42 | "metadata": { 43 | "kernelspec": { 44 | "display_name": "Python 3.9.14", 45 | "language": "python", 46 | "name": "python3" 47 | }, 48 | "language_info": { 49 | "codemirror_mode": { 50 | "name": "ipython", 51 | "version": 3 52 | }, 53 | "file_extension": ".py", 54 | "mimetype": "text/x-python", 55 | "name": "python", 56 | "nbconvert_exporter": "python", 57 | "pygments_lexer": "ipython3", 58 | "version": "3.9.14" 59 | }, 60 | "pycharm": { 61 | "stem_cell": { 62 | "cell_type": "raw", 63 | "metadata": { 64 | "collapsed": false 65 | }, 66 | "source": [] 67 | } 68 | } 69 | }, 70 | "nbformat": 4, 71 | "nbformat_minor": 4 72 | } 73 | -------------------------------------------------------------------------------- /tutorials/_static/glue_catalog_table_products.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/tutorials/_static/glue_catalog_table_products.png -------------------------------------------------------------------------------- /tutorials/_static/glue_catalog_version_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/tutorials/_static/glue_catalog_version_0.png
-------------------------------------------------------------------------------- /tutorials/_static/glue_catalog_version_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/tutorials/_static/glue_catalog_version_1.png -------------------------------------------------------------------------------- /tutorials/_static/glue_is_create.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/tutorials/_static/glue_is_create.png -------------------------------------------------------------------------------- /tutorials/_static/glue_is_setup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/tutorials/_static/glue_is_setup.png -------------------------------------------------------------------------------- /tutorials/_static/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/tutorials/_static/logo.png -------------------------------------------------------------------------------- /validate.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | ruff format --check . 5 | ruff check . 6 | mypy --install-types --non-interactive awswrangler 7 | doc8 --ignore-path docs/source/stubs --max-line-length 120 docs/source 8 | poetry check --lock 9 | --------------------------------------------------------------------------------
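Note on the root-level helper scripts (fix.sh, test.sh, validate.sh): together they form the local development loop. A minimal sketch of how they might be chained from the repository root — the py311 environment name and the worker count of 8 are illustrative values rather than project defaults, and AWS credentials are assumed to be available in the environment, since the tox environments pass the AWS_* variables through to the test run:

./fix.sh           # ruff format + ruff check --fix (rewrites files in place)
./validate.sh      # read-only checks: ruff, mypy, doc8 and poetry check --lock
./test.sh py311 8  # tox -e py311 -- 8, i.e. pytest -n 8 against tests/unit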