├── .adr-dir ├── .bumpversion.toml ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.yml │ ├── config.yml │ ├── enhancement-request.md │ ├── feature_request.md │ └── question.md ├── PULL_REQUEST_TEMPLATE.md ├── dependabot.yml └── workflows │ ├── bandit.yml │ ├── cfn-nag.yml │ ├── check-pytest-xfails.yml │ ├── dependabot-prs.yml │ ├── git-hygiene.yml │ ├── minimal-tests.yml │ ├── minimum-response-time.yml │ ├── pr-linter.yml │ ├── snyk.yml │ ├── static-checking.yml │ └── unlabel-assigned-issue.yml ├── .gitignore ├── .readthedocs.yml ├── .snyk ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── CONTRIBUTING_COMMON_ERRORS.md ├── LICENSE.txt ├── NOTICE.txt ├── README.md ├── THIRD_PARTY.txt ├── VERSION ├── adr ├── 0001-record-architecture-decisions.md ├── 0002-handling-unsupported-arguments-in-distributed-mode.md ├── 0003-use-typeddict-to-group-similar-parameters.md ├── 0004-no-alter-iam-permissions.md ├── 0005-move-dependencies-to-optional.md ├── 0006-deprecate-s3-merge-upsert-table.md ├── 0007-design-of-engine-and-memory-format.md ├── 0008-switching-between-pyarrow-and-pandas-based-datasources-for-csv-json-i-o.md └── 0009-lazy-engine-initialization.md ├── awswrangler ├── __init__.py ├── __metadata__.py ├── _arrow.py ├── _config.py ├── _data_types.py ├── _databases.py ├── _distributed.py ├── _executor.py ├── _sql_formatter.py ├── _sql_utils.py ├── _utils.py ├── annotations.py ├── athena │ ├── __init__.py │ ├── _cache.py │ ├── _executions.py │ ├── _executions.pyi │ ├── _read.py │ ├── _read.pyi │ ├── _spark.py │ ├── _statements.py │ ├── _utils.py │ └── _write_iceberg.py ├── catalog │ ├── __init__.py │ ├── _add.py │ ├── _create.py │ ├── _definitions.py │ ├── _delete.py │ ├── _get.py │ └── _utils.py ├── chime.py ├── cleanrooms │ ├── __init__.py │ ├── _read.py │ └── _utils.py ├── cloudwatch.py ├── data_api │ ├── __init__.py │ ├── _connector.py │ ├── rds.py │ └── redshift.py ├── data_quality │ ├── __init__.py │ ├── _create.py │ ├── _get.py │ └── _utils.py ├── distributed │ ├── __init__.py │ └── ray │ │ ├── __init__.py │ │ ├── _core.py │ │ ├── _core.pyi │ │ ├── _executor.py │ │ ├── _register.py │ │ ├── _utils.py │ │ ├── datasources │ │ ├── __init__.py │ │ ├── arrow_csv_datasink.py │ │ ├── arrow_csv_datasource.py │ │ ├── arrow_json_datasource.py │ │ ├── arrow_orc_datasink.py │ │ ├── arrow_orc_datasource.py │ │ ├── arrow_parquet_base_datasource.py │ │ ├── arrow_parquet_datasink.py │ │ ├── arrow_parquet_datasource.py │ │ ├── file_datasink.py │ │ ├── filename_provider.py │ │ ├── pandas_text_datasink.py │ │ └── pandas_text_datasource.py │ │ ├── modin │ │ ├── __init__.py │ │ ├── _core.py │ │ ├── _data_types.py │ │ ├── _utils.py │ │ └── s3 │ │ │ ├── __init__.py │ │ │ ├── _read_orc.py │ │ │ ├── _read_parquet.py │ │ │ ├── _read_text.py │ │ │ ├── _write_dataset.py │ │ │ ├── _write_orc.py │ │ │ ├── _write_parquet.py │ │ │ └── _write_text.py │ │ └── s3 │ │ ├── __init__.py │ │ ├── _list.py │ │ ├── _read_orc.py │ │ └── _read_parquet.py ├── dynamodb │ ├── __init__.py │ ├── _delete.py │ ├── _read.py │ ├── _read.pyi │ ├── _utils.py │ └── _write.py ├── emr.py ├── emr_serverless.py ├── exceptions.py ├── mysql.py ├── neptune │ ├── __init__.py │ ├── _client.py │ ├── _gremlin_init.py │ ├── _gremlin_parser.py │ ├── _neptune.py │ └── _utils.py ├── opensearch │ ├── __init__.py │ ├── _read.py │ ├── _utils.py │ └── _write.py ├── oracle.py ├── pandas │ └── __init__.py ├── postgresql.py ├── py.typed ├── quicksight │ ├── __init__.py │ ├── _cancel.py │ ├── _create.py │ ├── _delete.py │ ├── _describe.py │ ├── _get_list.py │ └── _utils.py 
├── redshift │ ├── __init__.py │ ├── _connect.py │ ├── _read.py │ ├── _read.pyi │ ├── _utils.py │ └── _write.py ├── s3 │ ├── __init__.py │ ├── _copy.py │ ├── _delete.py │ ├── _describe.py │ ├── _download.py │ ├── _fs.py │ ├── _list.py │ ├── _list.pyi │ ├── _read.py │ ├── _read_deltalake.py │ ├── _read_excel.py │ ├── _read_orc.py │ ├── _read_parquet.py │ ├── _read_parquet.pyi │ ├── _read_text.py │ ├── _read_text.pyi │ ├── _read_text_core.py │ ├── _select.py │ ├── _upload.py │ ├── _wait.py │ ├── _write.py │ ├── _write_concurrent.py │ ├── _write_dataset.py │ ├── _write_deltalake.py │ ├── _write_excel.py │ ├── _write_orc.py │ ├── _write_parquet.py │ └── _write_text.py ├── secretsmanager.py ├── sqlserver.py ├── sts.py ├── timestream │ ├── __init__.py │ ├── _create.py │ ├── _delete.py │ ├── _list.py │ ├── _read.py │ ├── _read.pyi │ └── _write.py └── typing.py ├── building ├── build-docs.sh ├── build-lambda-layers.sh ├── build-wheel.sh ├── lambda │ ├── Dockerfile │ ├── Dockerfile.al2023 │ ├── build-docker-images.sh │ └── build-lambda-layer.sh ├── publish.sh └── update-glue-lib.sh ├── docs ├── Makefile ├── environment.yml └── source │ ├── _ext │ ├── copy_adr.py │ └── copy_tutorials.py │ ├── _static │ ├── aws_lambda_managed_layer.png │ ├── css │ │ └── max_width.css │ ├── favicon.ico │ ├── logo.png │ ├── logo2.png │ ├── logo_transparent.png │ ├── logo_transparent_small.png │ └── ssm_public_parameters.png │ ├── _templates │ ├── globaltoc.html │ └── typed-dict-template.rst │ ├── about.rst │ ├── adr.rst │ ├── adr │ └── .gitignore │ ├── api.rst │ ├── conf.py │ ├── index.rst │ ├── install.rst │ ├── layers.rst │ ├── scale.rst │ ├── tutorials.rst │ └── tutorials │ └── .gitignore ├── fix.sh ├── poetry.lock ├── pyproject.toml ├── test.sh ├── test_infra ├── app.py ├── cdk.json ├── poetry.lock ├── pyproject.toml ├── scripts │ ├── delete-stack.sh │ ├── deploy-stack.sh │ ├── security-group-databases-add-local-ip.sh │ └── security-group-databases-check.sh ├── source.bat └── stacks │ ├── __init__.py │ ├── base_stack.py │ ├── cleanrooms_stack.py │ ├── databases_stack.py │ ├── glueray_stack.py │ └── opensearch_stack.py ├── tests ├── __init__.py ├── _utils.py ├── benchmark │ ├── __init__.py │ └── test_glueray.py ├── conftest.py ├── glue_scripts │ ├── ray_read_small_parquet.py │ ├── wrangler_blog_simple.py │ ├── wrangler_read_small_parquet.py │ └── wrangler_write_partitioned_parquet.py ├── load │ ├── __init__.py │ ├── conftest.py │ ├── test_databases.py │ ├── test_dynamodb.py │ ├── test_s3.py │ └── test_s3_modin.py └── unit │ ├── __init__.py │ ├── test_athena.py │ ├── test_athena_cache.py │ ├── test_athena_csv.py │ ├── test_athena_geospatial.py │ ├── test_athena_iceberg.py │ ├── test_athena_parquet.py │ ├── test_athena_prepared.py │ ├── test_athena_projection.py │ ├── test_athena_spark.py │ ├── test_catalog.py │ ├── test_chime.py │ ├── test_cleanrooms.py │ ├── test_cloudwatch.py │ ├── test_config.py │ ├── test_data_api.py │ ├── test_data_quality.py │ ├── test_distributed.py │ ├── test_dynamodb.py │ ├── test_emr.py │ ├── test_emr_serverless.py │ ├── test_fs.py │ ├── test_glue.py │ ├── test_metadata.py │ ├── test_moto.py │ ├── test_mysql.py │ ├── test_neptune.py │ ├── test_neptune_parsing.py │ ├── test_opensearch.py │ ├── test_oracle.py │ ├── test_pandas_pyarrow_dtype_backend.py │ ├── test_postgresql.py │ ├── test_quicksight.py │ ├── test_redshift.py │ ├── test_routines.py │ ├── test_s3.py │ ├── test_s3_deltalake.py │ ├── test_s3_excel.py │ ├── test_s3_orc.py │ ├── test_s3_parquet.py │ ├── test_s3_select.py │ ├── 
test_s3_text.py │ ├── test_s3_text_compressed.py │ ├── test_s3_wait.py │ ├── test_session.py │ ├── test_sql_params_formatter.py │ ├── test_sqlserver.py │ ├── test_timestream.py │ └── test_utils.py ├── tox.ini ├── tutorials ├── 001 - Introduction.ipynb ├── 002 - Sessions.ipynb ├── 003 - Amazon S3.ipynb ├── 004 - Parquet Datasets.ipynb ├── 005 - Glue Catalog.ipynb ├── 006 - Amazon Athena.ipynb ├── 007 - Redshift, MySQL, PostgreSQL, SQL Server, Oracle.ipynb ├── 008 - Redshift - Copy & Unload.ipynb ├── 009 - Redshift - Append, Overwrite, Upsert.ipynb ├── 010 - Parquet Crawler.ipynb ├── 011 - CSV Datasets.ipynb ├── 012 - CSV Crawler.ipynb ├── 013 - Merging Datasets on S3.ipynb ├── 014 - Schema Evolution.ipynb ├── 015 - EMR.ipynb ├── 016 - EMR & Docker.ipynb ├── 017 - Partition Projection.ipynb ├── 018 - QuickSight.ipynb ├── 019 - Athena Cache.ipynb ├── 020 - Spark Table Interoperability.ipynb ├── 021 - Global Configurations.ipynb ├── 022 - Writing Partitions Concurrently.ipynb ├── 023 - Flexible Partitions Filter.ipynb ├── 024 - Athena Query Metadata.ipynb ├── 025 - Redshift - Loading Parquet files with Spectrum.ipynb ├── 026 - Amazon Timestream.ipynb ├── 027 - Amazon Timestream 2.ipynb ├── 028 - DynamoDB.ipynb ├── 029 - S3 Select.ipynb ├── 030 - Data Api.ipynb ├── 031 - OpenSearch.ipynb ├── 033 - Amazon Neptune.ipynb ├── 034 - Distributing Calls using Ray.ipynb ├── 035 - Distributing Calls on Ray Remote Cluster.ipynb ├── 036 - Distributing Calls with Glue Interactive Sessions on Ray.ipynb ├── 037 - Glue Data Quality.ipynb ├── 038 - OpenSearch Serverless.ipynb ├── 039 - Athena Iceberg.ipynb ├── 040 - EMR Serverless.ipynb ├── 041 - Apache Spark on Amazon Athena.ipynb └── _static │ ├── glue_catalog_table_products.png │ ├── glue_catalog_version_0.png │ ├── glue_catalog_version_1.png │ ├── glue_is_create.png │ ├── glue_is_setup.png │ └── logo.png └── validate.sh /.adr-dir: -------------------------------------------------------------------------------- 1 | adr 2 | -------------------------------------------------------------------------------- /.bumpversion.toml: -------------------------------------------------------------------------------- 1 | [tool.bumpversion] 2 | current_version = "3.12.0" 3 | commit = false 4 | tag = false 5 | tag_name = "{new_version}" 6 | parse = "(?P<major>\\d+)\\.(?P<minor>\\d+)\\.(?P<patch>\\d+)((?P<release>[a-z]+)(?P<build>\\d+))?"
7 | serialize = [ 8 | "{major}.{minor}.{patch}{release}{build}", 9 | "{major}.{minor}.{patch}" 10 | ] 11 | 12 | [tool.bumpversion.parts.release] 13 | optional_value = "rc" 14 | values = [ 15 | "a", 16 | "b", 17 | "rc" 18 | ] 19 | 20 | [tool.bumpversion.parts.build] 21 | first_value = 1 22 | 23 | [[tool.bumpversion.files]] 24 | filename = "VERSION" 25 | 26 | [[tool.bumpversion.files]] 27 | filename = "pyproject.toml" 28 | search = "version = \"{current_version}\"" 29 | replace = "version = \"{new_version}\"" 30 | 31 | [[tool.bumpversion.files]] 32 | filename = "test_infra/pyproject.toml" 33 | search = "version = \"{current_version}\"" 34 | replace = "version = \"{new_version}\"" 35 | 36 | [[tool.bumpversion.files]] 37 | filename = "README.md" 38 | search = "https://aws-sdk-pandas.readthedocs.io/en/{current_version}/" 39 | replace = "https://aws-sdk-pandas.readthedocs.io/en/{new_version}/" 40 | 41 | [[tool.bumpversion.files]] 42 | filename = "docs/source/install.rst" 43 | search = "awswrangler=={current_version}" 44 | replace = "awswrangler=={new_version}" 45 | 46 | [[tool.bumpversion.files]] 47 | filename = "awswrangler/__metadata__.py" 48 | 49 | [[tool.bumpversion.files]] 50 | filename = "tests/unit/test_metadata.py" 51 | search = "assert wr.__version__ == \"{current_version}\"" 52 | replace = "assert wr.__version__ == \"{new_version}\"" 53 | 54 | [[tool.bumpversion.files]] 55 | glob = "awswrangler/**/*.py" 56 | search = "https://aws-sdk-pandas.readthedocs.io/en/{current_version}/" 57 | replace = "https://aws-sdk-pandas.readthedocs.io/en/{new_version}/" 58 | ignore_missing_version = true 59 | 60 | [[tool.bumpversion.files]] 61 | glob = "tutorials/*.ipynb" 62 | search = "https://aws-sdk-pandas.readthedocs.io/en/{current_version}/" 63 | replace = "https://aws-sdk-pandas.readthedocs.io/en/{new_version}/" 64 | ignore_missing_version = true 65 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.yml: -------------------------------------------------------------------------------- 1 | name: Bug report 2 | description: Create a report to help us improve. 3 | labels: "bug" 4 | 5 | body: 6 | - type: textarea 7 | attributes: 8 | label: Describe the bug 9 | description: >- 10 | A clear description of what the bug is. Include a stack trace if present. 11 | validations: 12 | required: true 13 | 14 | - type: textarea 15 | attributes: 16 | label: How to Reproduce 17 | description: Steps to reproduce the behavior. 18 | value: | 19 | ``` 20 | *P.S. Please do not attach files as it's considered a security risk. Add code snippets directly in the message body as much as possible.* 21 | ``` 22 | validations: 23 | required: true 24 | 25 | - type: textarea 26 | attributes: 27 | label: Expected behavior 28 | description: >- 29 | A clear and concise description of what you expected to happen. 30 | 31 | - type: input 32 | attributes: 33 | label: Your project 34 | description: >- 35 | Link to your project. 36 | validations: 37 | required: false 38 | 39 | - type: textarea 40 | attributes: 41 | label: Screenshots 42 | description: >- 43 | If applicable, add screenshots to help explain your problem. 44 | validations: 45 | required: false 46 | 47 | - type: input 48 | attributes: 49 | label: OS 50 | description: >- 51 | [e.g. 
Unix/Linux/Mac/Win/other with version] 52 | validations: 53 | required: true 54 | - type: input 55 | attributes: 56 | label: Python version 57 | validations: 58 | required: true 59 | - type: input 60 | attributes: 61 | label: AWS SDK for pandas version 62 | validations: 63 | required: true 64 | - type: textarea 65 | attributes: 66 | label: Additional context 67 | description: >- 68 | Add any other context about the problem here. 69 | [e.g. URL or Ticket] -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | contact_links: 2 | - name: Discussion 3 | url: https://join.slack.com/t/aws-sdk-pandas/shared_invite/zt-sxdx38sl-E0coRfAds8WdpxXD2Nzfrg 4 | about: Public Slack channel for the AWS SDK for pandas community. -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/enhancement-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Enhancement request 3 | about: Suggest an idea to enhance some existing feature 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your idea related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you are expecting. 15 | 16 | *P.S. Please do not attach files as it's considered a security risk. Add code snippets directly in the message body as much as possible.* 17 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: feature 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you are expecting. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | 22 | *P.S. Please do not attach files as it's considered a security risk. Add code snippets directly in the message body as much as possible.* -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Question 3 | about: Ask with as many useful details as possible 4 | title: '' 5 | labels: question 6 | assignees: '' 7 | 8 | --- 9 | 10 | *P.S. Please do not attach files as it's considered a security risk. 
Add code snippets directly in the message body as much as possible.* 11 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ### Feature or Bugfix 2 | 3 | - Feature 4 | - Bugfix 5 | - Refactoring 6 | 7 | ### Detail 8 | - 9 | - 10 | 11 | ### Relates 12 | - 13 | 14 | By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license. 15 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "pip" 4 | directory: "/" 5 | schedule: 6 | interval: "weekly" 7 | time: "09:00" 8 | timezone: "Europe/London" 9 | groups: 10 | production-dependencies: 11 | dependency-type: "production" 12 | development-dependencies: 13 | dependency-type: "development" 14 | 15 | - package-ecosystem: "github-actions" 16 | directory: "/" 17 | schedule: 18 | interval: "weekly" 19 | time: "09:00" 20 | timezone: "Europe/London" 21 | groups: 22 | github-actions: 23 | patterns: 24 | - "*" 25 | -------------------------------------------------------------------------------- /.github/workflows/bandit.yml: -------------------------------------------------------------------------------- 1 | name: Bandit 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: 7 | - main 8 | pull_request: 9 | branches: 10 | - main 11 | 12 | permissions: 13 | contents: read 14 | 15 | jobs: 16 | build: 17 | runs-on: ubuntu-latest 18 | steps: 19 | - uses: actions/checkout@v4 20 | - name: Set up Python 21 | uses: actions/setup-python@v5 22 | with: 23 | python-version: 3.9 24 | - name: Install 25 | run: | 26 | python -m pip install --upgrade pip 27 | python -m pip install bandit 28 | - name: Bandit 29 | run: bandit -r -lll -ii .
30 | -------------------------------------------------------------------------------- /.github/workflows/cfn-nag.yml: -------------------------------------------------------------------------------- 1 | name: CFN Nag 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | paths: 7 | - "test_infra/**" 8 | branches: 9 | - main 10 | pull_request: 11 | paths: 12 | - "test_infra/**" 13 | branches: 14 | - main 15 | 16 | permissions: 17 | contents: read 18 | 19 | env: 20 | CDK_DEFAULT_ACCOUNT: 111111111111 21 | CDK_DEFAULT_REGION: us-east-1 22 | 23 | jobs: 24 | build: 25 | runs-on: ubuntu-latest 26 | steps: 27 | - uses: actions/checkout@v4 28 | - name: Use Node.js 29 | uses: actions/setup-node@v4 30 | with: 31 | node-version: 16 32 | - name: Cache Node.js modules 33 | uses: actions/cache@v4 34 | with: 35 | path: ~/.npm 36 | key: ${{ runner.OS }}-node-${{ hashFiles('**/package-lock.json') }} 37 | restore-keys: | 38 | ${{ runner.OS }}-node- 39 | ${{ runner.OS }}- 40 | - name: Install CDK 41 | run: | 42 | npm install -g aws-cdk 43 | cdk --version 44 | - uses: actions/checkout@v4 45 | - name: Set up Python 46 | uses: actions/setup-python@v5 47 | with: 48 | python-version: 3.11 49 | - name: Install Requirements 50 | run: | 51 | cd test_infra 52 | python -m pip install --upgrade pip 53 | python -m pip install poetry 54 | poetry env use python 55 | poetry env info 56 | source $(poetry env info --path)/bin/activate 57 | poetry install -vvv --no-root 58 | - name: Set up cdk.json 59 | run: | 60 | cd test_infra 61 | cat <<EOT >> cdk.context.json 62 | { 63 | "availability-zones:account=111111111111:region=us-east-1": [ 64 | "us-east-1a", 65 | "us-east-1b", 66 | "us-east-1c", 67 | "us-east-1d", 68 | "us-east-1e", 69 | "us-east-1f" 70 | ] 71 | } 72 | EOT 73 | cat cdk.json | jq -r '.context.databases.neptune = true' | jq -r '.context.databases.oracle = true' | jq -r '.context.databases.sqlserver = true' > overwrite.cdk.json 74 | rm cdk.json && mv overwrite.cdk.json cdk.json 75 | - name: CDK Synth 76 | run: | 77 | cd test_infra 78 | source $(poetry env info --path)/bin/activate 79 | cdk synth 80 | - uses: stelligent/cfn_nag@master 81 | with: 82 | input_path: test_infra/cdk.out 83 | extra_args: --ignore-fatal 84 | -------------------------------------------------------------------------------- /.github/workflows/check-pytest-xfails.yml: -------------------------------------------------------------------------------- 1 | name: Check Tests for Unspecific XFails 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | permissions: 12 | contents: read 13 | 14 | jobs: 15 | Check: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v4 19 | - name: check xfails 20 | run: if grep -ro "@pytest.mark.xfail()" tests/; then echo "xfails must catch a specific error, e.g.
'@pytest.mark.xfail(raises=NotImplementedError)'" && exit 1; else echo "success" && exit 0; fi -------------------------------------------------------------------------------- /.github/workflows/dependabot-prs.yml: -------------------------------------------------------------------------------- 1 | name: Dependabot Pull Request Metadata 2 | on: pull_request_target 3 | jobs: 4 | build: 5 | permissions: 6 | pull-requests: read 7 | runs-on: ubuntu-latest 8 | if: ${{ github.event.pull_request.user.login == 'dependabot[bot]' }} 9 | steps: 10 | - name: Fetch Dependabot metadata 11 | id: dependabot-metadata 12 | uses: dependabot/fetch-metadata@v2 13 | with: 14 | alert-lookup: true 15 | compat-lookup: true 16 | github-token: ${{ secrets.GITHUB_TOKEN }} 17 | - name: Add a label for all PRs with an alert state 18 | if: ${{ steps.dependabot-metadata.outputs.alert-state != '' }} 19 | run: gh pr edit "$PR_URL" --add-label "vulnerability" 20 | env: 21 | PR_URL: ${{github.event.pull_request.html_url}} 22 | GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} -------------------------------------------------------------------------------- /.github/workflows/git-hygiene.yml: -------------------------------------------------------------------------------- 1 | name: "Close Stale Issues" 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | - cron: "0 */3 * * *" 7 | 8 | jobs: 9 | cleanup: 10 | permissions: 11 | issues: write 12 | pull-requests: write 13 | runs-on: ubuntu-latest 14 | name: Stale issue job 15 | steps: 16 | - uses: actions/stale@v9 17 | with: 18 | repo-token: ${{ secrets.GITHUB_TOKEN }} 19 | days-before-stale: 60 20 | days-before-close: 7 21 | exempt-issue-labels: 'needs-triage,help wanted,backlog' 22 | exempt-pr-labels: 'needs-triage' 23 | stale-issue-label: 'closing-soon' 24 | operations-per-run: 100 25 | enable-statistics: true 26 | stale-issue-message: | 27 | Marking this issue as stale due to inactivity. This helps our maintainers find and focus on the active issues. If this issue receives no comments in the next 7 days it will automatically be closed. 28 | stale-pr-label: 'closing-soon' 29 | stale-pr-message: | 30 | Marking this pull request as stale due to inactivity. This helps our maintainers find and focus on the active pull requests. 
31 | debug-only: false 32 | ascending: true 33 | exempt-all-milestones: true 34 | exempt-all-assignees: true 35 | -------------------------------------------------------------------------------- /.github/workflows/minimal-tests.yml: -------------------------------------------------------------------------------- 1 | name: Minimal Tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | permissions: 12 | contents: read 13 | 14 | jobs: 15 | Check: 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] 21 | platform: [ubuntu-latest, macos-latest, windows-latest] 22 | exclude: 23 | - python-version: 3.13 24 | platform: windows-latest 25 | 26 | env: 27 | AWS_DEFAULT_REGION: us-east-1 28 | 29 | runs-on: ${{ matrix.platform }} 30 | 31 | steps: 32 | - uses: actions/checkout@v4 33 | - name: Set up Python ${{ matrix.python-version }} 34 | uses: actions/setup-python@v5 35 | with: 36 | python-version: ${{ matrix.python-version }} 37 | - name: Upgrade Pip 38 | run: python -m pip install --upgrade pip 39 | - name: Install Poetry 40 | run: python -m pip install poetry 41 | - name: Install Requirements 42 | run: | 43 | poetry config virtualenvs.in-project true 44 | poetry config virtualenvs.path .venv 45 | poetry install -vvv 46 | - name: Test Metadata 47 | run: poetry run pytest tests/unit/test_metadata.py 48 | - name: Test Session 49 | run: poetry run pytest tests/unit/test_session.py 50 | - name: Test Utils 51 | run: poetry run pytest tests/unit/test_utils.py 52 | - name: Test Moto 53 | run: poetry run pytest -n 4 tests/unit/test_moto.py 54 | -------------------------------------------------------------------------------- /.github/workflows/minimum-response-time.yml: -------------------------------------------------------------------------------- 1 | name: Issue Minimum Response Time 2 | on: 3 | workflow_dispatch: 4 | schedule: 5 | - cron: "0 */3 * * *" 6 | jobs: 7 | evaluate: 8 | runs-on: ubuntu-latest 9 | permissions: 10 | issues: write 11 | steps: 12 | - name: Issue Minimum Response ⏰ 13 | uses: malachi-constant/issue-minimum-response@latest 14 | with: 15 | exempt_user_list: "github-actions[bot]" 16 | exempt_labels: "help wanted" 17 | exempt_authors: "malachi-constant,jaidisido,kukushking,LeonLuttenberger,cnfait,dependabot[bot]" 18 | token: ${{secrets.GITHUB_TOKEN}} 19 | label: needs-triage 20 | -------------------------------------------------------------------------------- /.github/workflows/pr-linter.yml: -------------------------------------------------------------------------------- 1 | name: Check PR title 2 | 3 | on: 4 | pull_request_target: 5 | types: 6 | - opened 7 | - reopened 8 | - edited 9 | - synchronize 10 | 11 | jobs: 12 | lint: 13 | runs-on: ubuntu-latest 14 | permissions: 15 | statuses: write 16 | steps: 17 | - uses: aslafy-z/conventional-pr-title-action@v3 18 | env: 19 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} -------------------------------------------------------------------------------- /.github/workflows/snyk.yml: -------------------------------------------------------------------------------- 1 | name: Snyk 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | schedule: 7 | - cron: "0 9 * * 1" # runs each Monday at 9:00 UTC 8 | 9 | permissions: 10 | contents: read 11 | security-events: write 12 | 13 | jobs: 14 | security: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v4 18 | - name: Run Snyk to check for vulnerabilities 19 | uses: 
snyk/actions/python-3.8@master 20 | continue-on-error: true # To make sure that SARIF upload gets called 21 | env: 22 | SNYK_TOKEN: ${{ secrets.SNYK_TOKEN }} 23 | with: 24 | args: --severity-threshold=high --sarif-file-output=snyk.sarif 25 | - name: Upload result to GitHub Code Scanning 26 | uses: github/codeql-action/upload-sarif@v3 27 | with: 28 | sarif_file: snyk.sarif 29 | -------------------------------------------------------------------------------- /.github/workflows/static-checking.yml: -------------------------------------------------------------------------------- 1 | name: Static Checking 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: 7 | - main 8 | pull_request: 9 | branches: 10 | - main 11 | 12 | permissions: 13 | contents: read 14 | 15 | jobs: 16 | Check: 17 | 18 | runs-on: ubuntu-latest 19 | strategy: 20 | matrix: 21 | python-version: [3.9] 22 | 23 | steps: 24 | - uses: actions/checkout@v4 25 | - name: Set up Python ${{ matrix.python-version }} 26 | uses: actions/setup-python@v5 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | - name: Install Requirements 30 | run: | 31 | python -m pip install --upgrade pip 32 | python -m pip install poetry 33 | poetry config virtualenvs.create false --local 34 | poetry install --all-extras -vvv 35 | - name: ruff format check 36 | run: ruff format --check . 37 | - name: ruff check 38 | run: ruff check --output-format=github . 39 | - name: mypy check 40 | run: mypy --install-types --non-interactive awswrangler 41 | - name: Documentation check 42 | run: doc8 --max-line-length 120 docs/source 43 | - name: Check poetry.lock consistency with pyproject.toml 44 | run: poetry check --lock 45 | -------------------------------------------------------------------------------- /.github/workflows/unlabel-assigned-issue.yml: -------------------------------------------------------------------------------- 1 | name: Unlabel Assigned Issues 2 | on: 3 | issues: 4 | types: 5 | - assigned 6 | permissions: 7 | contents: read 8 | 9 | jobs: 10 | unlabel-issue: 11 | permissions: 12 | issues: write # for andymckay/labeler to label issues 13 | pull-requests: write # for andymckay/labeler to label PRs 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: unlabel-issues 17 | uses: andymckay/labeler@master 18 | with: 19 | remove-labels: "needs-triage" 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | *__pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # poetry 30 | poetry.toml 31 | envs.toml 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | .hypothesis/ 54 | *.pytest_cache/ 55 | test-reports/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # celery beat schedule file 90 | celerybeat-schedule 91 | 92 | # SageMath parsed files 93 | *.sage.py 94 | 95 | # Environments 96 | .env 97 | *venv*/ 98 | *env/ 99 | venv*/ 100 | ENV/ 101 | env.bak/ 102 | venv.bak/ 103 | 104 | # Spyder project settings 105 | .spyderproject 106 | .spyproject 107 | 108 | 109 | # Pycharm project settings 110 | .idea/ 111 | 112 | # Visual Studio Code project settings 113 | .vscode/ 114 | 115 | # Rope project settings 116 | .ropeproject 117 | 118 | # mkdocs documentation 119 | /site 120 | 121 | # mypy 122 | .mypy_cache/ 123 | .dmypy.json 124 | dmypy.json 125 | 126 | # Pyre type checker 127 | .pyre/ 128 | 129 | # MacOS 130 | .DS_Store 131 | 132 | # Files generated by AWS Cloudformation package 133 | output/ 134 | 135 | # Development 136 | /dev/ 137 | metrics/ 138 | python/ 139 | notes.txt 140 | 141 | # SAM 142 | .aws-sam 143 | coverage/* 144 | building/*requirements*.txt 145 | building/arrow 146 | building/lambda/arrow 147 | /docs/coverage/ 148 | /docs/build/ 149 | /docs/source/_build/ 150 | /docs/source/stubs/ 151 | 152 | # Swap 153 | *.swp 154 | 155 | # CDK 156 | node_modules 157 | *package.json 158 | *package-lock.json 159 | *.cdk.staging 160 | *cdk.out 161 | *cdk.context.json 162 | 163 | # ruff 164 | .ruff_cache/ -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | formats: all 3 | 4 | build: 5 | os: ubuntu-22.04 6 | tools: 7 | python: miniconda3-4.7 8 | 9 | conda: 10 | environment: docs/environment.yml 11 | 12 | sphinx: 13 | configuration: docs/source/conf.py 14 | -------------------------------------------------------------------------------- /.snyk: -------------------------------------------------------------------------------- 1 | ignore: 2 | 'SNYK-PYTHON-RDFLIB-1324490': 3 | - '* > rdflib': 4 | reason: 'No fix available' 5 | expires: '2023-06-01T00:00:00.000Z' -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | AWS SDK for pandas 2 | Copyright 2019 Amazon.com, Inc. or its affiliates. 
All Rights Reserved. 3 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 3.12.0 -------------------------------------------------------------------------------- /adr/0001-record-architecture-decisions.md: -------------------------------------------------------------------------------- 1 | # 1. Record architecture decisions 2 | 3 | Date: 2023-03-08 4 | 5 | ## Status 6 | 7 | Accepted 8 | 9 | ## Context 10 | 11 | We need to record the architectural decisions made on this project. 12 | 13 | ## Decision 14 | 15 | We will use Architecture Decision Records, as [described by Michael Nygard](http://thinkrelevance.com/blog/2011/11/15/documenting-architecture-decisions). 16 | 17 | ## Consequences 18 | 19 | See Michael Nygard's article, linked above. For a lightweight ADR toolset, see Nat Pryce's [adr-tools](https://github.com/npryce/adr-tools). 20 | -------------------------------------------------------------------------------- /adr/0002-handling-unsupported-arguments-in-distributed-mode.md: -------------------------------------------------------------------------------- 1 | # 2. Handling unsupported arguments in distributed mode 2 | 3 | Date: 2023-03-09 4 | 5 | ## Status 6 | 7 | Accepted 8 | 9 | ## Context 10 | 11 | Many of the API functions allow the user to pass their own `boto3` session, which will then be used by all the underlying `boto3` calls. With distributed computing, one of the limitations we have is that we cannot pass the `boto3` session to the worker nodes. 12 | 13 | Boto3 sessions are not thread-safe, and therefore cannot be passed to Ray workers. The credentials behind a `boto3` session cannot be sent to Ray workers either, since sending credentials over the network is considered a security risk. 14 | 15 | This raises the question of what to do when, in distributed mode, the customer passes arguments that are normally supported but aren’t supported in distributed mode. 16 | 17 | ## Decision 18 | 19 | When a user passes arguments that are unsupported in distributed mode, the function should fail immediately. 20 | 21 | The main alternative to this approach would be to use a parameter such as a `boto3` session where possible whenever it is passed. This could result in a situation where, when reading Parquet files from S3, the process of listing the files uses the `boto3` session whereas the reading of the Parquet files doesn’t. This could result in inconsistent behavior, as part of the function uses the extra parameters while the other part of it doesn’t. 22 | 23 | Another alternative would simply be to ignore the unsupported parameters, while potentially outputting a warning. The main issue with this approach is that if a customer tells our API functions to use certain parameters, they expect those parameters to be used. By ignoring them, the AWS SDK for pandas API would be doing something different from what the customer asked, without properly notifying them, and would thus lose the customer’s trust. 24 | 25 | ## Consequences 26 | 27 | In [PR#2051](https://github.com/aws/aws-sdk-pandas/pull/2051), the `validate_distributed_kwargs` annotation was introduced, which checks for the presence of arguments that are unsupported in distributed mode. 28 | 29 | The annotation has also been applied for arguments such as `s3_additional_kwargs` and `version_id` when reading/writing data on S3.
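
As a rough illustration of the fail-fast behaviour described above, a guard decorator could look like the sketch below. This is not the library's actual implementation; the `_is_distributed` helper, the decorator name and the argument list are assumptions made for the example.

```python
from __future__ import annotations

import functools
from typing import Any, Callable


def _is_distributed() -> bool:
    """Hypothetical stand-in for the engine check awswrangler performs internally."""
    return False


def validate_distributed_kwargs_sketch(
    unsupported: tuple[str, ...],
) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
    """Raise immediately if unsupported arguments are passed while running distributed."""

    def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
        @functools.wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            if _is_distributed():
                passed = [name for name in unsupported if kwargs.get(name) is not None]
                if passed:
                    raise ValueError(f"`{func.__name__}` does not support {passed} in distributed mode.")
            return func(*args, **kwargs)

        return wrapper

    return decorator


@validate_distributed_kwargs_sketch(unsupported=("boto3_session", "s3_additional_kwargs", "version_id"))
def read_parquet_example(path: str, **kwargs: Any) -> None:
    """Placeholder function used only to show where the guard sits."""
```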
30 | 31 | -------------------------------------------------------------------------------- /adr/0003-use-typeddict-to-group-similar-parameters.md: -------------------------------------------------------------------------------- 1 | # 3. Use TypedDict to group similar parameters 2 | 3 | Date: 2023-03-10 4 | 5 | ## Status 6 | 7 | Accepted 8 | 9 | ## Context 10 | 11 | *AWS SDK for pandas* API methods contain many parameters which are related to a specific behaviour or setting. For example, methods which have an option to update the AWS Glue Catalog, such as `to_csv` and `to_parquet`, contain a list of parameters that define the settings for the table in AWS Glue. These settings include the table description, column comments, the table type, etc. 12 | 13 | As a consequence, some of our functions have grown to include dozens of parameters. When reading the function signatures, it can be unclear which parameters are related to which functionality. For example, it's not immediately obvious that the parameter `column_comments` in `s3.to_parquet` only writes the column comments into the AWS Glue catalog, and not to S3. 14 | 15 | ## Decision 16 | 17 | Parameters that are related to similar functionality will be replaced by a single parameter of type [TypedDict](https://peps.python.org/pep-0589/). This will allow us to reduce the number of parameters for our API functions, and also make it clearer that certain parameters are only related to specific functionalities. 18 | 19 | For example, parameters related to Athena cache settings will be extracted into a parameter of type `AthenaCacheSettings`, parameters related to Ray settings will be extracted into `RayReadParquetSettings`, etc. 20 | 21 | The usage of `TypedDict` allows the user to define the parameters as regular dictionaries with string keys, while empowering type checkers such as `mypy`. Alternatively, implementations such as `AthenaCacheSettings` can be instantiated as classes. 22 | 23 | ### Alternatives 24 | 25 | The main alternative that was considered was the idea of using `dataclass` instead of `TypedDict`. The advantage of this alternative would be that default values for parameters could be defined directly in the class signature, rather than needing to be defined in the function which uses the parameter. 26 | 27 | On the other hand, the main issue with using `dataclass` is that it would require the customer to figure out which class needs to be imported. With `TypedDict`, this is just one of the options; the parameters can simply be passed as a typical Python dictionary. 28 | 29 | This alternative was discussed in more detail as part of [PR#1855](https://github.com/aws/aws-sdk-pandas/pull/1855#issuecomment-1353618099). 30 | 31 | ## Consequences 32 | 33 | Subclasses of `TypedDict` such as `GlueCatalogParameters`, `AthenaCacheSettings`, `AthenaUNLOADSettings`, `AthenaCTASSettings` and `RaySettings` have been created. They are defined in the `awswrangler.typing` module. 34 | 35 | These parameter groupings can be used in either of the following two ways: 36 | ```python 37 | wr.athena.read_sql_query( 38 | "SELECT * FROM ...", 39 | ctas_approach=True, 40 | athena_cache_settings={"max_cache_seconds": 900}, 41 | ) 42 | 43 | wr.athena.read_sql_query( 44 | "SELECT * FROM ...", 45 | ctas_approach=True, 46 | athena_cache_settings=wr.typing.AthenaCacheSettings( 47 | max_cache_seconds=900, 48 | ), 49 | ) 50 | ``` 51 | 52 | Many of our function signatures have been changed to take advantage of this refactor. Many of these are breaking changes which will be released as part of the next major version: `3.0.0`.
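
For context, such a grouping is nothing more than a `TypedDict` declaration. A minimal sketch is shown below; the class name and field names are illustrative only, while the real definitions live in `awswrangler.typing`.

```python
from typing import TypedDict


class AthenaCacheSettingsSketch(TypedDict, total=False):
    """Illustrative subset of a cache-settings grouping."""

    max_cache_seconds: int
    max_cache_query_inspections: int


# Both spellings satisfy the same contract for a type checker such as mypy:
settings_as_dict: AthenaCacheSettingsSketch = {"max_cache_seconds": 900}
settings_as_call = AthenaCacheSettingsSketch(max_cache_seconds=900)
```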
53 | -------------------------------------------------------------------------------- /adr/0004-no-alter-iam-permissions.md: -------------------------------------------------------------------------------- 1 | # 4. AWS SDK for pandas does not alter IAM permissions 2 | 3 | Date: 2023-03-15 4 | 5 | ## Status 6 | 7 | Accepted 8 | 9 | ## Context 10 | 11 | AWS SDK for pandas requires permissions to execute AWS API calls. Permissions are granted using AWS Identity and 12 | Access Management Policies that are attached to IAM entities - users or roles. 13 | 14 | ## Decision 15 | 16 | AWS SDK for pandas does not alter (create, update, delete) IAM permissions policies attached to the IAM entities. 17 | 18 | ## Consequences 19 | 20 | It is the user's responsibility to ensure that the IAM entities they use to execute the calls have the required permissions. -------------------------------------------------------------------------------- /adr/0005-move-dependencies-to-optional.md: -------------------------------------------------------------------------------- 1 | # 5. Move dependencies to optional 2 | 3 | Date: 2023-03-15 4 | 5 | ## Status 6 | 7 | Accepted 8 | 9 | ## Context 10 | 11 | AWS SDK for pandas relies on external dependencies in some of its modules. These include `redshift-connector`, `gremlinpython` and `pymysql`, to name a few. 12 | 13 | In versions 2.x and below, most of these packages were set as required, meaning they were installed regardless of whether the user actually needed them. This has introduced two major risks and issues as the number of dependencies increased: 14 | 1. **Security risk**: Unused dependencies increase the attack surface to manage. Users must scan them and ensure that they are kept up to date even though they don't need them 15 | 2. **Dependency hell**: Users must resolve dependencies for packages that they are not using. It can lead to dependency hell and prevent critical updates related to security patches and major bugs 16 | 17 | ## Decision 18 | 19 | A breaking change is introduced in version 3.x where the number of required dependencies is reduced to the most important ones, namely: 20 | * boto3 21 | * pandas 22 | * numpy 23 | * pyarrow 24 | * typing-extensions 25 | 26 | ## Consequences 27 | 28 | All other dependencies are moved to optional and must be installed by the user separately using `pip install awswrangler[dependency]`. For instance, the command to use the Redshift APIs is `pip install awswrangler[redshift]`. Failing to do so raises an exception informing the user that the package is missing and how to install it. 29 | -------------------------------------------------------------------------------- /adr/0006-deprecate-s3-merge-upsert-table.md: -------------------------------------------------------------------------------- 1 | # 6. Deprecate wr.s3.merge_upsert_table 2 | 3 | Date: 2023-03-15 4 | 5 | ## Status 6 | 7 | Accepted 8 | 9 | ## Context 10 | 11 | AWS SDK for pandas `wr.s3.merge_upsert_table` is used to perform upsert (update else insert) onto an existing AWS Glue 12 | Data Catalog table. It is a much simplified version of upsert functionality that is supported natively by Apache Hudi 13 | and Athena Iceberg tables, and does not, for example, handle partitioned datasets. 14 | 15 | ## Decision 16 | 17 | To avoid a poor user experience, `wr.s3.merge_upsert_table` is deprecated and will be removed in the 3.0 release.
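
The repository's own `awswrangler/annotations.py` (reproduced further down in this dump) provides the general mechanism for surfacing this kind of deprecation; a minimal sketch of the idea, with illustrative names only, could look like this.

```python
import warnings
from functools import wraps
from typing import Any, Callable


def deprecated_sketch(message: str) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
    """Warn callers that a function is scheduled for removal."""

    def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
        @wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            warnings.warn(f"`{func.__name__}`: {message}", DeprecationWarning, stacklevel=2)
            return func(*args, **kwargs)

        return wrapper

    return decorator


@deprecated_sketch("deprecated and scheduled for removal in the 3.0 release")
def merge_upsert_table_example(*args: Any, **kwargs: Any) -> None:
    """Placeholder standing in for the deprecated API."""
```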
18 | 19 | ## Consequences 20 | 21 | In [PR#2076](https://github.com/aws/aws-sdk-pandas/pull/2076), the `wr.s3.merge_upsert_table` function was removed. 22 | -------------------------------------------------------------------------------- /adr/0007-design-of-engine-and-memory-format.md: -------------------------------------------------------------------------------- 1 | # 7. Design of engine and memory format 2 | 3 | Date: 2023-03-16 4 | 5 | ## Status 6 | 7 | Accepted 8 | 9 | ## Context 10 | 11 | Ray and Modin are the two frameworks used to support running `awswrangler` APIs at scale. Adding them to the codebase requires significant refactoring work. The original approach considered was to handle both distributed and non-distributed code within the same modules. This quickly turned out to be undesirable as it affected the readability, maintainability and scalability of the codebase. 12 | 13 | ## Decision 14 | 15 | Version 3.x of the library introduces two new constructs, `engine` and `memory_format`, which are designed to address the aforementioned shortcomings of the original approach, but also provide additional functionality. 16 | 17 | Currently `engine` takes one of two values: `python` (default) or `ray`, but additional engines could be onboarded in the future. The value is determined at import based on installed dependencies. The user can override this value with `wr.engine.set("engine_name")`. Likewise, `memory_format` can be set to `pandas` (default) or `modin` and overridden with `wr.memory_format.set("memory_format_name")`. 18 | 19 | A custom dispatcher is used to register functions based on the execution and memory format values. For instance, if the `ray` engine is detected at import, then methods distributed with Ray are used instead of the default AWS SDK for pandas code. 20 | 21 | ## Consequences 22 | 23 | __The good__: 24 | 25 | *Clear separation of concerns*: Distributed methods live outside non-distributed code, eliminating ugly if conditionals, allowing both to scale independently and making them easier to maintain in the future 26 | 27 | *Better dispatching*: Adding a new engine/memory format is as simple as creating a new directory with its methods and registering them with the custom dispatcher based on the value of the engine or memory format 28 | 29 | *Custom engine/memory format classes*: Give more flexibility than config when it comes to interacting with the engine and managing its state (initialising, registering, get/setting...) 30 | 31 | __The bad__: 32 | 33 | *Managing state*: Adding a custom dispatcher means that we must maintain its state. For instance, unregistering methods when a user sets a different engine (e.g. moving from ray to dask at execution time) is currently unsupported 34 | 35 | *Detecting the engine*: Conditionals are simpler/easier when it comes to detecting an engine. With a custom dispatcher, the registration and dispatching process is more opaque/convoluted. For example, there is a higher risk of not realising that we are using a given engine vs another 36 | 37 | __The ugly__: 38 | 39 | *Unused arguments*: Each method registered with the dispatcher must accept the union of both non-distributed and distributed arguments, even though some would be unused. As the list of supported engines grows, so does the number of unused arguments.
It also means that we must maintain the same list of arguments across the different versions of the method. -------------------------------------------------------------------------------- /adr/0008-switching-between-pyarrow-and-pandas-based-datasources-for-csv-json-i-o.md: -------------------------------------------------------------------------------- 1 | # 8. Switching between PyArrow and Pandas based datasources for CSV/JSON I/O 2 | 3 | Date: 2023-03-16 4 | 5 | ## Status 6 | 7 | Accepted 8 | 9 | ## Context 10 | 11 | The reading and writing operations for CSV/JSON data in *AWS SDK for pandas* make use of the underlying functions in Pandas. For example, `wr.s3.read_csv` will open a stream of data from S3 and then invoke `pandas.read_csv`. This allows the library to fully support all the arguments which are supported by the underlying Pandas functions. Functions such as `wr.s3.read_csv` or `wr.s3.to_json` accept a `**kwargs` parameter which forwards all parameters to `pandas.read_csv` and `pandas.to_json` automatically. 12 | 13 | From version 3.0.0 onward, *AWS SDK for pandas* supports Ray and Modin. When those two libraries are installed, all aforementioned I/O functions will be distributed on a Ray cluster. In the background, this means that all the I/O functions for S3 are running as part of a [custom Ray data source](https://docs.ray.io/en/latest/_modules/ray/data/datasource/datasource.html). Data is then returned in blocks, which form the Modin DataFrame. 14 | 15 | The issue is that the Pandas I/O functions work very slowly in the Ray datasource compared with the equivalent I/O functions in PyArrow. Therefore, calling `pyarrow.csv.read_csv` is significantly faster than calling `pandas.read_csv` in the background. 16 | 17 | However, the PyArrow I/O functions do not support the same set of parameters as the ones in Pandas. As a consequence, whereas the PyArrow functions offer greater performance, they come at the cost of feature parity between the non-distributed mode and the distributed mode. 18 | 19 | For reference, loading 5 GiB of CSV data with the PyArrow functions took around 30 seconds, compared to 120 seconds with the Pandas functions in the same scenario. 20 | For writing back to S3, the speed-up is around 2x. 21 | 22 | ## Decision 23 | 24 | To maximize performance without losing feature parity, we implemented logic whereby if the user passes a set of parameters which are supported by PyArrow, the library uses PyArrow for reading/writing. If not, the library defaults to the slower Pandas functions, which support the full set of parameters. 25 | 26 | The following example illustrates the difference: 27 | 28 | ```python 29 | # This will be loaded by PyArrow, as `doublequote` is supported 30 | wr.s3.read_csv( 31 | path="s3://my-bucket/my-path/", 32 | dataset=True, 33 | doublequote=False, 34 | ) 35 | 36 | # This will be loaded using the Pandas I/O functions, as `comment` is not supported by PyArrow 37 | wr.s3.read_csv( 38 | path="s3://my-bucket/my-path/", 39 | dataset=True, 40 | comment="#", 41 | ) 42 | ``` 43 | 44 | This logic is applied to the following functions: 45 | 1. `wr.s3.read_csv` 46 | 2. `wr.s3.read_json` 47 | 3. `wr.s3.to_json` 48 | 4. `wr.s3.to_csv` 49 | 50 | ## Consequences 51 | 52 | The logic of switching between using PyArrow or Pandas functions in the background was implemented as part of [#1699](https://github.com/aws/aws-sdk-pandas/pull/1699).
It was later expanded to support more parameters in [#2008](https://github.com/aws/aws-sdk-pandas/pull/2008) and [#2019](https://github.com/aws/aws-sdk-pandas/pull/2019). 53 | -------------------------------------------------------------------------------- /adr/0009-lazy-engine-initialization.md: -------------------------------------------------------------------------------- 1 | # 9. Engine selection and lazy initialization 2 | 3 | Date: 2023-05-17 4 | 5 | ## Status 6 | 7 | Accepted 8 | 9 | ## Context 10 | 11 | In distributed mode, three approaches are possible when it comes to selecting and initializing a Ray engine: 12 | 1. Initialize the Ray runtime at import (current default). This option causes the least friction to the user but assumes that installing Ray as an optional dependency is enough to enable distributed mode. Moreover, the user cannot prevent/delay Ray initialization (as it's done at import) 13 | 2. Initialize the Ray runtime on the first distributed API call. The user can prevent Ray initialization by switching the engine/memory format with environment variables or between import and the first awswrangler distributed API call. However, by default this approach still assumes that installing Ray is equivalent to enabling distributed mode 14 | 3. Wait for the user to enable distributed mode, via environment variables and/or via `wr.engine.set`. This option makes no assumption on which mode to use (distributed vs non-distributed). Non-distributed would be the default and it's up to the user to switch the engine/memory format 15 | 16 | ## Decision 17 | 18 | Option #1 is inflexible and gives little control to the user, while option #3 introduces too much friction and puts the burden on the user. Option #2 on the other hand gives full flexibility to the user while providing a sane default. 19 | 20 | ## Consequences 21 | 22 | The only difference between the current default and the suggested approach is to delay engine initialization, which is not a breaking change. However, it means that in certain situations more than one Ray instance is initialized. For instance, when running tests across multiple threads, each thread runs its own Ray runtime. 23 | -------------------------------------------------------------------------------- /awswrangler/__init__.py: -------------------------------------------------------------------------------- 1 | """Initial Module. 
2 | 3 | Source repository: https://github.com/aws/aws-sdk-pandas 4 | Documentation: https://aws-sdk-pandas.readthedocs.io/ 5 | 6 | """ 7 | 8 | import logging as _logging 9 | 10 | from awswrangler import ( 11 | athena, 12 | catalog, 13 | chime, 14 | cleanrooms, 15 | cloudwatch, 16 | data_api, 17 | data_quality, 18 | dynamodb, 19 | emr, 20 | emr_serverless, 21 | exceptions, 22 | mysql, 23 | neptune, 24 | opensearch, 25 | oracle, 26 | postgresql, 27 | quicksight, 28 | redshift, 29 | s3, 30 | secretsmanager, 31 | sqlserver, 32 | sts, 33 | timestream, 34 | typing, 35 | ) 36 | from awswrangler.__metadata__ import __description__, __license__, __title__, __version__ 37 | from awswrangler._config import config 38 | from awswrangler._distributed import EngineEnum, MemoryFormatEnum, engine, memory_format 39 | 40 | engine.register() 41 | 42 | __all__ = [ 43 | "athena", 44 | "catalog", 45 | "chime", 46 | "cleanrooms", 47 | "cloudwatch", 48 | "emr", 49 | "emr_serverless", 50 | "data_api", 51 | "data_quality", 52 | "dynamodb", 53 | "exceptions", 54 | "opensearch", 55 | "oracle", 56 | "quicksight", 57 | "s3", 58 | "sts", 59 | "redshift", 60 | "mysql", 61 | "neptune", 62 | "postgresql", 63 | "secretsmanager", 64 | "sqlserver", 65 | "config", 66 | "engine", 67 | "memory_format", 68 | "timestream", 69 | "typing", 70 | "__description__", 71 | "__license__", 72 | "__title__", 73 | "__version__", 74 | "EngineEnum", 75 | "MemoryFormatEnum", 76 | ] 77 | 78 | 79 | _logging.getLogger("awswrangler").addHandler(_logging.NullHandler()) 80 | -------------------------------------------------------------------------------- /awswrangler/__metadata__.py: -------------------------------------------------------------------------------- 1 | """Metadata Module. 2 | 3 | Source repository: https://github.com/aws/aws-sdk-pandas 4 | Documentation: https://aws-sdk-pandas.readthedocs.io/ 5 | 6 | """ 7 | 8 | __title__: str = "awswrangler" 9 | __description__: str = "Pandas on AWS." 
10 | __version__: str = "3.12.0" 11 | __license__: str = "Apache License 2.0" 12 | -------------------------------------------------------------------------------- /awswrangler/_executor.py: -------------------------------------------------------------------------------- 1 | """Executor Module (PRIVATE).""" 2 | 3 | from __future__ import annotations 4 | 5 | import concurrent.futures 6 | import itertools 7 | import logging 8 | from abc import ABC, abstractmethod 9 | from typing import TYPE_CHECKING, Any, Callable, TypeVar 10 | 11 | from awswrangler import _utils 12 | from awswrangler._distributed import engine 13 | 14 | if TYPE_CHECKING: 15 | from botocore.client import BaseClient 16 | 17 | _logger: logging.Logger = logging.getLogger(__name__) 18 | 19 | 20 | MapOutputType = TypeVar("MapOutputType") 21 | 22 | 23 | class _BaseExecutor(ABC): 24 | def __init__(self) -> None: 25 | _logger.debug("Creating an %s executor: ", self.__class__) 26 | 27 | @abstractmethod 28 | def map( 29 | self, 30 | func: Callable[..., MapOutputType], 31 | boto3_client: "BaseClient" | None, 32 | *args: Any, 33 | ) -> list[MapOutputType]: 34 | pass 35 | 36 | 37 | class _ThreadPoolExecutor(_BaseExecutor): 38 | def __init__(self, use_threads: bool | int): 39 | super().__init__() 40 | self._exec: concurrent.futures.ThreadPoolExecutor | None = None 41 | self._cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) 42 | if self._cpus > 1: 43 | _logger.debug("Initializing ThreadPoolExecutor with %d workers", self._cpus) 44 | self._exec = concurrent.futures.ThreadPoolExecutor(max_workers=self._cpus) 45 | 46 | def map( 47 | self, func: Callable[..., MapOutputType], boto3_client: "BaseClient" | None, *args: Any 48 | ) -> list[MapOutputType]: 49 | """Map iterables to multi-threaded function.""" 50 | _logger.debug("Map: %s", func) 51 | if self._exec is not None: 52 | iterables = (itertools.repeat(boto3_client), *args) 53 | return list(self._exec.map(func, *iterables)) 54 | # Single-threaded 55 | return list(map(func, *(itertools.repeat(boto3_client), *args))) 56 | 57 | 58 | @engine.dispatch_on_engine 59 | def _get_executor(use_threads: bool | int, **kwargs: Any) -> _BaseExecutor: 60 | # kwargs allows for parameter that will be used by other variants of this function, 61 | # such as `parallelism` for _get_ray_executor 62 | return _ThreadPoolExecutor(use_threads) 63 | -------------------------------------------------------------------------------- /awswrangler/_sql_utils.py: -------------------------------------------------------------------------------- 1 | """SQL utilities.""" 2 | 3 | import re 4 | 5 | from awswrangler import exceptions 6 | 7 | 8 | def identifier(sql: str, sql_mode: str = "mysql") -> str: 9 | """ 10 | Turn the input into an escaped SQL identifier, such as the name of a table or column. 11 | 12 | sql: str 13 | Identifier to use in SQL. 14 | sql_mode: str 15 | "mysql" for default MySQL identifiers (backticks), "ansi" for ANSI-compatible identifiers (double quotes), or 16 | "mssql" for MSSQL identifiers (square brackets). 17 | 18 | Returns 19 | ------- 20 | str 21 | Escaped SQL identifier. 
22 | """ 23 | if not isinstance(sql, str): 24 | raise exceptions.InvalidArgumentValue("identifier must be a str") 25 | 26 | if len(sql) == 0: 27 | raise exceptions.InvalidArgumentValue("identifier must be > 0 characters in length") 28 | 29 | if re.search(r"[^a-zA-Z0-9-_ ]", sql): 30 | raise exceptions.InvalidArgumentValue( 31 | "identifier must contain only alphanumeric characters, spaces, underscores, or hyphens" 32 | ) 33 | 34 | if sql_mode == "mysql": 35 | return f"`{sql}`" 36 | elif sql_mode == "ansi": 37 | return f'"{sql}"' 38 | elif sql_mode == "mssql": 39 | return f"[{sql}]" 40 | 41 | raise ValueError(f"Unknown SQL MODE: {sql_mode}") 42 | -------------------------------------------------------------------------------- /awswrangler/annotations.py: -------------------------------------------------------------------------------- 1 | """Annotations Module.""" 2 | 3 | from __future__ import annotations 4 | 5 | import warnings 6 | from functools import wraps 7 | from typing import Any, Callable, TypeVar, cast 8 | 9 | from awswrangler._config import _insert_str, config 10 | 11 | FunctionType = TypeVar("FunctionType", bound=Callable[..., Any]) 12 | 13 | 14 | class SDKPandasDeprecatedWarning(Warning): 15 | """Deprecated Warning.""" 16 | 17 | 18 | class SDKPandasExperimentalWarning(Warning): 19 | """Experimental Warning.""" 20 | 21 | 22 | def _inject_note( 23 | doc: str | None, 24 | message: str, 25 | ) -> str | None: 26 | token: str = "\n Parameters" 27 | if not doc or token not in doc: 28 | return doc 29 | note: str = f"\n\n Warning\n ----\n {message}\n\n" 30 | return _insert_str(text=doc, token=token, insert=note) 31 | 32 | 33 | def warn_message( 34 | message: str, 35 | warning_class: type[Warning], 36 | stacklevel: int = 2, 37 | ) -> Callable[[FunctionType], FunctionType]: 38 | """Decorate functions with this to print warnings.""" 39 | 40 | def decorator(func: FunctionType) -> FunctionType: 41 | @wraps(func) 42 | def inner(*args: Any, **kwargs: Any) -> Any: 43 | if not config.suppress_warnings: 44 | warnings.warn(f"`{func.__name__}`: {message}", warning_class, stacklevel=stacklevel) 45 | 46 | return func(*args, **kwargs) 47 | 48 | inner.__doc__ = _inject_note( 49 | doc=func.__doc__, 50 | message=message, 51 | ) 52 | 53 | return cast(FunctionType, inner) 54 | 55 | return decorator 56 | 57 | 58 | Deprecated = warn_message( 59 | "This API is deprecated and will be removed in future AWS SDK for Pandas releases. ", 60 | SDKPandasDeprecatedWarning, 61 | ) 62 | 63 | 64 | Experimental = warn_message( 65 | "This API is experimental and may change in future AWS SDK for Pandas releases. 
", 66 | SDKPandasExperimentalWarning, 67 | ) 68 | -------------------------------------------------------------------------------- /awswrangler/athena/__init__.py: -------------------------------------------------------------------------------- 1 | """Amazon Athena Module.""" 2 | 3 | from awswrangler.athena._executions import ( # noqa 4 | get_query_execution, 5 | stop_query_execution, 6 | start_query_execution, 7 | wait_query, 8 | ) 9 | from awswrangler.athena._spark import create_spark_session, run_spark_calculation 10 | from awswrangler.athena._statements import ( 11 | create_prepared_statement, 12 | delete_prepared_statement, 13 | list_prepared_statements, 14 | ) 15 | from awswrangler.athena._read import ( 16 | get_query_results, 17 | read_sql_query, 18 | read_sql_table, 19 | unload, 20 | ) 21 | from awswrangler.athena._utils import ( 22 | create_athena_bucket, 23 | create_ctas_table, 24 | describe_table, 25 | generate_create_query, 26 | get_named_query_statement, 27 | get_query_columns_types, 28 | get_query_executions, 29 | get_work_group, 30 | list_query_executions, 31 | repair_table, 32 | show_create_table, 33 | ) 34 | from awswrangler.athena._write_iceberg import to_iceberg, delete_from_iceberg_table 35 | 36 | 37 | __all__ = [ 38 | "read_sql_query", 39 | "read_sql_table", 40 | "create_athena_bucket", 41 | "describe_table", 42 | "get_query_columns_types", 43 | "get_query_execution", 44 | "get_query_executions", 45 | "get_query_results", 46 | "get_named_query_statement", 47 | "get_work_group", 48 | "generate_create_query", 49 | "list_query_executions", 50 | "repair_table", 51 | "create_spark_session", 52 | "run_spark_calculation", 53 | "create_ctas_table", 54 | "show_create_table", 55 | "start_query_execution", 56 | "stop_query_execution", 57 | "unload", 58 | "wait_query", 59 | "create_prepared_statement", 60 | "list_prepared_statements", 61 | "delete_prepared_statement", 62 | "to_iceberg", 63 | "delete_from_iceberg_table", 64 | ] 65 | -------------------------------------------------------------------------------- /awswrangler/athena/_executions.pyi: -------------------------------------------------------------------------------- 1 | from typing import ( 2 | Any, 3 | Literal, 4 | overload, 5 | ) 6 | 7 | import boto3 8 | 9 | from awswrangler import typing 10 | 11 | @overload 12 | def start_query_execution( 13 | sql: str, 14 | database: str | None = ..., 15 | s3_output: str | None = ..., 16 | workgroup: str = ..., 17 | encryption: str | None = ..., 18 | kms_key: str | None = ..., 19 | params: dict[str, Any] | list[str] | None = ..., 20 | paramstyle: Literal["qmark", "named"] = ..., 21 | boto3_session: boto3.Session | None = ..., 22 | athena_cache_settings: typing.AthenaCacheSettings | None = ..., 23 | athena_query_wait_polling_delay: float = ..., 24 | data_source: str | None = ..., 25 | wait: Literal[False] = ..., 26 | ) -> str: ... 27 | @overload 28 | def start_query_execution( 29 | sql: str, 30 | *, 31 | database: str | None = ..., 32 | s3_output: str | None = ..., 33 | workgroup: str = ..., 34 | encryption: str | None = ..., 35 | kms_key: str | None = ..., 36 | params: dict[str, Any] | list[str] | None = ..., 37 | paramstyle: Literal["qmark", "named"] = ..., 38 | boto3_session: boto3.Session | None = ..., 39 | athena_cache_settings: typing.AthenaCacheSettings | None = ..., 40 | athena_query_wait_polling_delay: float = ..., 41 | data_source: str | None = ..., 42 | wait: Literal[True], 43 | ) -> dict[str, Any]: ... 
44 | @overload 45 | def start_query_execution( 46 | sql: str, 47 | *, 48 | database: str | None = ..., 49 | s3_output: str | None = ..., 50 | workgroup: str = ..., 51 | encryption: str | None = ..., 52 | kms_key: str | None = ..., 53 | params: dict[str, Any] | list[str] | None = ..., 54 | paramstyle: Literal["qmark", "named"] = ..., 55 | boto3_session: boto3.Session | None = ..., 56 | athena_cache_settings: typing.AthenaCacheSettings | None = ..., 57 | athena_query_wait_polling_delay: float = ..., 58 | data_source: str | None = ..., 59 | wait: bool, 60 | ) -> str | dict[str, Any]: ... 61 | def stop_query_execution(query_execution_id: str, boto3_session: boto3.Session | None = ...) -> None: ... 62 | def wait_query( 63 | query_execution_id: str, 64 | boto3_session: boto3.Session | None = None, 65 | athena_query_wait_polling_delay: float = ..., 66 | ) -> dict[str, Any]: ... 67 | def get_query_execution(query_execution_id: str, boto3_session: boto3.Session | None = ...) -> dict[str, Any]: ... 68 | -------------------------------------------------------------------------------- /awswrangler/catalog/__init__.py: -------------------------------------------------------------------------------- 1 | """Amazon Glue Catalog Module.""" 2 | 3 | from awswrangler.catalog._add import ( 4 | add_column, 5 | add_csv_partitions, 6 | add_json_partitions, 7 | add_orc_partitions, 8 | add_parquet_partitions, 9 | ) 10 | from awswrangler.catalog._create import ( 11 | _create_csv_table, 12 | _create_json_table, 13 | _create_parquet_table, 14 | create_csv_table, 15 | create_database, 16 | create_json_table, 17 | create_orc_table, 18 | create_parquet_table, 19 | overwrite_table_parameters, 20 | upsert_table_parameters, 21 | ) 22 | from awswrangler.catalog._delete import ( 23 | delete_all_partitions, 24 | delete_column, 25 | delete_database, 26 | delete_partitions, 27 | delete_table_if_exists, 28 | ) 29 | from awswrangler.catalog._get import ( 30 | _get_table_input, 31 | databases, 32 | get_columns_comments, 33 | get_columns_parameters, 34 | get_connection, 35 | get_csv_partitions, 36 | get_databases, 37 | get_parquet_partitions, 38 | get_partitions, 39 | get_table_description, 40 | get_table_location, 41 | get_table_number_of_versions, 42 | get_table_parameters, 43 | get_table_types, 44 | get_table_versions, 45 | get_tables, 46 | search_tables, 47 | table, 48 | tables, 49 | ) 50 | from awswrangler.catalog._utils import ( 51 | does_table_exist, 52 | drop_duplicated_columns, 53 | extract_athena_types, 54 | rename_duplicated_columns, 55 | sanitize_column_name, 56 | sanitize_dataframe_columns_names, 57 | sanitize_table_name, 58 | ) 59 | 60 | __all__ = [ 61 | "add_column", 62 | "add_csv_partitions", 63 | "add_json_partitions", 64 | "add_parquet_partitions", 65 | "add_orc_partitions", 66 | "does_table_exist", 67 | "delete_column", 68 | "drop_duplicated_columns", 69 | "extract_athena_types", 70 | "rename_duplicated_columns", 71 | "sanitize_column_name", 72 | "sanitize_dataframe_columns_names", 73 | "sanitize_table_name", 74 | "_create_csv_table", 75 | "_create_parquet_table", 76 | "_create_json_table", 77 | "create_csv_table", 78 | "create_database", 79 | "create_parquet_table", 80 | "create_orc_table", 81 | "create_json_table", 82 | "overwrite_table_parameters", 83 | "upsert_table_parameters", 84 | "_get_table_input", 85 | "databases", 86 | "get_columns_comments", 87 | "get_columns_parameters", 88 | "get_connection", 89 | "get_csv_partitions", 90 | "get_databases", 91 | "get_parquet_partitions", 92 | "get_partitions", 93 | 
"get_table_description", 94 | "get_table_location", 95 | "get_table_number_of_versions", 96 | "get_table_parameters", 97 | "get_table_types", 98 | "get_table_versions", 99 | "get_tables", 100 | "search_tables", 101 | "table", 102 | "tables", 103 | "delete_database", 104 | "delete_table_if_exists", 105 | "delete_partitions", 106 | "delete_all_partitions", 107 | ] 108 | -------------------------------------------------------------------------------- /awswrangler/chime.py: -------------------------------------------------------------------------------- 1 | """Chime Message/Notification module.""" 2 | 3 | from __future__ import annotations 4 | 5 | import json 6 | import logging 7 | from typing import Any 8 | from urllib.error import HTTPError, URLError 9 | from urllib.request import Request, urlopen 10 | 11 | _logger: logging.Logger = logging.getLogger(__name__) 12 | 13 | 14 | def post_message(webhook: str, message: str) -> Any | None: 15 | """Send message on an existing Chime Chat rooms. 16 | 17 | Parameters 18 | ---------- 19 | webhook 20 | Contains all the authentication information to send the message 21 | message 22 | The actual message which needs to be posted on Slack channel 23 | 24 | Returns 25 | ------- 26 | The response from Chime 27 | """ 28 | response = None 29 | chime_message = {"Content": f"Message: {message}"} 30 | req = Request(webhook, json.dumps(chime_message).encode("utf-8")) 31 | try: 32 | response = urlopen(req) 33 | _logger.info("Message posted on Chime. Got respone as %s", response.read()) 34 | except HTTPError as e: 35 | _logger.exception("Request failed: %d %s", e.code, e.reason) 36 | except URLError as e: 37 | _logger.exception("Server connection failed: %s", e.reason) 38 | return response 39 | -------------------------------------------------------------------------------- /awswrangler/cleanrooms/__init__.py: -------------------------------------------------------------------------------- 1 | """Amazon Clean Rooms Module.""" 2 | 3 | from awswrangler.cleanrooms._read import read_sql_query 4 | from awswrangler.cleanrooms._utils import wait_query 5 | 6 | __all__ = [ 7 | "read_sql_query", 8 | "wait_query", 9 | ] 10 | -------------------------------------------------------------------------------- /awswrangler/cleanrooms/_utils.py: -------------------------------------------------------------------------------- 1 | """Utilities Module for Amazon Clean Rooms.""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | import time 7 | from typing import TYPE_CHECKING 8 | 9 | import boto3 10 | 11 | from awswrangler import _utils, exceptions 12 | 13 | if TYPE_CHECKING: 14 | from mypy_boto3_cleanrooms.type_defs import GetProtectedQueryOutputTypeDef 15 | 16 | _QUERY_FINAL_STATES: list[str] = ["CANCELLED", "FAILED", "SUCCESS", "TIMED_OUT"] 17 | _QUERY_WAIT_POLLING_DELAY: float = 2 # SECONDS 18 | 19 | _logger: logging.Logger = logging.getLogger(__name__) 20 | 21 | 22 | def wait_query( 23 | membership_id: str, query_id: str, boto3_session: boto3.Session | None = None 24 | ) -> "GetProtectedQueryOutputTypeDef": 25 | """Wait for the Clean Rooms protected query to end. 26 | 27 | Parameters 28 | ---------- 29 | membership_id 30 | Membership ID 31 | query_id 32 | Protected query execution ID 33 | boto3_session 34 | The default boto3 session will be used if **boto3_session** is ``None``. 35 | 36 | Returns 37 | ------- 38 | ``Dict[str, Any]`` 39 | Dictionary with the get_protected_query response. 
40 | 41 | Raises 42 | ------ 43 | exceptions.QueryFailed 44 | Raises exception with error message if protected query is cancelled, times out or fails. 45 | 46 | Examples 47 | -------- 48 | >>> import awswrangler as wr 49 | >>> res = wr.cleanrooms.wait_query(membership_id='membership-id', query_id='query-id') 50 | """ 51 | client_cleanrooms = _utils.client(service_name="cleanrooms", session=boto3_session) 52 | state = "SUBMITTED" 53 | 54 | while state not in _QUERY_FINAL_STATES: 55 | time.sleep(_QUERY_WAIT_POLLING_DELAY) 56 | response = client_cleanrooms.get_protected_query( 57 | membershipIdentifier=membership_id, protectedQueryIdentifier=query_id 58 | ) 59 | state = response["protectedQuery"].get("status") # type: ignore[assignment] 60 | 61 | _logger.debug("state: %s", state) 62 | if state != "SUCCESS": 63 | raise exceptions.QueryFailed(response["protectedQuery"].get("Error")) 64 | return response 65 | -------------------------------------------------------------------------------- /awswrangler/data_api/__init__.py: -------------------------------------------------------------------------------- 1 | """Data API Service Module for RDS and Redshift.""" 2 | 3 | from awswrangler.data_api import rds, redshift 4 | 5 | __all__ = [ 6 | "redshift", 7 | "rds", 8 | ] 9 | -------------------------------------------------------------------------------- /awswrangler/data_quality/__init__.py: -------------------------------------------------------------------------------- 1 | """AWS Glue Data Quality package.""" 2 | 3 | from awswrangler.data_quality._create import ( 4 | create_recommendation_ruleset, 5 | create_ruleset, 6 | evaluate_ruleset, 7 | update_ruleset, 8 | ) 9 | from awswrangler.data_quality._get import get_ruleset 10 | 11 | __all__ = [ 12 | "create_recommendation_ruleset", 13 | "create_ruleset", 14 | "evaluate_ruleset", 15 | "get_ruleset", 16 | "update_ruleset", 17 | ] 18 | -------------------------------------------------------------------------------- /awswrangler/data_quality/_get.py: -------------------------------------------------------------------------------- 1 | """AWS Glue Data Quality Get Module.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import cast 6 | 7 | import boto3 8 | 9 | import awswrangler.pandas as pd 10 | from awswrangler.data_quality._utils import _get_ruleset, _rules_to_df 11 | 12 | 13 | def get_ruleset( 14 | name: str | list[str], 15 | boto3_session: boto3.Session | None = None, 16 | ) -> pd.DataFrame: 17 | """Get a Data Quality ruleset. 18 | 19 | Parameters 20 | ---------- 21 | name 22 | Ruleset name or list of names. 23 | boto3_session 24 | The default boto3 session will be used if **boto3_session** is ``None``. 25 | 26 | Returns 27 | ------- 28 | Data frame with ruleset(s) details. 29 | 30 | Examples 31 | -------- 32 | Get single ruleset 33 | >>> import awswrangler as wr 34 | >>> df_ruleset = wr.data_quality.get_ruleset(name="my_ruleset") 35 | 36 | Get multiple rulesets. 
A column with the ruleset name is added to the data frame 37 | >>> import awswrangler as wr 38 | >>> df_rulesets = wr.data_quality.get_ruleset(name=["ruleset_1", "ruleset_2"]) 39 | """ 40 | ruleset_names: list[str] = name if isinstance(name, list) else [name] 41 | dfs: list[pd.DataFrame] = [] 42 | for ruleset_name in ruleset_names: 43 | rules = cast(str, _get_ruleset(ruleset_name=ruleset_name, boto3_session=boto3_session)["Ruleset"]) 44 | df = _rules_to_df(rules=rules) 45 | if len(ruleset_names) > 1: 46 | df["ruleset"] = ruleset_name 47 | dfs.append(df) 48 | return pd.concat(dfs) 49 | -------------------------------------------------------------------------------- /awswrangler/distributed/__init__.py: -------------------------------------------------------------------------------- 1 | """Distributed Module.""" 2 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/__init__.py: -------------------------------------------------------------------------------- 1 | """Ray Module.""" 2 | 3 | from awswrangler.distributed.ray._core import RayLogger, initialize_ray, ray_get, ray_logger, ray_remote 4 | 5 | __all__ = [ 6 | "RayLogger", 7 | "initialize_ray", 8 | "ray_get", 9 | "ray_logger", 10 | "ray_remote", 11 | ] 12 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/_core.pyi: -------------------------------------------------------------------------------- 1 | """Ray Module.""" 2 | 3 | import logging 4 | from typing import Any, Callable 5 | 6 | class RayLogger: 7 | def __init__( 8 | self, 9 | log_level: int = logging.INFO, 10 | format: str = "%(asctime)s::%(levelname)-2s::%(name)s::%(message)s", 11 | datefmt: str = "%Y-%m-%d %H:%M:%S", 12 | ): ... 13 | def get_logger(self, name: str | Any = None) -> logging.Logger: ... 14 | 15 | def ray_logger(function: Callable[..., Any]) -> Callable[..., Any]: ... 16 | def ray_remote(**options: Any) -> Callable[..., Any]: ... 17 | def ray_get(futures: list[Any]) -> Any: ... 18 | def initialize_ray( 19 | address: str | None = None, 20 | redis_password: str | None = None, 21 | ignore_reinit_error: bool | None = True, 22 | include_dashboard: bool | None = False, 23 | log_to_driver: bool | None = True, 24 | object_store_memory: int | None = None, 25 | cpu_count: int | None = None, 26 | gpu_count: int | None = 0, 27 | ) -> None: ... 
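# Editor's note (illustrative sketch, an assumption rather than upstream documentation):
# `ray_remote()` wraps a plain function so that, with the Ray engine active, calling it
# yields a future, and `ray_get()` resolves futures back into concrete values. Mirroring
# the pattern used elsewhere in this package (`ray_remote()(func)` followed by `ray_get`):
#
#     remote_func = ray_remote()(some_pure_function)  # `some_pure_function` is hypothetical
#     futures = [remote_func(item) for item in work_items]  # `work_items` is hypothetical
#     results = ray_get(futures)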
28 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/_executor.py: -------------------------------------------------------------------------------- 1 | """Ray Executor Module (PRIVATE).""" 2 | 3 | from __future__ import annotations 4 | 5 | import itertools 6 | import logging 7 | from typing import TYPE_CHECKING, Any, Callable, TypeVar 8 | 9 | import ray 10 | import ray.actor 11 | 12 | from awswrangler import engine 13 | from awswrangler._executor import _BaseExecutor 14 | 15 | if TYPE_CHECKING: 16 | from botocore.client import BaseClient 17 | 18 | _logger: logging.Logger = logging.getLogger(__name__) 19 | 20 | MapOutputType = TypeVar("MapOutputType") 21 | 22 | 23 | class _RayExecutor(_BaseExecutor): 24 | def map(self, func: Callable[..., MapOutputType], _: "BaseClient" | None, *args: Any) -> list[MapOutputType]: 25 | """Map func and return ray futures.""" 26 | _logger.debug("Ray map: %s", func) 27 | # Discard boto3 client 28 | return list(func(*arg) for arg in zip(itertools.repeat(None), *args)) 29 | 30 | 31 | @ray.remote 32 | class AsyncActor: 33 | async def run_concurrent(self, func: Callable[..., MapOutputType], *args: Any) -> MapOutputType: 34 | return func(*args) 35 | 36 | 37 | class _RayMaxConcurrencyExecutor(_BaseExecutor): 38 | def __init__(self, max_concurrency: int) -> None: 39 | super().__init__() 40 | 41 | _logger.debug("Initializing Ray Actor with maximum concurrency %d", max_concurrency) 42 | self._actor: ray.actor.ActorHandle = AsyncActor.options(max_concurrency=max_concurrency).remote() # type: ignore[attr-defined] 43 | 44 | def map(self, func: Callable[..., MapOutputType], _: "BaseClient" | None, *args: Any) -> list[MapOutputType]: 45 | """Map func and return ray futures.""" 46 | _logger.debug("Ray map: %s", func) 47 | 48 | # Discard boto3 client 49 | iterables = (itertools.repeat(None), *args) 50 | func_python = engine.dispatch_func(func, "python") 51 | 52 | return [self._actor.run_concurrent.remote(func_python, *arg) for arg in zip(*iterables)] 53 | 54 | 55 | def _get_ray_executor(use_threads: bool | int, **kwargs: Any) -> _BaseExecutor: 56 | # We want the _RayMaxConcurrencyExecutor only to be used when the `parallelism` parameter is specified 57 | parallelism: int | None = kwargs.get("ray_parallelism") 58 | return _RayMaxConcurrencyExecutor(parallelism) if parallelism else _RayExecutor() 59 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/_utils.py: -------------------------------------------------------------------------------- 1 | """Ray utilities (PRIVATE).""" 2 | 3 | from __future__ import annotations 4 | 5 | import ray 6 | from ray.util.placement_group import PlacementGroup 7 | 8 | 9 | # https://github.com/ray-project/ray/blob/master/python/ray/data/_internal/util.py#L87 10 | def _estimate_avail_cpus(cur_pg: PlacementGroup | None) -> int: 11 | """ 12 | Estimates the available CPU parallelism for this Dataset in the cluster. 13 | 14 | If we aren't in a placement group, this is trivially the number of CPUs in the 15 | cluster. Otherwise, we try to calculate how large the placement group is relative 16 | to the size of the cluster. 17 | 18 | Args: 19 | cur_pg: The current placement group, if any. 
20 | """ 21 | cluster_cpus = int(ray.cluster_resources().get("CPU", 1)) 22 | cluster_gpus = int(ray.cluster_resources().get("GPU", 0)) 23 | 24 | # If we're in a placement group, we shouldn't assume the entire cluster's 25 | # resources are available for us to use. Estimate an upper bound on what's 26 | # reasonable to assume is available for datasets to use. 27 | if cur_pg: 28 | pg_cpus = 0 29 | for bundle in cur_pg.bundle_specs: 30 | # Calculate the proportion of the cluster this placement group "takes up". 31 | # Then scale our cluster_cpus proportionally to avoid over-parallelizing 32 | # if there are many parallel Tune trials using the cluster. 33 | cpu_fraction = bundle.get("CPU", 0) / max(1, cluster_cpus) 34 | gpu_fraction = bundle.get("GPU", 0) / max(1, cluster_gpus) 35 | max_fraction = max(cpu_fraction, gpu_fraction) 36 | # Over-parallelize by up to a factor of 2, but no more than that. It's 37 | # preferable to over-estimate than under-estimate. 38 | pg_cpus += 2 * int(max_fraction * cluster_cpus) 39 | 40 | return min(cluster_cpus, pg_cpus) 41 | 42 | return cluster_cpus 43 | 44 | 45 | def _estimate_available_parallelism() -> int: 46 | """ 47 | Estimates the available CPU parallelism for this Dataset in the cluster. 48 | 49 | If we are currently in a placement group, take that into account. 50 | """ 51 | cur_pg = ray.util.get_current_placement_group() 52 | return _estimate_avail_cpus(cur_pg) 53 | 54 | 55 | def ensure_worker_count(use_threads: bool | int = True) -> int: 56 | if type(use_threads) == int: # noqa: E721 57 | if use_threads < 1: 58 | return 1 59 | return use_threads 60 | 61 | if use_threads is False: 62 | return 1 63 | 64 | parallelism = _estimate_available_parallelism() 65 | return max(parallelism, 1) 66 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/datasources/__init__.py: -------------------------------------------------------------------------------- 1 | """Ray Datasources Module.""" 2 | 3 | from awswrangler.distributed.ray.datasources.arrow_csv_datasink import ArrowCSVDatasink 4 | from awswrangler.distributed.ray.datasources.arrow_csv_datasource import ArrowCSVDatasource 5 | from awswrangler.distributed.ray.datasources.arrow_json_datasource import ArrowJSONDatasource 6 | from awswrangler.distributed.ray.datasources.arrow_orc_datasink import ArrowORCDatasink 7 | from awswrangler.distributed.ray.datasources.arrow_orc_datasource import ArrowORCDatasource 8 | from awswrangler.distributed.ray.datasources.arrow_parquet_base_datasource import ArrowParquetBaseDatasource 9 | from awswrangler.distributed.ray.datasources.arrow_parquet_datasink import ArrowParquetDatasink 10 | from awswrangler.distributed.ray.datasources.arrow_parquet_datasource import ArrowParquetDatasource 11 | from awswrangler.distributed.ray.datasources.file_datasink import _BlockFileDatasink 12 | from awswrangler.distributed.ray.datasources.pandas_text_datasink import PandasCSVDatasink, PandasJSONDatasink 13 | from awswrangler.distributed.ray.datasources.pandas_text_datasource import ( 14 | PandasCSVDataSource, 15 | PandasFWFDataSource, 16 | PandasJSONDatasource, 17 | PandasTextDatasource, 18 | ) 19 | 20 | __all__ = [ 21 | "ArrowCSVDatasink", 22 | "ArrowORCDatasink", 23 | "ArrowParquetDatasink", 24 | "ArrowCSVDatasource", 25 | "ArrowJSONDatasource", 26 | "ArrowORCDatasource", 27 | "ArrowParquetBaseDatasource", 28 | "ArrowParquetDatasource", 29 | "PandasCSVDataSource", 30 | "PandasFWFDataSource", 31 | "PandasJSONDatasource", 32 | 
"PandasTextDatasource", 33 | "PandasCSVDatasink", 34 | "PandasJSONDatasink", 35 | "_BlockFileDatasink", 36 | ] 37 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/datasources/arrow_csv_datasink.py: -------------------------------------------------------------------------------- 1 | """Ray PandasTextDatasink Module.""" 2 | 3 | from __future__ import annotations 4 | 5 | import io 6 | import logging 7 | from typing import Any 8 | 9 | from pyarrow import csv 10 | from ray.data.block import BlockAccessor 11 | from ray.data.datasource.filename_provider import FilenameProvider 12 | 13 | from awswrangler.distributed.ray.datasources.file_datasink import _BlockFileDatasink 14 | 15 | _logger: logging.Logger = logging.getLogger(__name__) 16 | 17 | 18 | class ArrowCSVDatasink(_BlockFileDatasink): 19 | """A datasink that writes CSV files using Arrow.""" 20 | 21 | def __init__( 22 | self, 23 | path: str, 24 | *, 25 | filename_provider: FilenameProvider | None = None, 26 | dataset_uuid: str | None = None, 27 | open_s3_object_args: dict[str, Any] | None = None, 28 | pandas_kwargs: dict[str, Any] | None = None, 29 | write_options: dict[str, Any] | None = None, 30 | **write_args: Any, 31 | ): 32 | super().__init__( 33 | path, 34 | file_format="csv", 35 | filename_provider=filename_provider, 36 | dataset_uuid=dataset_uuid, 37 | open_s3_object_args=open_s3_object_args, 38 | pandas_kwargs=pandas_kwargs, 39 | **write_args, 40 | ) 41 | 42 | self.write_options = write_options or {} 43 | 44 | def write_block(self, file: io.TextIOWrapper, block: BlockAccessor) -> None: 45 | """ 46 | Write a block of data to a file. 47 | 48 | Parameters 49 | ---------- 50 | block : BlockAccessor 51 | file : io.TextIOWrapper 52 | """ 53 | csv.write_csv(block.to_arrow(), file, csv.WriteOptions(**self.write_options)) 54 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/datasources/arrow_csv_datasource.py: -------------------------------------------------------------------------------- 1 | """Ray ArrowCSVDatasource Module.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import Any, Iterator 6 | 7 | import pyarrow as pa 8 | from pyarrow import csv 9 | from ray.data.datasource.file_based_datasource import FileBasedDatasource 10 | 11 | from awswrangler._arrow import _add_table_partitions 12 | 13 | 14 | class ArrowCSVDatasource(FileBasedDatasource): 15 | """CSV datasource, for reading CSV files using PyArrow.""" 16 | 17 | _FILE_EXTENSIONS = ["csv"] 18 | 19 | def __init__( 20 | self, 21 | paths: str | list[str], 22 | dataset: bool, 23 | path_root: str, 24 | version_ids: dict[str, str] | None = None, 25 | s3_additional_kwargs: dict[str, str] | None = None, 26 | pandas_kwargs: dict[str, Any] | None = None, 27 | arrow_csv_args: dict[str, Any] | None = None, 28 | **file_based_datasource_kwargs: Any, 29 | ): 30 | from pyarrow import csv 31 | 32 | super().__init__(paths, **file_based_datasource_kwargs) 33 | 34 | self.dataset = dataset 35 | self.path_root = path_root 36 | 37 | if arrow_csv_args is None: 38 | arrow_csv_args = {} 39 | 40 | self.read_options = arrow_csv_args.pop("read_options", csv.ReadOptions(use_threads=False)) 41 | self.parse_options = arrow_csv_args.pop("parse_options", csv.ParseOptions()) 42 | self.convert_options = arrow_csv_args.get("convert_options", csv.ConvertOptions()) 43 | self.arrow_csv_args = arrow_csv_args 44 | 45 | def _read_stream(self, f: pa.NativeFile, path: str) -> 
Iterator[pa.Table]: 46 | reader = csv.open_csv( 47 | f, 48 | read_options=self.read_options, 49 | parse_options=self.parse_options, 50 | convert_options=self.convert_options, 51 | ) 52 | 53 | schema = None 54 | while True: 55 | try: 56 | batch = reader.read_next_batch() 57 | table = pa.Table.from_batches([batch], schema=schema) 58 | if schema is None: 59 | schema = table.schema 60 | 61 | if self.dataset: 62 | table = _add_table_partitions( 63 | table=table, 64 | path=f"s3://{path}", 65 | path_root=self.path_root, 66 | ) 67 | 68 | yield table 69 | 70 | except StopIteration: 71 | return 72 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/datasources/arrow_json_datasource.py: -------------------------------------------------------------------------------- 1 | """Ray ArrowJSONDatasource Module.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import Any, Iterator 6 | 7 | import pyarrow as pa 8 | from pyarrow import json 9 | from ray.data.datasource.file_based_datasource import FileBasedDatasource 10 | 11 | from awswrangler._arrow import _add_table_partitions 12 | 13 | 14 | class ArrowJSONDatasource(FileBasedDatasource): 15 | """JSON datasource, for reading JSON files using PyArrow.""" 16 | 17 | _FILE_EXTENSIONS = ["json"] 18 | 19 | def __init__( 20 | self, 21 | paths: str | list[str], 22 | dataset: bool, 23 | path_root: str, 24 | version_ids: dict[str, str] | None = None, 25 | s3_additional_kwargs: dict[str, str] | None = None, 26 | pandas_kwargs: dict[str, Any] | None = None, 27 | arrow_json_args: dict[str, Any] | None = None, 28 | **file_based_datasource_kwargs: Any, 29 | ): 30 | super().__init__(paths, **file_based_datasource_kwargs) 31 | 32 | self.dataset = dataset 33 | self.path_root = path_root 34 | 35 | if arrow_json_args is None: 36 | arrow_json_args = {} 37 | 38 | self.read_options = json.ReadOptions(arrow_json_args.pop("read_options", dict(use_threads=False))) 39 | self.parse_options = json.ParseOptions(arrow_json_args.pop("parse_options", {})) 40 | self.arrow_json_args = arrow_json_args 41 | 42 | def _read_stream(self, f: pa.NativeFile, path: str) -> Iterator[pa.Table]: 43 | table = json.read_json(f, read_options=self.read_options, parse_options=self.parse_options) 44 | 45 | if self.dataset: 46 | table = _add_table_partitions( 47 | table=table, 48 | path=f"s3://{path}", 49 | path_root=self.path_root, 50 | ) 51 | 52 | return [table] # type: ignore[return-value] 53 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/datasources/arrow_orc_datasink.py: -------------------------------------------------------------------------------- 1 | """Ray ArrowORCDatasink Module.""" 2 | 3 | from __future__ import annotations 4 | 5 | import io 6 | import logging 7 | from typing import Any 8 | 9 | import pyarrow as pa 10 | from ray.data.block import BlockAccessor 11 | from ray.data.datasource.filename_provider import FilenameProvider 12 | 13 | from awswrangler._arrow import _df_to_table 14 | from awswrangler.distributed.ray.datasources.file_datasink import _BlockFileDatasink 15 | 16 | _logger: logging.Logger = logging.getLogger(__name__) 17 | 18 | 19 | class ArrowORCDatasink(_BlockFileDatasink): 20 | """A datasink that writes ORC files using Arrow.""" 21 | 22 | def __init__( 23 | self, 24 | path: str, 25 | *, 26 | filename_provider: FilenameProvider | None = None, 27 | dataset_uuid: str | None = None, 28 | open_s3_object_args: dict[str, Any] | None = None, 29 |
pandas_kwargs: dict[str, Any] | None = None, 30 | schema: pa.Schema | None = None, 31 | index: bool = False, 32 | dtype: dict[str, str] | None = None, 33 | pyarrow_additional_kwargs: dict[str, Any] | None = None, 34 | **write_args: Any, 35 | ): 36 | super().__init__( 37 | path, 38 | file_format="orc", 39 | filename_provider=filename_provider, 40 | dataset_uuid=dataset_uuid, 41 | open_s3_object_args=open_s3_object_args, 42 | pandas_kwargs=pandas_kwargs, 43 | **write_args, 44 | ) 45 | 46 | self.pyarrow_additional_kwargs = pyarrow_additional_kwargs or {} 47 | self.schema = schema 48 | self.index = index 49 | self.dtype = dtype 50 | 51 | def write_block(self, file: io.TextIOWrapper, block: BlockAccessor) -> None: 52 | """ 53 | Write a block of data to a file. 54 | 55 | Parameters 56 | ---------- 57 | file : io.TextIOWrapper 58 | block : BlockAccessor 59 | """ 60 | from pyarrow import orc 61 | 62 | compression: str = self.write_args.get("compression", None) or "UNCOMPRESSED" 63 | 64 | orc.write_table( 65 | _df_to_table(block.to_pandas(), schema=self.schema, index=self.index, dtype=self.dtype), 66 | file, 67 | compression=compression, 68 | **self.pyarrow_additional_kwargs, 69 | ) 70 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/datasources/arrow_orc_datasource.py: -------------------------------------------------------------------------------- 1 | """Ray ArrowCSVDatasource Module.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import Any, Iterator 6 | 7 | import pyarrow as pa 8 | from ray.data.datasource.file_based_datasource import FileBasedDatasource 9 | 10 | from awswrangler._arrow import _add_table_partitions 11 | 12 | 13 | class ArrowORCDatasource(FileBasedDatasource): 14 | """ORC datasource, for reading and writing ORC files using PyArrow.""" 15 | 16 | _FILE_EXTENSIONS = ["orc"] 17 | 18 | def __init__( 19 | self, 20 | paths: str | list[str], 21 | dataset: bool, 22 | path_root: str | None, 23 | use_threads: bool | int, 24 | schema: pa.Schema, 25 | arrow_orc_args: dict[str, Any] | None = None, 26 | **file_based_datasource_kwargs: Any, 27 | ): 28 | super().__init__(paths, **file_based_datasource_kwargs) 29 | 30 | self.dataset = dataset 31 | self.path_root = path_root 32 | 33 | if arrow_orc_args is None: 34 | arrow_orc_args = {} 35 | 36 | self.columns: list[str] | None = arrow_orc_args.get("columns", None) 37 | self.arrow_orc_args = arrow_orc_args 38 | 39 | def _read_stream(self, f: pa.NativeFile, path: str) -> Iterator[pa.Table]: 40 | from pyarrow import orc 41 | 42 | table: pa.Table = orc.read_table(f, columns=self.columns) 43 | 44 | if self.dataset: 45 | table = _add_table_partitions( 46 | table=table, 47 | path=f"s3://{path}", 48 | path_root=self.path_root, 49 | ) 50 | 51 | return [table] # type: ignore[return-value] 52 | 53 | def _open_input_source( 54 | self, 55 | filesystem: pa.fs.FileSystem, 56 | path: str, 57 | **open_args: Any, 58 | ) -> pa.NativeFile: 59 | return filesystem.open_input_file(path, **open_args) 60 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/datasources/arrow_parquet_base_datasource.py: -------------------------------------------------------------------------------- 1 | """Ray ParquetBaseDatasource Module.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import Any, Iterator 6 | 7 | # fs required to implicitly trigger S3 subsystem initialization 8 | import pyarrow as pa 9 | import pyarrow.fs 10 | import 
pyarrow.parquet as pq 11 | from ray.data.datasource.file_based_datasource import FileBasedDatasource 12 | 13 | from awswrangler._arrow import _add_table_partitions 14 | 15 | 16 | class ArrowParquetBaseDatasource(FileBasedDatasource): 17 | """Parquet datasource, for reading Parquet files.""" 18 | 19 | _FILE_EXTENSIONS = ["parquet"] 20 | 21 | def __init__( 22 | self, 23 | paths: str | list[str], 24 | path_root: str, 25 | arrow_parquet_args: dict[str, Any] | None = None, 26 | **file_based_datasource_kwargs: Any, 27 | ): 28 | super().__init__(paths, **file_based_datasource_kwargs) 29 | 30 | if arrow_parquet_args is None: 31 | arrow_parquet_args = {} 32 | 33 | self.path_root = path_root 34 | self.arrow_parquet_args = arrow_parquet_args 35 | 36 | def _read_stream(self, f: pa.NativeFile, path: str) -> Iterator[pa.Table]: 37 | arrow_parquet_args = self.arrow_parquet_args 38 | 39 | use_threads: bool = arrow_parquet_args.get("use_threads", False) 40 | columns: list[str] | None = arrow_parquet_args.get("columns", None) 41 | 42 | dataset_kwargs = arrow_parquet_args.get("dataset_kwargs", {}) 43 | coerce_int96_timestamp_unit: str | None = dataset_kwargs.get("coerce_int96_timestamp_unit", None) 44 | decryption_properties = dataset_kwargs.get("decryption_properties", None) 45 | 46 | table = pq.read_table( 47 | f, 48 | use_threads=use_threads, 49 | columns=columns, 50 | coerce_int96_timestamp_unit=coerce_int96_timestamp_unit, 51 | decryption_properties=decryption_properties, 52 | ) 53 | 54 | table = _add_table_partitions( 55 | table=table, 56 | path=f"s3://{path}", 57 | path_root=self.path_root, 58 | ) 59 | 60 | return [table] # type: ignore[return-value] 61 | 62 | def _open_input_source( 63 | self, 64 | filesystem: pyarrow.fs.FileSystem, 65 | path: str, 66 | **open_args: Any, 67 | ) -> pa.NativeFile: 68 | # Parquet requires `open_input_file` due to random access reads 69 | return filesystem.open_input_file(path, **open_args) 70 | 71 | def get_name(self) -> str: 72 | """Return a human-readable name for this datasource.""" 73 | return "ParquetBulk" 74 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/datasources/arrow_parquet_datasink.py: -------------------------------------------------------------------------------- 1 | """Ray ArrowParquetDatasink Module.""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | from typing import Any 7 | 8 | import pyarrow as pa 9 | from ray.data.block import BlockAccessor 10 | from ray.data.datasource.filename_provider import FilenameProvider 11 | 12 | from awswrangler._arrow import _df_to_table 13 | from awswrangler.distributed.ray.datasources.file_datasink import _BlockFileDatasink 14 | from awswrangler.distributed.ray.datasources.filename_provider import _DefaultFilenameProvider 15 | from awswrangler.s3._write import _COMPRESSION_2_EXT 16 | 17 | _logger: logging.Logger = logging.getLogger(__name__) 18 | 19 | 20 | class _ParquetFilenameProvider(_DefaultFilenameProvider): 21 | """Parquet filename provider where compression comes before file format.""" 22 | 23 | def _generate_filename(self, file_id: str) -> str: 24 | filename = "" 25 | if self._dataset_uuid is not None: 26 | filename += f"{self._dataset_uuid}_" 27 | filename += f"{file_id}" 28 | if self._bucket_id is not None: 29 | filename += f"_bucket-{self._bucket_id:05d}" 30 | filename += f"{_COMPRESSION_2_EXT.get(self._compression)}.{self._file_format}" 31 | return filename 32 | 33 | 34 | class ArrowParquetDatasink(_BlockFileDatasink): 35 
| """A datasink that writes Parquet files.""" 36 | 37 | def __init__( 38 | self, 39 | path: str, 40 | *, 41 | filename_provider: FilenameProvider | None = None, 42 | dataset_uuid: str | None = None, 43 | open_s3_object_args: dict[str, Any] | None = None, 44 | pandas_kwargs: dict[str, Any] | None = None, 45 | schema: pa.Schema | None = None, 46 | index: bool = False, 47 | dtype: dict[str, str] | None = None, 48 | pyarrow_additional_kwargs: dict[str, Any] | None = None, 49 | compression: str | None = None, 50 | **write_args: Any, 51 | ): 52 | file_format = "parquet" 53 | write_args = write_args or {} 54 | 55 | if filename_provider is None: 56 | bucket_id = write_args.get("bucket_id", None) 57 | 58 | filename_provider = _ParquetFilenameProvider( 59 | dataset_uuid=dataset_uuid, 60 | file_format=file_format, 61 | compression=compression, 62 | bucket_id=bucket_id, 63 | ) 64 | 65 | super().__init__( 66 | path, 67 | file_format=file_format, 68 | filename_provider=filename_provider, 69 | dataset_uuid=dataset_uuid, 70 | open_s3_object_args=open_s3_object_args, 71 | pandas_kwargs=pandas_kwargs, 72 | **write_args, 73 | ) 74 | 75 | self.pyarrow_additional_kwargs = pyarrow_additional_kwargs or {} 76 | self.schema = schema 77 | self.index = index 78 | self.dtype = dtype 79 | 80 | def write_block(self, file: pa.NativeFile, block: BlockAccessor) -> None: 81 | """ 82 | Write a block of data to a file. 83 | 84 | Parameters 85 | ---------- 86 | file : pa.NativeFile 87 | block : BlockAccessor 88 | """ 89 | pa.parquet.write_table( 90 | _df_to_table(block.to_pandas(), schema=self.schema, index=self.index, dtype=self.dtype), 91 | file, 92 | **self.pyarrow_additional_kwargs, 93 | ) 94 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/datasources/filename_provider.py: -------------------------------------------------------------------------------- 1 | """Ray DefaultFilenameProvider Module.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import Any 6 | 7 | from ray.data.block import Block 8 | from ray.data.datasource.filename_provider import FilenameProvider 9 | 10 | from awswrangler.s3._write import _COMPRESSION_2_EXT 11 | 12 | 13 | class _DefaultFilenameProvider(FilenameProvider): 14 | def __init__( 15 | self, 16 | file_format: str, 17 | dataset_uuid: str | None = None, 18 | compression: str | None = None, 19 | bucket_id: int | None = None, 20 | ): 21 | self._dataset_uuid = dataset_uuid 22 | self._file_format = file_format 23 | self._compression = compression 24 | self._bucket_id = bucket_id 25 | 26 | def get_filename_for_block( 27 | self, 28 | block: Block, 29 | task_index: int, 30 | block_index: int, 31 | ) -> str: 32 | file_id = f"{task_index:06}_{block_index:06}" 33 | return self._generate_filename(file_id) 34 | 35 | def get_filename_for_row(self, row: dict[str, Any], task_index: int, block_index: int, row_index: int) -> str: 36 | file_id = f"{task_index:06}_{block_index:06}_{row_index:06}" 37 | return self._generate_filename(file_id) 38 | 39 | def _generate_filename(self, file_id: str) -> str: 40 | filename = "" 41 | if self._dataset_uuid is not None: 42 | filename += f"{self._dataset_uuid}_" 43 | filename += f"{file_id}" 44 | if self._bucket_id is not None: 45 | filename += f"_bucket-{self._bucket_id:05d}" 46 | filename += f".{self._file_format}{_COMPRESSION_2_EXT.get(self._compression)}" 47 | return filename 48 | -------------------------------------------------------------------------------- 
/awswrangler/distributed/ray/datasources/pandas_text_datasink.py: -------------------------------------------------------------------------------- 1 | """Ray PandasTextDatasink Module.""" 2 | 3 | from __future__ import annotations 4 | 5 | import io 6 | import logging 7 | from typing import Any, Callable 8 | 9 | import pandas as pd 10 | from ray.data.block import BlockAccessor 11 | from ray.data.datasource.filename_provider import FilenameProvider 12 | 13 | from awswrangler.distributed.ray.datasources.file_datasink import _BlockFileDatasink 14 | 15 | _logger: logging.Logger = logging.getLogger(__name__) 16 | 17 | 18 | class _PandasTextDatasink(_BlockFileDatasink): 19 | """A datasink that writes text files using Pandas IO.""" 20 | 21 | def __init__( 22 | self, 23 | path: str, 24 | file_format: str, 25 | write_text_func: Callable[..., None] | None, 26 | *, 27 | filename_provider: FilenameProvider | None = None, 28 | dataset_uuid: str | None = None, 29 | open_s3_object_args: dict[str, Any] | None = None, 30 | pandas_kwargs: dict[str, Any] | None = None, 31 | **write_args: Any, 32 | ): 33 | super().__init__( 34 | path, 35 | file_format=file_format, 36 | filename_provider=filename_provider, 37 | dataset_uuid=dataset_uuid, 38 | open_s3_object_args=open_s3_object_args, 39 | pandas_kwargs=pandas_kwargs, 40 | **write_args, 41 | ) 42 | 43 | self.write_text_func = write_text_func 44 | 45 | def write_block(self, file: io.TextIOWrapper, block: BlockAccessor) -> None: 46 | """ 47 | Write a block of data to a file. 48 | 49 | Parameters 50 | ---------- 51 | block : BlockAccessor 52 | file : pa.NativeFile 53 | """ 54 | write_text_func = self.write_text_func 55 | 56 | write_text_func(block.to_pandas(), file, **self.pandas_kwargs) # type: ignore[misc] 57 | 58 | 59 | class PandasCSVDatasink(_PandasTextDatasink): 60 | """A datasink that writes CSV files using Pandas IO.""" 61 | 62 | def __init__( 63 | self, 64 | path: str, 65 | *, 66 | filename_provider: FilenameProvider | None = None, 67 | dataset_uuid: str | None = None, 68 | open_s3_object_args: dict[str, Any] | None = None, 69 | pandas_kwargs: dict[str, Any] | None = None, 70 | **write_args: Any, 71 | ): 72 | super().__init__( 73 | path, 74 | "csv", 75 | pd.DataFrame.to_csv, 76 | filename_provider=filename_provider, 77 | dataset_uuid=dataset_uuid, 78 | open_s3_object_args=open_s3_object_args, 79 | pandas_kwargs=pandas_kwargs, 80 | **write_args, 81 | ) 82 | 83 | 84 | class PandasJSONDatasink(_PandasTextDatasink): 85 | """A datasink that writes CSV files using Pandas IO.""" 86 | 87 | def __init__( 88 | self, 89 | path: str, 90 | *, 91 | filename_provider: FilenameProvider | None = None, 92 | dataset_uuid: str | None = None, 93 | open_s3_object_args: dict[str, Any] | None = None, 94 | pandas_kwargs: dict[str, Any] | None = None, 95 | **write_args: Any, 96 | ): 97 | super().__init__( 98 | path, 99 | "json", 100 | pd.DataFrame.to_json, 101 | filename_provider=filename_provider, 102 | dataset_uuid=dataset_uuid, 103 | open_s3_object_args=open_s3_object_args, 104 | pandas_kwargs=pandas_kwargs, 105 | **write_args, 106 | ) 107 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/modin/__init__.py: -------------------------------------------------------------------------------- 1 | """Ray Modin Module.""" 2 | 3 | from awswrangler.distributed.ray.modin._core import modin_repartition 4 | 5 | __all__ = [ 6 | "modin_repartition", 7 | ] 8 | 
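# Editor's note (illustrative sketch, an assumption rather than upstream documentation):
# `modin_repartition` is used as a plain decorator, so a Modin DataFrame passed to the
# wrapped callable is repartitioned along the row axis before the callable runs, as
# implemented in `_core.py` below:
#
#     @modin_repartition
#     def _process_frame(df, *args, **kwargs):  # hypothetical consumer function
#         ...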
-------------------------------------------------------------------------------- /awswrangler/distributed/ray/modin/_core.py: -------------------------------------------------------------------------------- 1 | """Modin on Ray Core module (PRIVATE).""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | from functools import wraps 7 | from typing import Any, Callable, TypeVar 8 | 9 | import numpy as np 10 | import pandas as pd 11 | from modin.distributed.dataframe.pandas import from_partitions, unwrap_partitions 12 | from modin.pandas import DataFrame as ModinDataFrame 13 | 14 | _logger: logging.Logger = logging.getLogger(__name__) 15 | 16 | 17 | def _validate_partition_shape(df: pd.DataFrame) -> bool: 18 | """ 19 | Validate if partitions of the data frame are partitioned along row axis. 20 | 21 | Parameters 22 | ---------- 23 | df : pd.DataFrame 24 | Modin data frame 25 | 26 | Returns 27 | ------- 28 | bool 29 | """ 30 | # Unwrap partitions as they are currently stored (axis=None) 31 | partitions_shape = np.array(unwrap_partitions(df)).shape 32 | return partitions_shape[1] == 1 # type: ignore[no-any-return,unused-ignore] 33 | 34 | 35 | FunctionType = TypeVar("FunctionType", bound=Callable[..., Any]) 36 | 37 | 38 | def modin_repartition(function: FunctionType) -> FunctionType: 39 | """ 40 | Decorate callable to repartition Modin data frame. 41 | 42 | By default, repartition along row (axis=0) axis. 43 | This avoids a situation where columns are split along multiple blocks. 44 | 45 | Parameters 46 | ---------- 47 | function : Callable[..., Any] 48 | Callable as input to ray.remote 49 | 50 | Returns 51 | ------- 52 | Callable[..., Any] 53 | """ 54 | # Access the source function if it exists 55 | function = getattr(function, "_source_func", function) 56 | 57 | @wraps(function) 58 | def wrapper( 59 | df: pd.DataFrame, 60 | *args: Any, 61 | axis: int | None = None, 62 | row_lengths: int | None = None, 63 | validate_partitions: bool = True, 64 | **kwargs: Any, 65 | ) -> Any: 66 | # Validate partitions and repartition Modin data frame along row (axis=0) axis 67 | # to avoid a situation where columns are split along multiple blocks 68 | if isinstance(df, ModinDataFrame): 69 | if validate_partitions and not _validate_partition_shape(df): 70 | _logger.warning( 71 | "Partitions of this data frame are detected to be split along column axis. " 72 | "The DataFrame will be automatically repartitioned along row axis to ensure " 73 | "each partition can be processed independently." 
74 | ) 75 | axis = 0 76 | if axis is not None: 77 | df = from_partitions(unwrap_partitions(df, axis=axis), axis=axis, row_lengths=row_lengths) 78 | return function(df, *args, **kwargs) 79 | 80 | return wrapper # type: ignore[return-value] 81 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/modin/_data_types.py: -------------------------------------------------------------------------------- 1 | """Internal (private) Data Types Module.""" 2 | 3 | from __future__ import annotations 4 | 5 | import modin.pandas as pd 6 | import pyarrow as pa 7 | 8 | from awswrangler._data_types import pyarrow_types_from_pandas 9 | from awswrangler.distributed.ray import ray_get, ray_remote 10 | from awswrangler.distributed.ray.modin._utils import _ray_dataset_from_df 11 | 12 | 13 | def pyarrow_types_from_pandas_distributed( 14 | df: pd.DataFrame, index: bool, ignore_cols: list[str] | None = None, index_left: bool = False 15 | ) -> dict[str, pa.DataType]: 16 | """Extract the related Pyarrow data types from a pandas DataFrame.""" 17 | func = ray_remote()(pyarrow_types_from_pandas) 18 | first_block_object_ref = next(_ray_dataset_from_df(df).iter_internal_ref_bundles()).block_refs[0] 19 | return ray_get( # type: ignore[no-any-return] 20 | func( 21 | df=first_block_object_ref, 22 | index=index, 23 | ignore_cols=ignore_cols, 24 | index_left=index_left, 25 | ) 26 | ) 27 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/modin/s3/__init__.py: -------------------------------------------------------------------------------- 1 | """Ray Modin S3 Module.""" 2 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/modin/s3/_read_orc.py: -------------------------------------------------------------------------------- 1 | """Modin on Ray S3 read text module (PRIVATE).""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | from typing import TYPE_CHECKING, Any 7 | 8 | import modin.pandas as pd 9 | import pyarrow as pa 10 | from ray.data import read_datasource 11 | from ray.data.datasource import FastFileMetadataProvider 12 | 13 | from awswrangler import _data_types 14 | from awswrangler.distributed.ray.datasources import ArrowORCDatasource 15 | from awswrangler.distributed.ray.modin._utils import _to_modin 16 | 17 | if TYPE_CHECKING: 18 | from mypy_boto3_s3 import S3Client 19 | 20 | _logger: logging.Logger = logging.getLogger(__name__) 21 | 22 | 23 | def _read_orc_distributed( 24 | paths: list[str], 25 | path_root: str | None, 26 | schema: pa.schema | None, 27 | columns: list[str] | None, 28 | use_threads: bool | int, 29 | override_num_blocks: int, 30 | version_ids: dict[str, str] | None, 31 | s3_client: "S3Client" | None, 32 | s3_additional_kwargs: dict[str, Any] | None, 33 | arrow_kwargs: dict[str, Any], 34 | ) -> pd.DataFrame: 35 | datasource = ArrowORCDatasource( 36 | paths=paths, 37 | dataset=True, 38 | path_root=path_root, 39 | use_threads=use_threads, 40 | schema=schema, 41 | arrow_orc_args={"columns": columns}, 42 | meta_provider=FastFileMetadataProvider(), 43 | ) 44 | ray_dataset = read_datasource( 45 | datasource, 46 | override_num_blocks=override_num_blocks, 47 | ) 48 | to_pandas_kwargs = _data_types.pyarrow2pandas_defaults( 49 | use_threads=use_threads, 50 | kwargs=arrow_kwargs, 51 | ) 52 | return _to_modin(dataset=ray_dataset, to_pandas_kwargs=to_pandas_kwargs) 53 | 
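# Editor's note (illustrative sketch, an assumption about the public entry point rather
# than something stated in this file): with the Ray/Modin engine registered,
# `_read_orc_distributed` is presumably the dispatch target behind the public S3 ORC
# reader, so a call such as the following would return a Modin DataFrame built from the
# Ray Dataset blocks ("my-bucket"/"my-prefix" are hypothetical):
#
#     import awswrangler as wr
#     df = wr.s3.read_orc(path="s3://my-bucket/my-prefix/", dataset=True)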
-------------------------------------------------------------------------------- /awswrangler/distributed/ray/modin/s3/_read_parquet.py: -------------------------------------------------------------------------------- 1 | """Modin on Ray S3 read parquet module (PRIVATE).""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import TYPE_CHECKING, Any 6 | 7 | import modin.pandas as pd 8 | import pyarrow as pa 9 | from ray.data import read_datasource 10 | from ray.data.datasource import FastFileMetadataProvider 11 | 12 | from awswrangler.distributed.ray.datasources import ArrowParquetBaseDatasource, ArrowParquetDatasource 13 | from awswrangler.distributed.ray.modin._utils import _to_modin 14 | 15 | if TYPE_CHECKING: 16 | from mypy_boto3_s3 import S3Client 17 | 18 | 19 | def _resolve_datasource_parameters(bulk_read: bool, *args: Any, **kwargs: Any) -> dict[str, Any]: 20 | if bulk_read: 21 | return { 22 | "datasource": ArrowParquetBaseDatasource(*args, **kwargs), 23 | "meta_provider": FastFileMetadataProvider(), 24 | } 25 | return { 26 | "datasource": ArrowParquetDatasource(*args, **kwargs), 27 | } 28 | 29 | 30 | def _read_parquet_distributed( 31 | paths: list[str], 32 | path_root: str | None, 33 | schema: pa.schema | None, 34 | columns: list[str] | None, 35 | coerce_int96_timestamp_unit: str | None, 36 | use_threads: bool | int, 37 | override_num_blocks: int, 38 | version_ids: dict[str, str] | None, 39 | s3_client: "S3Client" | None, 40 | s3_additional_kwargs: dict[str, Any] | None, 41 | arrow_kwargs: dict[str, Any], 42 | bulk_read: bool, 43 | decryption_properties: pa.parquet.encryption.DecryptionConfiguration | None = None, 44 | ) -> pd.DataFrame: 45 | dataset_kwargs = {} 46 | if coerce_int96_timestamp_unit: 47 | dataset_kwargs["coerce_int96_timestamp_unit"] = coerce_int96_timestamp_unit 48 | if decryption_properties: 49 | dataset_kwargs["decryption_properties"] = decryption_properties 50 | 51 | dataset = read_datasource( 52 | **_resolve_datasource_parameters( 53 | bulk_read, 54 | paths=paths, 55 | path_root=path_root, 56 | arrow_parquet_args={ 57 | "use_threads": use_threads, 58 | "schema": schema, 59 | "columns": columns, 60 | "dataset_kwargs": dataset_kwargs, 61 | }, 62 | ), 63 | override_num_blocks=override_num_blocks, 64 | ) 65 | return _to_modin( 66 | dataset=dataset, 67 | to_pandas_kwargs=arrow_kwargs, 68 | ignore_index=arrow_kwargs.get("ignore_metadata"), 69 | ) 70 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/modin/s3/_write_orc.py: -------------------------------------------------------------------------------- 1 | """Modin on Ray S3 write parquet module (PRIVATE).""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | import math 7 | from typing import TYPE_CHECKING, Any, cast 8 | 9 | import modin.pandas as pd 10 | import pyarrow as pa 11 | 12 | from awswrangler import exceptions 13 | from awswrangler.distributed.ray.datasources import ArrowORCDatasink 14 | from awswrangler.distributed.ray.modin._utils import _ray_dataset_from_df 15 | from awswrangler.typing import ArrowEncryptionConfiguration 16 | 17 | if TYPE_CHECKING: 18 | from mypy_boto3_s3 import S3Client 19 | 20 | _logger: logging.Logger = logging.getLogger(__name__) 21 | 22 | 23 | def _to_orc_distributed( 24 | df: pd.DataFrame, 25 | schema: pa.Schema, 26 | index: bool, 27 | compression: str | None, 28 | compression_ext: str, 29 | pyarrow_additional_kwargs: dict[str, Any], 30 | cpus: int, 31 | dtype: dict[str, str], 32 | 
s3_client: "S3Client" | None, 33 | s3_additional_kwargs: dict[str, str] | None, 34 | use_threads: bool | int, 35 | path: str | None = None, 36 | path_root: str | None = None, 37 | filename_prefix: str | None = None, 38 | max_rows_by_file: int | None = 0, 39 | bucketing: bool = False, 40 | encryption_configuration: ArrowEncryptionConfiguration | None = None, 41 | ) -> list[str]: 42 | # Create Ray Dataset 43 | ds = _ray_dataset_from_df(df) 44 | 45 | if df.index.name is not None: 46 | raise exceptions.InvalidArgumentCombination("Orc does not serialize index metadata on a default index.") 47 | 48 | # Repartition into a single block if or writing into a single key or if bucketing is enabled 49 | if ds.count() > 0 and (path or bucketing) and not max_rows_by_file: 50 | _logger.warning( 51 | "Repartitioning frame to single partition as a strict path was defined: %s. " 52 | "This operation is inefficient for large datasets.", 53 | path, 54 | ) 55 | ds = ds.repartition(1) 56 | 57 | # Repartition by max_rows_by_file 58 | elif max_rows_by_file and (max_rows_by_file > 0): 59 | ds = ds.repartition(math.ceil(ds.count() / max_rows_by_file)) 60 | 61 | if path and not path.endswith("/"): 62 | path = f"{path}/" 63 | 64 | datasink = ArrowORCDatasink( 65 | path=cast(str, path or path_root), 66 | dataset_uuid=filename_prefix, 67 | open_s3_object_args={ 68 | "s3_additional_kwargs": s3_additional_kwargs, 69 | }, 70 | index=index, 71 | dtype=dtype, 72 | compression=compression, 73 | pyarrow_additional_kwargs=pyarrow_additional_kwargs, 74 | schema=schema, 75 | bucket_id=df.name if bucketing else None, 76 | ) 77 | ds.write_datasink(datasink) 78 | return datasink.get_write_paths() 79 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/modin/s3/_write_parquet.py: -------------------------------------------------------------------------------- 1 | """Modin on Ray S3 write parquet module (PRIVATE).""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | import math 7 | from typing import TYPE_CHECKING, Any, cast 8 | 9 | import modin.pandas as pd 10 | import pyarrow as pa 11 | 12 | from awswrangler import exceptions 13 | from awswrangler.distributed.ray.datasources import ArrowParquetDatasink 14 | from awswrangler.distributed.ray.modin._utils import _ray_dataset_from_df 15 | from awswrangler.typing import ArrowEncryptionConfiguration 16 | 17 | if TYPE_CHECKING: 18 | from mypy_boto3_s3 import S3Client 19 | 20 | _logger: logging.Logger = logging.getLogger(__name__) 21 | 22 | 23 | def _to_parquet_distributed( 24 | df: pd.DataFrame, 25 | schema: "pa.Schema", 26 | index: bool, 27 | compression: str | None, 28 | compression_ext: str, 29 | pyarrow_additional_kwargs: dict[str, Any] | None, 30 | cpus: int, 31 | dtype: dict[str, str], 32 | s3_client: "S3Client" | None, 33 | s3_additional_kwargs: dict[str, str] | None, 34 | use_threads: bool | int, 35 | path: str | None = None, 36 | path_root: str | None = None, 37 | filename_prefix: str | None = "", 38 | max_rows_by_file: int | None = 0, 39 | bucketing: bool = False, 40 | encryption_configuration: ArrowEncryptionConfiguration | None = None, 41 | ) -> list[str]: 42 | # Create Ray Dataset 43 | ds = _ray_dataset_from_df(df) 44 | # Repartition into a single block if or writing into a single key or if bucketing is enabled 45 | if ds.count() > 0 and (path or bucketing) and not max_rows_by_file: 46 | _logger.warning( 47 | "Repartitioning frame to single partition as a strict path was defined: %s. 
" 48 | "This operation is inefficient for large datasets.", 49 | path, 50 | ) 51 | 52 | if index and df.index.name: 53 | raise exceptions.InvalidArgumentCombination( 54 | "Cannot write a named index when repartitioning to a single file" 55 | ) 56 | 57 | ds = ds.repartition(1) 58 | # Repartition by max_rows_by_file 59 | elif max_rows_by_file and (max_rows_by_file > 0): 60 | if index: 61 | raise exceptions.InvalidArgumentCombination( 62 | "Cannot write indexed file when `max_rows_by_file` is specified" 63 | ) 64 | ds = ds.repartition(math.ceil(ds.count() / max_rows_by_file)) 65 | 66 | if path and not path.endswith("/"): 67 | path = f"{path}/" 68 | 69 | datasink = ArrowParquetDatasink( 70 | path=cast(str, path or path_root), 71 | dataset_uuid=filename_prefix, 72 | index=index, 73 | dtype=dtype, 74 | compression=compression, 75 | pyarrow_additional_kwargs=pyarrow_additional_kwargs, 76 | schema=schema, 77 | bucket_id=df.name if bucketing else None, 78 | ) 79 | ds.write_datasink(datasink) 80 | return datasink.get_write_paths() 81 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/s3/__init__.py: -------------------------------------------------------------------------------- 1 | """Ray S3 Module.""" 2 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/s3/_list.py: -------------------------------------------------------------------------------- 1 | """Ray S3 List module (PRIVATE).""" 2 | 3 | from __future__ import annotations 4 | 5 | import datetime 6 | import fnmatch 7 | import logging 8 | from typing import TYPE_CHECKING, Any, Iterator 9 | 10 | from pyarrow.fs import FileSelector, FileType, _resolve_filesystem_and_path 11 | 12 | if TYPE_CHECKING: 13 | from mypy_boto3_s3 import S3Client 14 | 15 | _logger: logging.Logger = logging.getLogger(__name__) 16 | 17 | 18 | def _list_objects_s3fs( 19 | bucket: str, 20 | pattern: str, 21 | prefix: str, 22 | s3_client: "S3Client", 23 | delimiter: str | None, 24 | s3_additional_kwargs: dict[str, Any] | None, 25 | suffix: list[str] | None, 26 | ignore_suffix: list[str] | None, 27 | last_modified_begin: datetime.datetime | None, 28 | last_modified_end: datetime.datetime | None, 29 | ignore_empty: bool, 30 | ) -> Iterator[list[str]]: 31 | """Expand the provided S3 directory path to a list of object paths.""" 32 | resolved_filesystem, resolved_path = _resolve_filesystem_and_path(f"s3://{bucket}/{prefix}", None) 33 | paths: list[str] = [] 34 | 35 | path_info = resolved_filesystem.get_file_info(resolved_path) 36 | 37 | if path_info.type in (FileType.File, FileType.Directory): 38 | if path_info.type == FileType.File: 39 | files = [path_info] 40 | base_path = resolved_path 41 | else: 42 | selector = FileSelector(resolved_path, recursive=True) 43 | files = resolved_filesystem.get_file_info(selector) 44 | base_path = selector.base_dir 45 | 46 | for file_ in files: 47 | if not file_.is_file: 48 | continue 49 | if ignore_empty and file_.size == 0: 50 | continue 51 | file_path = file_.path 52 | if not file_path.startswith(base_path): 53 | continue 54 | if (ignore_suffix is not None) and file_path.endswith(tuple(ignore_suffix)): 55 | continue 56 | if (suffix is None) or file_path.endswith(tuple(suffix)): 57 | if last_modified_begin is not None: 58 | if file_.mtime < last_modified_begin: 59 | continue 60 | if last_modified_end is not None: 61 | if file_.mtime > last_modified_end: 62 | continue 63 | paths.append(f"s3://{file_path}") 64 | 65 | if prefix 
!= pattern: 66 | paths = fnmatch.filter(paths, f"s3://{bucket}/{pattern}") 67 | 68 | if paths: 69 | yield paths 70 | paths = [] 71 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/s3/_read_orc.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING 4 | 5 | import pyarrow as pa 6 | from pyarrow.fs import _resolve_filesystem_and_path 7 | 8 | from awswrangler import _utils 9 | from awswrangler.s3._read_orc import _pyarrow_orc_file_wrapper 10 | 11 | if TYPE_CHECKING: 12 | from mypy_boto3_s3 import S3Client 13 | 14 | 15 | @_utils.retry(ex=OSError) 16 | def _read_orc_metadata_file_distributed( 17 | s3_client: "S3Client" | None, 18 | path: str, 19 | s3_additional_kwargs: dict[str, str] | None, 20 | use_threads: bool | int, 21 | version_id: str | None = None, 22 | ) -> pa.schema | None: 23 | resolved_filesystem, resolved_path = _resolve_filesystem_and_path(path) 24 | 25 | with resolved_filesystem.open_input_file(resolved_path) as f: 26 | orc_file = _pyarrow_orc_file_wrapper( 27 | source=f, 28 | ) 29 | 30 | if orc_file: 31 | return orc_file.schema 32 | 33 | return None 34 | -------------------------------------------------------------------------------- /awswrangler/distributed/ray/s3/_read_parquet.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING 4 | 5 | import pyarrow as pa 6 | from pyarrow.fs import _resolve_filesystem_and_path 7 | 8 | from awswrangler import _utils 9 | from awswrangler.s3._read_parquet import _pyarrow_parquet_file_wrapper 10 | 11 | if TYPE_CHECKING: 12 | from mypy_boto3_s3 import S3Client 13 | 14 | 15 | @_utils.retry(ex=OSError) 16 | def _read_parquet_metadata_file_distributed( 17 | s3_client: "S3Client" | None, 18 | path: str, 19 | s3_additional_kwargs: dict[str, str] | None, 20 | use_threads: bool | int, 21 | version_id: str | None = None, 22 | coerce_int96_timestamp_unit: str | None = None, 23 | ) -> pa.schema | None: 24 | resolved_filesystem, resolved_path = _resolve_filesystem_and_path(path) 25 | 26 | with resolved_filesystem.open_input_file(resolved_path) as f: 27 | pq_file = _pyarrow_parquet_file_wrapper( 28 | source=f, 29 | coerce_int96_timestamp_unit=coerce_int96_timestamp_unit, 30 | ) 31 | 32 | if pq_file: 33 | return pq_file.schema.to_arrow_schema() 34 | 35 | return None 36 | -------------------------------------------------------------------------------- /awswrangler/dynamodb/__init__.py: -------------------------------------------------------------------------------- 1 | """Amazon DynamoDB Module.""" 2 | 3 | from awswrangler.dynamodb._delete import delete_items 4 | from awswrangler.dynamodb._read import read_items, read_partiql_query 5 | from awswrangler.dynamodb._utils import execute_statement, get_table 6 | from awswrangler.dynamodb._write import put_csv, put_df, put_items, put_json 7 | 8 | __all__ = [ 9 | "delete_items", 10 | "execute_statement", 11 | "get_table", 12 | "put_csv", 13 | "put_df", 14 | "put_items", 15 | "put_json", 16 | "read_partiql_query", 17 | "read_items", 18 | ] 19 | -------------------------------------------------------------------------------- /awswrangler/dynamodb/_delete.py: -------------------------------------------------------------------------------- 1 | """Amazon DynamoDB Delete Module (PRIVATE).""" 2 | 3 | from __future__ import annotations 4 | 5 | 
import logging 6 | from typing import Any 7 | 8 | import boto3 9 | from boto3.dynamodb.types import TypeSerializer 10 | 11 | from awswrangler import _utils 12 | from awswrangler._config import apply_configs 13 | 14 | from ._utils import _TableBatchWriter, _validate_items 15 | 16 | _logger: logging.Logger = logging.getLogger(__name__) 17 | 18 | 19 | @apply_configs 20 | def delete_items( 21 | items: list[dict[str, Any]], 22 | table_name: str, 23 | boto3_session: boto3.Session | None = None, 24 | ) -> None: 25 | """Delete all items in the specified DynamoDB table. 26 | 27 | Parameters 28 | ---------- 29 | items 30 | List which contains the items that will be deleted. 31 | table_name 32 | Name of the Amazon DynamoDB table. 33 | boto3_session 34 | The default boto3 session will be used if **boto3_session** is ``None``. 35 | 36 | Examples 37 | -------- 38 | Writing rows of DataFrame 39 | 40 | >>> import awswrangler as wr 41 | >>> wr.dynamodb.delete_items( 42 | ... items=[{'key': 1}, {'key': 2, 'value': 'Hello'}], 43 | ... table_name='table' 44 | ... ) 45 | """ 46 | _logger.debug("Deleting items from DynamoDB table %s", table_name) 47 | 48 | dynamodb_client = _utils.client(service_name="dynamodb", session=boto3_session) 49 | serializer = TypeSerializer() 50 | 51 | key_schema = dynamodb_client.describe_table(TableName=table_name)["Table"]["KeySchema"] 52 | _validate_items(items=items, key_schema=key_schema) 53 | 54 | table_keys = [schema["AttributeName"] for schema in key_schema] 55 | 56 | with _TableBatchWriter(table_name, dynamodb_client) as writer: 57 | for item in items: 58 | writer.delete_item( 59 | key={key: serializer.serialize(item[key]) for key in table_keys}, 60 | ) 61 | -------------------------------------------------------------------------------- /awswrangler/neptune/__init__.py: -------------------------------------------------------------------------------- 1 | """Utilities Module for Amazon Neptune.""" 2 | 3 | from awswrangler.neptune._client import BulkLoadParserConfiguration 4 | from awswrangler.neptune._gremlin_parser import GremlinParser 5 | from awswrangler.neptune._neptune import ( 6 | bulk_load, 7 | bulk_load_from_files, 8 | connect, 9 | execute_gremlin, 10 | execute_opencypher, 11 | execute_sparql, 12 | flatten_nested_df, 13 | to_property_graph, 14 | to_rdf_graph, 15 | ) 16 | 17 | __all__ = [ 18 | "execute_gremlin", 19 | "execute_opencypher", 20 | "execute_sparql", 21 | "to_property_graph", 22 | "to_rdf_graph", 23 | "connect", 24 | "bulk_load", 25 | "bulk_load_from_files", 26 | "GremlinParser", 27 | "flatten_nested_df", 28 | "BulkLoadParserConfiguration", 29 | ] 30 | -------------------------------------------------------------------------------- /awswrangler/neptune/_gremlin_init.py: -------------------------------------------------------------------------------- 1 | """Gremlin Init Module.""" 2 | # Required because `gremlin_python` does not initialize its modules in __init__.py 3 | 4 | from awswrangler._utils import import_optional_dependency 5 | 6 | if import_optional_dependency("gremlin_python"): 7 | from gremlin_python.driver.client import Client 8 | from gremlin_python.process.anonymous_traversal import traversal 9 | from gremlin_python.process.graph_traversal import GraphTraversalSource, __ 10 | from gremlin_python.process.translator import Translator 11 | from gremlin_python.process.traversal import Cardinality, T 12 | from gremlin_python.structure.graph import Edge, Graph, Path, Property, Vertex, VertexProperty 13 | 14 | __all__ = [ 15 | "__", 16 | "Cardinality", 
17 | "Client", 18 | "Edge", 19 | "Graph", 20 | "GraphTraversalSource", 21 | "Path", 22 | "Property", 23 | "T", 24 | "Translator", 25 | "traversal", 26 | "Vertex", 27 | "VertexProperty", 28 | ] 29 | -------------------------------------------------------------------------------- /awswrangler/neptune/_gremlin_parser.py: -------------------------------------------------------------------------------- 1 | # mypy: disable-error-code=name-defined 2 | """Amazon Neptune GremlinParser Module (PRIVATE).""" 3 | 4 | from __future__ import annotations 5 | 6 | from typing import Any 7 | 8 | import awswrangler.neptune._gremlin_init as gremlin 9 | 10 | 11 | class GremlinParser: 12 | """Class representing a parser for returning Gremlin results as a dictionary.""" 13 | 14 | @staticmethod 15 | def gremlin_results_to_dict(result: Any) -> list[dict[str, Any]]: 16 | """Take a Gremlin ResultSet and return a dictionary. 17 | 18 | Parameters 19 | ---------- 20 | result : Any 21 | The Gremlin result set to convert 22 | 23 | Returns 24 | ------- 25 | List[Dict[str, Any]] 26 | A list of dictionary results 27 | """ 28 | res = [] 29 | 30 | # For lists or paths unwind them 31 | if isinstance(result, (list, gremlin.Path)): 32 | for x in result: 33 | res.append(GremlinParser._parse_dict(x)) 34 | 35 | # For dictionaries just add them 36 | elif isinstance(result, dict): 37 | res.append(result) 38 | 39 | # For everything else parse them 40 | else: 41 | res.append(GremlinParser._parse_dict(result)) 42 | return res 43 | 44 | @staticmethod 45 | def _parse_dict(data: Any) -> Any: 46 | d: dict[str, Any] = {} 47 | 48 | # If this is a list or Path then unwind it 49 | if isinstance(data, (list, gremlin.Path)): 50 | res = [] 51 | for x in data: 52 | res.append(GremlinParser._parse_dict(x)) 53 | return res 54 | 55 | # If this is an element then make it a dictionary 56 | if isinstance( 57 | data, 58 | ( 59 | gremlin.Vertex, 60 | gremlin.Edge, 61 | gremlin.VertexProperty, 62 | gremlin.Property, 63 | ), 64 | ): 65 | data = data.__dict__ 66 | 67 | # If this is a scalar then create a Map with it 68 | elif not hasattr(data, "__len__") or isinstance(data, str): 69 | data = {0: data} 70 | 71 | for k, v in data.items(): 72 | # If the key is a Vertex or an Edge do special processing 73 | if isinstance(k, (gremlin.Vertex, gremlin.Edge)): 74 | k = k.id # noqa: PLW2901 75 | 76 | # If the value is a list do special processing to make it a scalar if the list is of length 1 77 | if isinstance(v, list) and len(v) == 1: 78 | d[k] = v[0] 79 | else: 80 | d[k] = v 81 | 82 | # If the value is a Vertex or Edge do special processing 83 | if isinstance( 84 | data, 85 | ( 86 | gremlin.Vertex, 87 | gremlin.Edge, 88 | gremlin.VertexProperty, 89 | gremlin.Property, 90 | ), 91 | ): 92 | d[k] = d[k].__dict__ 93 | return d 94 | -------------------------------------------------------------------------------- /awswrangler/opensearch/__init__.py: -------------------------------------------------------------------------------- 1 | """Utilities Module for Amazon OpenSearch.""" 2 | 3 | from awswrangler.opensearch._read import search, search_by_sql 4 | from awswrangler.opensearch._utils import connect, create_collection 5 | from awswrangler.opensearch._write import create_index, delete_index, index_csv, index_df, index_documents, index_json 6 | 7 | __all__ = [ 8 | "connect", 9 | "create_collection", 10 | "create_index", 11 | "delete_index", 12 | "index_csv", 13 | "index_documents", 14 | "index_df", 15 | "index_json", 16 | "search", 17 | "search_by_sql", 18 | ] 19 | 
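Editor's note — a minimal, hedged usage sketch for the OpenSearch helpers exported above. The endpoint, index name, and DataFrame contents are hypothetical placeholders, and the parameter names (host, df, index, search_body) are assumed from awswrangler's public documentation rather than taken from this listing:
>>> import awswrangler as wr  # sketch only; the values below are made-up examples
>>> import pandas as pd
>>> client = wr.opensearch.connect(host="my-domain.us-east-1.es.amazonaws.com")  # hypothetical endpoint
>>> wr.opensearch.index_df(client, df=pd.DataFrame({"title": ["Titanic"]}), index="movies")  # hypothetical index
>>> wr.opensearch.search(client, index="movies", search_body={"query": {"match": {"title": "titanic"}}})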
-------------------------------------------------------------------------------- /awswrangler/pandas/__init__.py: -------------------------------------------------------------------------------- 1 | """Pandas "proxy" package.""" 2 | 3 | import logging 4 | from typing import TYPE_CHECKING 5 | 6 | from awswrangler._distributed import MemoryFormatEnum, memory_format 7 | 8 | if TYPE_CHECKING or memory_format.get() == MemoryFormatEnum.PANDAS: 9 | from pandas import * # noqa: F403 10 | 11 | # Explicit import because mypy doesn't support forward references to a star import 12 | from pandas import ( 13 | DataFrame, 14 | Series, 15 | concat, 16 | isna, 17 | isnull, 18 | json_normalize, 19 | notna, 20 | read_csv, 21 | read_excel, 22 | to_datetime, 23 | ) 24 | elif memory_format.get() == MemoryFormatEnum.MODIN: 25 | from modin.pandas import * # noqa: F403 26 | 27 | # Explicit import because mypy doesn't support forward references to a star import 28 | from modin.pandas import ( 29 | DataFrame, 30 | Series, 31 | concat, 32 | isna, 33 | isnull, 34 | json_normalize, 35 | notna, 36 | read_csv, 37 | read_excel, 38 | to_datetime, 39 | ) 40 | else: 41 | raise ImportError(f"Unknown memory format {memory_format}") 42 | 43 | _logger: logging.Logger = logging.getLogger(__name__) 44 | 45 | __all__ = [ 46 | "DataFrame", 47 | "Series", 48 | "concat", 49 | "isna", 50 | "isnull", 51 | "json_normalize", 52 | "notna", 53 | "read_csv", 54 | "read_excel", 55 | "to_datetime", 56 | ] 57 | -------------------------------------------------------------------------------- /awswrangler/py.typed: -------------------------------------------------------------------------------- 1 | # Marker file for PEP 561. 2 | -------------------------------------------------------------------------------- /awswrangler/quicksight/__init__.py: -------------------------------------------------------------------------------- 1 | """Amazon QuickSight Module.""" 2 | 3 | from awswrangler.quicksight._cancel import cancel_ingestion 4 | from awswrangler.quicksight._create import create_athena_data_source, create_athena_dataset, create_ingestion 5 | from awswrangler.quicksight._delete import ( 6 | delete_all_dashboards, 7 | delete_all_data_sources, 8 | delete_all_datasets, 9 | delete_all_templates, 10 | delete_dashboard, 11 | delete_data_source, 12 | delete_dataset, 13 | delete_template, 14 | ) 15 | from awswrangler.quicksight._describe import ( 16 | describe_dashboard, 17 | describe_data_source, 18 | describe_data_source_permissions, 19 | describe_dataset, 20 | describe_ingestion, 21 | ) 22 | from awswrangler.quicksight._get_list import ( 23 | get_dashboard_id, 24 | get_dashboard_ids, 25 | get_data_source_arn, 26 | get_data_source_arns, 27 | get_data_source_id, 28 | get_data_source_ids, 29 | get_dataset_id, 30 | get_dataset_ids, 31 | get_template_id, 32 | get_template_ids, 33 | list_dashboards, 34 | list_data_sources, 35 | list_datasets, 36 | list_group_memberships, 37 | list_groups, 38 | list_iam_policy_assignments, 39 | list_iam_policy_assignments_for_user, 40 | list_ingestions, 41 | list_templates, 42 | list_user_groups, 43 | list_users, 44 | ) 45 | 46 | __all__ = [ 47 | "get_dashboard_id", 48 | "get_dashboard_ids", 49 | "get_data_source_arn", 50 | "get_data_source_arns", 51 | "get_data_source_id", 52 | "get_data_source_ids", 53 | "get_dataset_id", 54 | "get_dataset_ids", 55 | "get_template_id", 56 | "get_template_ids", 57 | "list_dashboards", 58 | "list_data_sources", 59 | "list_datasets", 60 | "list_group_memberships", 61 | "list_groups", 62 | 
"list_iam_policy_assignments", 63 | "list_iam_policy_assignments_for_user", 64 | "list_ingestions", 65 | "list_templates", 66 | "list_user_groups", 67 | "list_users", 68 | "describe_dashboard", 69 | "describe_data_source", 70 | "describe_data_source_permissions", 71 | "describe_dataset", 72 | "describe_ingestion", 73 | "delete_all_dashboards", 74 | "delete_all_data_sources", 75 | "delete_all_datasets", 76 | "delete_all_templates", 77 | "delete_dashboard", 78 | "delete_data_source", 79 | "delete_dataset", 80 | "delete_template", 81 | "cancel_ingestion", 82 | "create_athena_data_source", 83 | "create_athena_dataset", 84 | "create_ingestion", 85 | ] 86 | -------------------------------------------------------------------------------- /awswrangler/quicksight/_cancel.py: -------------------------------------------------------------------------------- 1 | """Amazon QuickSight Cancel Module.""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | from typing import cast 7 | 8 | import boto3 9 | 10 | from awswrangler import _utils, exceptions, sts 11 | from awswrangler.quicksight._get_list import get_dataset_id 12 | 13 | _logger: logging.Logger = logging.getLogger(__name__) 14 | 15 | 16 | def cancel_ingestion( 17 | ingestion_id: str, 18 | dataset_name: str | None = None, 19 | dataset_id: str | None = None, 20 | account_id: str | None = None, 21 | boto3_session: boto3.Session | None = None, 22 | ) -> None: 23 | """Cancel an ongoing ingestion of data into SPICE. 24 | 25 | Note 26 | ---- 27 | You must pass a not None value for ``dataset_name`` or ``dataset_id`` argument. 28 | 29 | Parameters 30 | ---------- 31 | ingestion_id 32 | Ingestion ID. 33 | dataset_name 34 | Dataset name. 35 | dataset_id 36 | Dataset ID. 37 | account_id 38 | If None, the account ID will be inferred from your boto3 session. 39 | boto3_session 40 | The default boto3 session will be used if **boto3_session** is ``None``. 
41 | 42 | Examples 43 | -------- 44 | >>> import awswrangler as wr 45 | >>> wr.quicksight.cancel_ingestion(ingestion_id="...", dataset_name="...") 46 | """ 47 | if (dataset_name is None) and (dataset_id is None): 48 | raise exceptions.InvalidArgument("You must pass a not None name or dataset_id argument.") 49 | if account_id is None: 50 | account_id = sts.get_account_id(boto3_session=boto3_session) 51 | if (dataset_id is None) and (dataset_name is not None): 52 | dataset_id = get_dataset_id(name=dataset_name, account_id=account_id, boto3_session=boto3_session) 53 | client = _utils.client(service_name="quicksight", session=boto3_session) 54 | dataset_id = cast(str, dataset_id) 55 | client.cancel_ingestion(IngestionId=ingestion_id, AwsAccountId=account_id, DataSetId=dataset_id) 56 | -------------------------------------------------------------------------------- /awswrangler/quicksight/_utils.py: -------------------------------------------------------------------------------- 1 | """Internal (private) Amazon QuickSight Utilities Module.""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | from typing import Any, TypedDict 7 | 8 | import boto3 9 | from typing_extensions import NotRequired 10 | 11 | from awswrangler import _data_types, athena, catalog, exceptions 12 | from awswrangler.quicksight._get_list import list_data_sources 13 | 14 | _logger: logging.Logger = logging.getLogger(__name__) 15 | 16 | 17 | class _QuicksightPrincipalList(TypedDict): 18 | users: NotRequired[list[str]] 19 | groups: NotRequired[list[str]] 20 | 21 | 22 | def extract_athena_table_columns( 23 | database: str, table: str, boto3_session: boto3.Session | None 24 | ) -> list[dict[str, str]]: 25 | """Extract athena columns data types from table and raising an exception if not exist.""" 26 | dtypes: dict[str, str] | None = catalog.get_table_types(database=database, table=table, boto3_session=boto3_session) 27 | if dtypes is None: 28 | raise exceptions.InvalidArgument(f"{database}.{table} does not exist on Athena.") 29 | return [{"Name": name, "Type": _data_types.athena2quicksight(dtype=dtype)} for name, dtype in dtypes.items()] 30 | 31 | 32 | def extract_athena_query_columns( 33 | sql: str, data_source_arn: str, account_id: str, boto3_session: boto3.Session | None 34 | ) -> list[dict[str, str]]: 35 | """Extract athena columns data types from a SQL query.""" 36 | data_sources: list[dict[str, Any]] = list_data_sources(account_id=account_id, boto3_session=boto3_session) 37 | data_source: dict[str, Any] = [x for x in data_sources if x["Arn"] == data_source_arn][0] 38 | workgroup: str = data_source["DataSourceParameters"]["AthenaParameters"]["WorkGroup"] 39 | sql_wrapped: str = f"/* QuickSight */\nSELECT ds.* FROM ( {sql} ) ds LIMIT 0" 40 | query_id = athena.start_query_execution(sql=sql_wrapped, workgroup=workgroup, boto3_session=boto3_session) 41 | athena.wait_query(query_execution_id=query_id, boto3_session=boto3_session) 42 | dtypes: dict[str, str] = athena.get_query_columns_types(query_execution_id=query_id, boto3_session=boto3_session) 43 | return [{"Name": name, "Type": _data_types.athena2quicksight(dtype=dtype)} for name, dtype in dtypes.items()] 44 | -------------------------------------------------------------------------------- /awswrangler/redshift/__init__.py: -------------------------------------------------------------------------------- 1 | """Amazon Redshift Module.""" 2 | 3 | from awswrangler.redshift._connect import connect, connect_temp 4 | from awswrangler.redshift._read import 
read_sql_query, read_sql_table, unload, unload_to_files 5 | from awswrangler.redshift._write import copy, copy_from_files, to_sql 6 | 7 | __all__ = [ 8 | "connect", 9 | "connect_temp", 10 | "copy", 11 | "copy_from_files", 12 | "read_sql_query", 13 | "read_sql_table", 14 | "to_sql", 15 | "unload", 16 | "unload_to_files", 17 | ] 18 | -------------------------------------------------------------------------------- /awswrangler/s3/__init__.py: -------------------------------------------------------------------------------- 1 | """Amazon S3 Read Module.""" 2 | 3 | from awswrangler.s3._copy import copy_objects, merge_datasets 4 | from awswrangler.s3._delete import delete_objects 5 | from awswrangler.s3._describe import describe_objects, get_bucket_region, size_objects 6 | from awswrangler.s3._download import download 7 | from awswrangler.s3._list import does_object_exist, list_buckets, list_directories, list_objects 8 | from awswrangler.s3._read_deltalake import read_deltalake 9 | from awswrangler.s3._read_excel import read_excel 10 | from awswrangler.s3._read_orc import read_orc, read_orc_metadata, read_orc_table 11 | from awswrangler.s3._read_parquet import read_parquet, read_parquet_metadata, read_parquet_table 12 | from awswrangler.s3._read_text import read_csv, read_fwf, read_json 13 | from awswrangler.s3._select import select_query 14 | from awswrangler.s3._upload import upload 15 | from awswrangler.s3._wait import wait_objects_exist, wait_objects_not_exist 16 | from awswrangler.s3._write_deltalake import to_deltalake 17 | from awswrangler.s3._write_excel import to_excel 18 | from awswrangler.s3._write_orc import to_orc 19 | from awswrangler.s3._write_parquet import store_parquet_metadata, to_parquet 20 | from awswrangler.s3._write_text import to_csv, to_json 21 | 22 | __all__ = [ 23 | "copy_objects", 24 | "merge_datasets", 25 | "delete_objects", 26 | "describe_objects", 27 | "get_bucket_region", 28 | "size_objects", 29 | "does_object_exist", 30 | "list_buckets", 31 | "list_directories", 32 | "list_objects", 33 | "read_deltalake", 34 | "read_parquet", 35 | "read_parquet_metadata", 36 | "read_parquet_table", 37 | "read_orc", 38 | "read_orc_metadata", 39 | "read_orc_table", 40 | "read_csv", 41 | "read_fwf", 42 | "read_json", 43 | "wait_objects_exist", 44 | "wait_objects_not_exist", 45 | "select_query", 46 | "store_parquet_metadata", 47 | "to_parquet", 48 | "to_orc", 49 | "to_csv", 50 | "to_json", 51 | "to_deltalake", 52 | "to_excel", 53 | "read_excel", 54 | "download", 55 | "upload", 56 | ] 57 | -------------------------------------------------------------------------------- /awswrangler/s3/_download.py: -------------------------------------------------------------------------------- 1 | """Amazon S3 Download Module (PRIVATE).""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | from typing import Any, cast 7 | 8 | import boto3 9 | 10 | from awswrangler.s3._fs import open_s3_object 11 | 12 | _logger: logging.Logger = logging.getLogger(__name__) 13 | 14 | 15 | def download( 16 | path: str, 17 | local_file: str | Any, 18 | version_id: str | None = None, 19 | use_threads: bool | int = True, 20 | boto3_session: boto3.Session | None = None, 21 | s3_additional_kwargs: dict[str, Any] | None = None, 22 | ) -> None: 23 | """Download file from a received S3 path to local file. 24 | 25 | Note 26 | ---- 27 | In case of `use_threads=True` the number of threads 28 | that will be spawned will be gotten from os.cpu_count(). 
29 | 30 | Parameters 31 | ---------- 32 | path 33 | S3 path (e.g. ``s3://bucket/key0``). 34 | local_file 35 | A file-like object in binary mode or a path to local file (e.g. ``./local/path/to/key0``). 36 | version_id 37 | Version id of the object. 38 | use_threads 39 | True to enable concurrent requests, False to disable multiple threads. 40 | If enabled os.cpu_count() will be used as the max number of threads. 41 | If integer is provided, specified number is used. 42 | boto3_session 43 | Boto3 Session. The default boto3 session will be used if boto3_session receive None. 44 | s3_additional_kwargs 45 | Forward to botocore requests, only "SSECustomerAlgorithm", "SSECustomerKey" and "RequestPayer" 46 | arguments will be considered. 47 | 48 | Returns 49 | ------- 50 | None 51 | 52 | Examples 53 | -------- 54 | Downloading a file using a path to local file 55 | 56 | >>> import awswrangler as wr 57 | >>> wr.s3.download(path='s3://bucket/key', local_file='./key') 58 | 59 | Downloading a file using a file-like object 60 | 61 | >>> import awswrangler as wr 62 | >>> with open(file='./key', mode='wb') as local_f: 63 | >>> wr.s3.download(path='s3://bucket/key', local_file=local_f) 64 | 65 | """ 66 | _logger.debug("path: %s", path) 67 | with open_s3_object( 68 | path=path, 69 | mode="rb", 70 | use_threads=use_threads, 71 | version_id=version_id, 72 | s3_block_size=-1, # One shot download 73 | s3_additional_kwargs=s3_additional_kwargs, 74 | boto3_session=boto3_session, 75 | ) as s3_f: 76 | if isinstance(local_file, str): 77 | _logger.debug("Downloading local_file: %s", local_file) 78 | with open(file=local_file, mode="wb") as local_f: 79 | local_f.write(cast(bytes, s3_f.read())) 80 | else: 81 | _logger.debug("Downloading file-like object.") 82 | local_file.write(s3_f.read()) 83 | -------------------------------------------------------------------------------- /awswrangler/s3/_read_excel.py: -------------------------------------------------------------------------------- 1 | """Amazon S3 Excel Read Module (PRIVATE).""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | from typing import Any 7 | 8 | import boto3 9 | 10 | import awswrangler.pandas as pd 11 | from awswrangler import exceptions 12 | from awswrangler.s3._fs import open_s3_object 13 | 14 | _logger: logging.Logger = logging.getLogger(__name__) 15 | 16 | 17 | def read_excel( 18 | path: str, 19 | version_id: str | None = None, 20 | use_threads: bool | int = True, 21 | boto3_session: boto3.Session | None = None, 22 | s3_additional_kwargs: dict[str, Any] | None = None, 23 | **pandas_kwargs: Any, 24 | ) -> pd.DataFrame: 25 | """Read EXCEL file(s) from a received S3 path. 26 | 27 | Note 28 | ---- 29 | This function accepts any Pandas's read_excel() argument. 30 | https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_excel.html 31 | 32 | Note 33 | ---- 34 | Depending on the file extension ('xlsx', 'xls', 'odf'...), an additional library 35 | might have to be installed first. 36 | 37 | Note 38 | ---- 39 | In case of `use_threads=True` the number of threads 40 | that will be spawned will be gotten from os.cpu_count(). 41 | 42 | Parameters 43 | ---------- 44 | path 45 | S3 path (e.g. ``s3://bucket/key.xlsx``). 46 | version_id 47 | Version id of the object. 48 | use_threads 49 | True to enable concurrent requests, False to disable multiple threads. 50 | If enabled os.cpu_count() will be used as the max number of threads. 51 | If given an int will use the given amount of threads. 
52 | If integer is provided, specified number is used. 53 | boto3_session 54 | Boto3 Session. The default boto3 session will be used if boto3_session receive None. 55 | s3_additional_kwargs 56 | Forward to botocore requests, only "SSECustomerAlgorithm" and "SSECustomerKey" arguments will be considered. 57 | pandas_kwargs: 58 | KEYWORD arguments forwarded to pandas.read_excel(). You can NOT pass `pandas_kwargs` explicit, just add valid 59 | Pandas arguments in the function call and awswrangler will accept it. 60 | e.g. wr.s3.read_excel("s3://bucket/key.xlsx", na_rep="", verbose=True) 61 | https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_excel.html 62 | 63 | Returns 64 | ------- 65 | Pandas DataFrame. 66 | 67 | Examples 68 | -------- 69 | Reading an EXCEL file 70 | 71 | >>> import awswrangler as wr 72 | >>> df = wr.s3.read_excel('s3://bucket/key.xlsx') 73 | 74 | """ 75 | if "pandas_kwargs" in pandas_kwargs: 76 | raise exceptions.InvalidArgument( 77 | "You can NOT pass `pandas_kwargs` explicit, just add valid " 78 | "Pandas arguments in the function call and awswrangler will accept it." 79 | "e.g. wr.s3.read_excel('s3://bucket/key.xlsx', na_rep='', verbose=True)" 80 | ) 81 | with open_s3_object( 82 | path=path, 83 | mode="rb", 84 | version_id=version_id, 85 | use_threads=use_threads, 86 | s3_block_size=-1, # One shot download 87 | s3_additional_kwargs=s3_additional_kwargs, 88 | boto3_session=boto3_session, 89 | ) as f: 90 | _logger.debug("pandas_kwargs: %s", pandas_kwargs) 91 | return pd.read_excel(f, **pandas_kwargs) 92 | -------------------------------------------------------------------------------- /awswrangler/s3/_upload.py: -------------------------------------------------------------------------------- 1 | """Amazon S3 Upload Module (PRIVATE).""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | from typing import Any 7 | 8 | import boto3 9 | 10 | from awswrangler.s3._fs import open_s3_object 11 | 12 | _logger: logging.Logger = logging.getLogger(__name__) 13 | 14 | 15 | def upload( 16 | local_file: str | Any, 17 | path: str, 18 | use_threads: bool | int = True, 19 | boto3_session: boto3.Session | None = None, 20 | s3_additional_kwargs: dict[str, Any] | None = None, 21 | ) -> None: 22 | """Upload file from a local file to received S3 path. 23 | 24 | Note 25 | ---- 26 | In case of `use_threads=True` the number of threads 27 | that will be spawned will be gotten from os.cpu_count(). 28 | 29 | Parameters 30 | ---------- 31 | local_file 32 | A file-like object in binary mode or a path to local file (e.g. ``./local/path/to/key0``). 33 | path 34 | S3 path (e.g. ``s3://bucket/key0``). 35 | use_threads 36 | True to enable concurrent requests, False to disable multiple threads. 37 | If enabled os.cpu_count() will be used as the max number of threads. 38 | If integer is provided, specified number is used. 39 | boto3_session 40 | The default boto3 session will be used if boto3_session receive None. 41 | s3_additional_kwargs 42 | Forward to botocore requests, only "SSECustomerAlgorithm" and "SSECustomerKey" arguments will be considered. 
43 | 44 | Returns 45 | ------- 46 | None 47 | 48 | Examples 49 | -------- 50 | Uploading a file using a path to local file 51 | 52 | >>> import awswrangler as wr 53 | >>> wr.s3.upload(local_file='./key', path='s3://bucket/key') 54 | 55 | Uploading a file using a file-like object 56 | 57 | >>> import awswrangler as wr 58 | >>> with open(file='./key', mode='wb') as local_f: 59 | >>> wr.s3.upload(local_file=local_f, path='s3://bucket/key') 60 | 61 | """ 62 | _logger.debug("path: %s", path) 63 | with open_s3_object( 64 | path=path, 65 | mode="wb", 66 | use_threads=use_threads, 67 | s3_block_size=-1, # One shot download 68 | s3_additional_kwargs=s3_additional_kwargs, 69 | boto3_session=boto3_session, 70 | ) as s3_f: 71 | if isinstance(local_file, str): 72 | _logger.debug("Uploading local_file: %s", local_file) 73 | with open(file=local_file, mode="rb") as local_f: 74 | s3_f.write(local_f.read()) # type: ignore[arg-type] 75 | else: 76 | _logger.debug("Uploading file-like object.") 77 | s3_f.write(local_file.read()) 78 | -------------------------------------------------------------------------------- /awswrangler/s3/_write_concurrent.py: -------------------------------------------------------------------------------- 1 | """Amazon S3 Concurrent Write Module (PRIVATE).""" 2 | 3 | from __future__ import annotations 4 | 5 | import concurrent.futures 6 | import logging 7 | from typing import Any, Callable 8 | 9 | import pandas as pd 10 | 11 | from awswrangler import _utils 12 | 13 | _logger: logging.Logger = logging.getLogger(__name__) 14 | 15 | 16 | class _WriteProxy: 17 | def __init__(self, use_threads: bool | int): 18 | self._exec: concurrent.futures.ThreadPoolExecutor | None 19 | self._results: list[str] = [] 20 | self._cpus: int = _utils.ensure_cpu_count(use_threads=use_threads) 21 | if self._cpus > 1: 22 | self._exec = concurrent.futures.ThreadPoolExecutor(max_workers=self._cpus) 23 | self._futures: list[Any] = [] 24 | else: 25 | self._exec = None 26 | 27 | @staticmethod 28 | def _caller(func: Callable[..., pd.DataFrame], *args: Any, func_kwargs: dict[str, Any]) -> pd.DataFrame: 29 | _logger.debug("Calling: %s", func) 30 | return func(*args, **func_kwargs) 31 | 32 | def write(self, func: Callable[..., list[str]], *args: Any, **func_kwargs: Any) -> None: 33 | """Write File.""" 34 | if self._exec is not None: 35 | _utils.block_waiting_available_thread(seq=self._futures, max_workers=self._cpus) 36 | _logger.debug("Submitting: %s", func) 37 | future = self._exec.submit( 38 | _WriteProxy._caller, 39 | func, 40 | *args, 41 | func_kwargs=func_kwargs, 42 | ) 43 | self._futures.append(future) 44 | else: 45 | self._results += func(*args, **func_kwargs) 46 | 47 | def close(self) -> list[str]: 48 | """Close the proxy.""" 49 | if self._exec is not None: 50 | for future in concurrent.futures.as_completed(self._futures): 51 | self._results += future.result() 52 | self._exec.shutdown(wait=True) 53 | return self._results 54 | -------------------------------------------------------------------------------- /awswrangler/s3/_write_excel.py: -------------------------------------------------------------------------------- 1 | """Amazon S3 Excel Write Module (PRIVATE).""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | from typing import Any 7 | 8 | import boto3 9 | import pandas as pd 10 | 11 | from awswrangler import exceptions 12 | from awswrangler.s3._fs import open_s3_object 13 | 14 | _logger: logging.Logger = logging.getLogger(__name__) 15 | 16 | 17 | def to_excel( 18 | df: pd.DataFrame, 
19 | path: str, 20 | boto3_session: boto3.Session | None = None, 21 | s3_additional_kwargs: dict[str, Any] | None = None, 22 | use_threads: bool | int = True, 23 | **pandas_kwargs: Any, 24 | ) -> str: 25 | """Write EXCEL file on Amazon S3. 26 | 27 | Note 28 | ---- 29 | This function accepts any Pandas's DataFrame.to_excel() argument. 30 | https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_excel.html 31 | 32 | Note 33 | ---- 34 | Depending on the file extension ('xlsx', 'xls', 'odf'...), an additional library 35 | might have to be installed first. 36 | 37 | Note 38 | ---- 39 | In case of `use_threads=True` the number of threads 40 | that will be spawned will be gotten from os.cpu_count(). 41 | 42 | Parameters 43 | ---------- 44 | df 45 | Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html 46 | path 47 | Amazon S3 path (e.g. s3://bucket/filename.xlsx). 48 | boto3_session 49 | Boto3 Session. The default boto3 Session will be used if boto3_session receive None. 50 | s3_additional_kwargs 51 | Forwarded to botocore requests. 52 | e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'} 53 | use_threads 54 | True to enable concurrent requests, False to disable multiple threads. 55 | If enabled os.cpu_count() will be used as the max number of threads. 56 | If integer is provided, specified number is used. 57 | pandas_kwargs 58 | KEYWORD arguments forwarded to pandas.DataFrame.to_excel(). You can NOT pass `pandas_kwargs` explicit, just add 59 | valid Pandas arguments in the function call and awswrangler will accept it. 60 | e.g. wr.s3.to_excel(df, path, na_rep="", index=False) 61 | https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_excel.html 62 | 63 | Returns 64 | ------- 65 | Written S3 path. 66 | 67 | Examples 68 | -------- 69 | Writing EXCEL file 70 | 71 | >>> import awswrangler as wr 72 | >>> import pandas as pd 73 | >>> wr.s3.to_excel(df, 's3://bucket/filename.xlsx') 74 | 75 | """ 76 | if "pandas_kwargs" in pandas_kwargs: 77 | raise exceptions.InvalidArgument( 78 | "You can NOT pass `pandas_kwargs` explicit, just add valid " 79 | "Pandas arguments in the function call and awswrangler will accept it." 80 | "e.g. wr.s3.to_excel(df, path, na_rep=" 81 | ", index=False)" 82 | ) 83 | with open_s3_object( 84 | path=path, 85 | mode="wb", 86 | use_threads=use_threads, 87 | s3_additional_kwargs=s3_additional_kwargs, 88 | boto3_session=boto3_session, 89 | ) as f: 90 | _logger.debug("pandas_kwargs: %s", pandas_kwargs) 91 | df.to_excel(f, **pandas_kwargs) 92 | return path 93 | -------------------------------------------------------------------------------- /awswrangler/secretsmanager.py: -------------------------------------------------------------------------------- 1 | """Secrets Manager module.""" 2 | 3 | from __future__ import annotations 4 | 5 | import base64 6 | import json 7 | import logging 8 | from typing import Any, Dict, cast 9 | 10 | import boto3 11 | 12 | from awswrangler import _utils 13 | 14 | _logger: logging.Logger = logging.getLogger(__name__) 15 | 16 | 17 | def get_secret(name: str, boto3_session: boto3.Session | None = None) -> str | bytes: 18 | """Get secret value. 19 | 20 | Parameters 21 | ---------- 22 | name 23 | Specifies the secret containing the version that you want to retrieve. 24 | You can specify either the Amazon Resource Name (ARN) or the friendly name of the secret. 
25 | boto3_session 26 | The default boto3 session will be used if **boto3_session** is ``None``. 27 | 28 | Returns 29 | ------- 30 | Secret value. 31 | 32 | Examples 33 | -------- 34 | >>> import awswrangler as wr 35 | >>> value = wr.secretsmanager.get_secret("my-secret") 36 | 37 | """ 38 | client = _utils.client(service_name="secretsmanager", session=boto3_session) 39 | response = client.get_secret_value(SecretId=name) 40 | if "SecretString" in response: 41 | return response["SecretString"] 42 | return base64.b64decode(response["SecretBinary"]) 43 | 44 | 45 | def get_secret_json(name: str, boto3_session: boto3.Session | None = None) -> dict[str, Any]: 46 | """Get JSON secret value. 47 | 48 | Parameters 49 | ---------- 50 | name 51 | Specifies the secret containing the version that you want to retrieve. 52 | You can specify either the Amazon Resource Name (ARN) or the friendly name of the secret. 53 | boto3_session 54 | The default boto3 session will be used if **boto3_session** is ``None``. 55 | 56 | Returns 57 | ------- 58 | Secret JSON value parsed as a dictionary. 59 | 60 | Examples 61 | -------- 62 | >>> import awswrangler as wr 63 | >>> value = wr.secretsmanager.get_secret_json("my-secret-with-json-content") 64 | 65 | """ 66 | value = get_secret(name=name, boto3_session=boto3_session) 67 | return cast(Dict[str, Any], json.loads(value)) 68 | -------------------------------------------------------------------------------- /awswrangler/sts.py: -------------------------------------------------------------------------------- 1 | """STS module.""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | 7 | import boto3 8 | 9 | from awswrangler import _utils 10 | 11 | _logger: logging.Logger = logging.getLogger(__name__) 12 | 13 | 14 | def get_account_id(boto3_session: boto3.Session | None = None) -> str: 15 | """Get Account ID. 16 | 17 | Parameters 18 | ---------- 19 | boto3_session 20 | The default boto3 session will be used if **boto3_session** is ``None``. 21 | 22 | Returns 23 | ------- 24 | Account ID. 25 | 26 | Examples 27 | -------- 28 | >>> import awswrangler as wr 29 | >>> account_id = wr.sts.get_account_id() 30 | 31 | """ 32 | return _utils.client(service_name="sts", session=boto3_session).get_caller_identity()["Account"] 33 | 34 | 35 | def get_current_identity_arn(boto3_session: boto3.Session | None = None) -> str: 36 | """Get current user/role ARN. 37 | 38 | Parameters 39 | ---------- 40 | boto3_session 41 | The default boto3 session will be used if **boto3_session** is ``None``. 42 | 43 | Returns 44 | ------- 45 | User/role ARN. 46 | 47 | Examples 48 | -------- 49 | >>> import awswrangler as wr 50 | >>> arn = wr.sts.get_current_identity_arn() 51 | 52 | """ 53 | return _utils.client(service_name="sts", session=boto3_session).get_caller_identity()["Arn"] 54 | 55 | 56 | def get_current_identity_name(boto3_session: boto3.Session | None = None) -> str: 57 | """Get current user/role name. 58 | 59 | Parameters 60 | ---------- 61 | boto3_session 62 | The default boto3 session will be used if **boto3_session** is ``None``. 63 | 64 | Returns 65 | ------- 66 | User/role name. 
67 | 68 | Examples 69 | -------- 70 | >>> import awswrangler as wr 71 | >>> name = wr.sts.get_current_identity_name() 72 | 73 | """ 74 | arn: str = get_current_identity_arn(boto3_session=boto3_session) 75 | name: str = arn.rpartition("/")[-1] 76 | return name 77 | -------------------------------------------------------------------------------- /awswrangler/timestream/__init__.py: -------------------------------------------------------------------------------- 1 | """Amazon Timestream Module.""" 2 | 3 | from awswrangler.timestream._create import create_database, create_table 4 | from awswrangler.timestream._delete import delete_database, delete_table 5 | from awswrangler.timestream._list import list_databases, list_tables 6 | from awswrangler.timestream._read import query, unload, unload_to_files 7 | from awswrangler.timestream._write import ( 8 | batch_load, 9 | batch_load_from_files, 10 | wait_batch_load_task, 11 | write, 12 | ) 13 | 14 | __all__ = [ 15 | "create_database", 16 | "create_table", 17 | "delete_database", 18 | "delete_table", 19 | "list_databases", 20 | "list_tables", 21 | "query", 22 | "write", 23 | "batch_load", 24 | "batch_load_from_files", 25 | "wait_batch_load_task", 26 | "unload_to_files", 27 | "unload", 28 | ] 29 | -------------------------------------------------------------------------------- /awswrangler/timestream/_delete.py: -------------------------------------------------------------------------------- 1 | """Amazon Timestream Delete Module.""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | 7 | import boto3 8 | 9 | from awswrangler import _utils 10 | 11 | _logger: logging.Logger = logging.getLogger(__name__) 12 | 13 | 14 | def delete_database( 15 | database: str, 16 | boto3_session: boto3.Session | None = None, 17 | ) -> None: 18 | """Delete a given Timestream database. This is an irreversible operation. 19 | 20 | After a database is deleted, the time series data from its tables cannot be recovered. 21 | 22 | All tables in the database must be deleted first, or a ValidationException error will be thrown. 23 | 24 | Due to the nature of distributed retries, 25 | the operation can return either success or a ResourceNotFoundException. 26 | Clients should consider them equivalent. 27 | 28 | Parameters 29 | ---------- 30 | database 31 | Database name. 32 | boto3_session 33 | The default boto3 session will be used if **boto3_session** is ``None``. 34 | 35 | Examples 36 | -------- 37 | Deleting a database 38 | 39 | >>> import awswrangler as wr 40 | >>> arn = wr.timestream.delete_database("MyDatabase") 41 | 42 | """ 43 | _logger.info("Deleting Timestream database %s", database) 44 | client = _utils.client(service_name="timestream-write", session=boto3_session) 45 | client.delete_database(DatabaseName=database) 46 | 47 | 48 | def delete_table( 49 | database: str, 50 | table: str, 51 | boto3_session: boto3.Session | None = None, 52 | ) -> None: 53 | """Delete a given Timestream table. 54 | 55 | This is an irreversible operation. 56 | 57 | After a Timestream database table is deleted, the time series data stored in the table cannot be recovered. 58 | 59 | Due to the nature of distributed retries, 60 | the operation can return either success or a ResourceNotFoundException. 61 | Clients should consider them equivalent. 62 | 63 | Parameters 64 | ---------- 65 | database 66 | Database name. 67 | table 68 | Table name. 69 | boto3_session 70 | The default boto3 session will be used if **boto3_session** is ``None``. 
71 | 72 | Examples 73 | -------- 74 | Deleting a table 75 | 76 | >>> import awswrangler as wr 77 | >>> arn = wr.timestream.delete_table("MyDatabase", "MyTable") 78 | 79 | """ 80 | _logger.info("Deleting Timestream table %s in database %s", table, database) 81 | client = _utils.client(service_name="timestream-write", session=boto3_session) 82 | client.delete_table(DatabaseName=database, TableName=table) 83 | -------------------------------------------------------------------------------- /awswrangler/timestream/_list.py: -------------------------------------------------------------------------------- 1 | """Amazon Timestream List Module.""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | 7 | import boto3 8 | 9 | from awswrangler import _utils 10 | 11 | _logger: logging.Logger = logging.getLogger(__name__) 12 | 13 | 14 | def list_databases( 15 | boto3_session: boto3.Session | None = None, 16 | ) -> list[str]: 17 | """ 18 | List all databases in timestream. 19 | 20 | Parameters 21 | ---------- 22 | boto3_session 23 | The default boto3 session will be used if **boto3_session** is ``None``. 24 | 25 | Returns 26 | ------- 27 | a list of available timestream databases. 28 | 29 | Examples 30 | -------- 31 | Querying the list of all available databases 32 | 33 | >>> import awswrangler as wr 34 | >>> wr.timestream.list_databases() 35 | ["database1", "database2"] 36 | 37 | """ 38 | client = _utils.client(service_name="timestream-write", session=boto3_session) 39 | 40 | response = client.list_databases() 41 | dbs: list[str] = [db["DatabaseName"] for db in response["Databases"]] 42 | while "NextToken" in response: 43 | response = client.list_databases(NextToken=response["NextToken"]) 44 | dbs += [db["DatabaseName"] for db in response["Databases"]] 45 | 46 | return dbs 47 | 48 | 49 | def list_tables(database: str | None = None, boto3_session: boto3.Session | None = None) -> list[str]: 50 | """ 51 | List tables in timestream. 52 | 53 | Parameters 54 | ---------- 55 | database 56 | Database name. If None, all tables in Timestream will be returned. Otherwise, only the tables inside the 57 | given database are returned. 58 | boto3_session 59 | The default boto3 session will be used if **boto3_session** is ``None``. 60 | 61 | Returns 62 | ------- 63 | A list of table names. 
64 | 65 | Examples 66 | -------- 67 | Listing all tables in timestream across databases 68 | 69 | >>> import awswrangler as wr 70 | >>> wr.timestream.list_tables() 71 | ["table1", "table2"] 72 | 73 | Listing all tables in timestream in a specific database 74 | 75 | >>> import awswrangler as wr 76 | >>> wr.timestream.list_tables(DatabaseName="database1") 77 | ["table1"] 78 | 79 | """ 80 | client = _utils.client(service_name="timestream-write", session=boto3_session) 81 | args = {} if database is None else {"DatabaseName": database} 82 | response = client.list_tables(**args) # type: ignore[arg-type] 83 | tables: list[str] = [tbl["TableName"] for tbl in response["Tables"]] 84 | while "NextToken" in response: 85 | response = client.list_tables(**args, NextToken=response["NextToken"]) # type: ignore[arg-type] 86 | tables += [tbl["TableName"] for tbl in response["Tables"]] 87 | 88 | return tables 89 | -------------------------------------------------------------------------------- /awswrangler/timestream/_read.pyi: -------------------------------------------------------------------------------- 1 | from typing import Any, Iterator, Literal, overload 2 | 3 | import boto3 4 | import pandas as pd 5 | 6 | @overload 7 | def query( 8 | sql: str, 9 | chunked: Literal[False] = ..., 10 | pagination_config: dict[str, Any] | None = ..., 11 | boto3_session: boto3.Session | None = ..., 12 | ) -> pd.DataFrame: ... 13 | @overload 14 | def query( 15 | sql: str, 16 | chunked: Literal[True], 17 | pagination_config: dict[str, Any] | None = ..., 18 | boto3_session: boto3.Session | None = ..., 19 | ) -> Iterator[pd.DataFrame]: ... 20 | @overload 21 | def query( 22 | sql: str, 23 | chunked: bool, 24 | pagination_config: dict[str, Any] | None = ..., 25 | boto3_session: boto3.Session | None = ..., 26 | ) -> pd.DataFrame | Iterator[pd.DataFrame]: ... 27 | def unload( 28 | sql: str, 29 | path: str, 30 | unload_format: Literal["CSV", "PARQUET"] | None = ..., 31 | compression: Literal["GZIP", "..."] | None = ..., 32 | partition_cols: list[str] | None = ..., 33 | encryption: Literal["SSE_KMS", "SSE_S3"] | None = ..., 34 | kms_key_id: str | None = ..., 35 | field_delimiter: str | None = ",", 36 | escaped_by: str | None = "\\", 37 | chunked: bool | int = False, 38 | keep_files: bool = False, 39 | use_threads: bool | int = True, 40 | boto3_session: boto3.Session | None = ..., 41 | s3_additional_kwargs: dict[str, str] | None = ..., 42 | pyarrow_additional_kwargs: dict[str, Any] | None = ..., 43 | ) -> pd.DataFrame | Iterator[pd.DataFrame]: ... 44 | def unload_to_files( 45 | sql: str, 46 | path: str, 47 | unload_format: Literal["CSV", "PARQUET"] | None = ..., 48 | compression: Literal["GZIP", "NONE"] | None = ..., 49 | partition_cols: list[str] | None = ..., 50 | encryption: Literal["SSE_KMS", "SSE_S3"] | None = ..., 51 | kms_key_id: str | None = ..., 52 | field_delimiter: str | None = ..., 53 | escaped_by: str | None = ..., 54 | boto3_session: boto3.Session | None = ..., 55 | ) -> None: ... 56 | -------------------------------------------------------------------------------- /building/build-docs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | pushd .. 
5 | rm -rf docs/build docs/source/stubs 6 | make -C docs/ html 7 | doc8 --ignore-path docs/source/stubs --max-line-length 120 docs/source 8 | -------------------------------------------------------------------------------- /building/build-lambda-layers.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | VERSION=$(poetry version --short) 5 | DIR_NAME=$(dirname "$PWD") 6 | 7 | PYTHON_VERSION=${1:-ALL} 8 | 9 | ARCH=$(arch) 10 | [ "${ARCH}" = "aarch64" ] && ARCH_SUFFIX="-arm64" # AWS Lambda, the name arm64 is used instead of aarch64 11 | 12 | if [[ $PYTHON_VERSION == "ALL" ]] 13 | then 14 | echo "Building Lambda Layers for AWS SDK for pandas ${VERSION} (ALL supported Python versions)" 15 | else 16 | echo "Building Lambda Layers for AWS SDK for pandas ${VERSION} (ONLY Python $PYTHON_VERSION)" 17 | fi 18 | 19 | pushd lambda 20 | 21 | # Building all related docker images 22 | ./build-docker-images.sh $PYTHON_VERSION 23 | 24 | # Python 3.9 25 | if [[ $PYTHON_VERSION == "ALL" || $PYTHON_VERSION == "3.9" ]] 26 | then 27 | docker run \ 28 | --volume "$DIR_NAME":/aws-sdk-pandas/ \ 29 | --workdir /aws-sdk-pandas/building/lambda \ 30 | --rm \ 31 | awswrangler-build-py39 \ 32 | build-lambda-layer.sh "${VERSION}-py3.9${ARCH_SUFFIX}" "ninja-build" 33 | fi 34 | 35 | # Python 3.10 36 | if [[ $PYTHON_VERSION == "ALL" || $PYTHON_VERSION == "3.10" ]] 37 | then 38 | docker run \ 39 | --volume "$DIR_NAME":/aws-sdk-pandas/ \ 40 | --workdir /aws-sdk-pandas/building/lambda \ 41 | --rm \ 42 | awswrangler-build-py310 \ 43 | build-lambda-layer.sh "${VERSION}-py3.10${ARCH_SUFFIX}" "ninja-build" 44 | fi 45 | 46 | # Python 3.11 47 | if [[ $PYTHON_VERSION == "ALL" || $PYTHON_VERSION == "3.11" ]] 48 | then 49 | docker run \ 50 | --volume "$DIR_NAME":/aws-sdk-pandas/ \ 51 | --workdir /aws-sdk-pandas/building/lambda \ 52 | --rm \ 53 | awswrangler-build-py311 \ 54 | build-lambda-layer.sh "${VERSION}-py3.11${ARCH_SUFFIX}" "ninja-build" 55 | fi 56 | 57 | # Python 3.12 58 | if [[ $PYTHON_VERSION == "ALL" || $PYTHON_VERSION == "3.12" ]] 59 | then 60 | docker run \ 61 | --volume "$DIR_NAME":/aws-sdk-pandas/ \ 62 | --workdir /aws-sdk-pandas/building/lambda \ 63 | --rm \ 64 | awswrangler-build-py312 \ 65 | build-lambda-layer.sh "${VERSION}-py3.12${ARCH_SUFFIX}" "ninja-build" 66 | fi 67 | 68 | # Python 3.13 69 | if [[ $PYTHON_VERSION == "ALL" || $PYTHON_VERSION == "3.13" ]] 70 | then 71 | docker run \ 72 | --volume "$DIR_NAME":/aws-sdk-pandas/ \ 73 | --workdir /aws-sdk-pandas/building/lambda \ 74 | --rm \ 75 | awswrangler-build-py313 \ 76 | build-lambda-layer.sh "${VERSION}-py3.13${ARCH_SUFFIX}" "ninja-build" 77 | fi 78 | -------------------------------------------------------------------------------- /building/build-wheel.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | pushd .. 
5 | rm -rf dist/*.whl 6 | poetry build -f wheel 7 | -------------------------------------------------------------------------------- /building/lambda/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG base_image 2 | ARG python_version=base 3 | 4 | FROM ${base_image} AS base 5 | 6 | RUN yum install -y \ 7 | boost-devel \ 8 | jemalloc-devel \ 9 | libxml2-devel \ 10 | libxslt-devel \ 11 | bison \ 12 | make \ 13 | gcc10 \ 14 | gcc10-c++ \ 15 | flex \ 16 | autoconf \ 17 | zip \ 18 | git \ 19 | ninja-build 20 | 21 | WORKDIR /root 22 | 23 | ENV CC=/usr/bin/gcc10-cc 24 | ENV CXX=/usr/bin/gcc10-c++ 25 | ENV LD=/usr/bin/gcc10-gcc 26 | 27 | RUN ln -s /usr/bin/gcc10-gcc /usr/bin/gcc 28 | RUN ln -s /usr/bin/gcc10-g++ /usr/bin/g++ 29 | RUN ln -s /usr/bin/gcc10-nm /usr/bin/nm 30 | RUN ln -s /usr/bin/gcc10-ar /usr/bin/ar 31 | RUN ln -s /usr/bin/gcc10-mpn /usr/bin/mpn 32 | RUN ln -s /usr/bin/gcc10-ld /usr/bin/ld 33 | 34 | FROM ${python_version} 35 | COPY pyproject.toml poetry.lock ./ 36 | 37 | # Setuptools is a build dependency of arrow and runtime dependency of some of our dependencies (mainly redshift-connector). 38 | # Remove when arrow version shipped with lambda layers and dependencies are updated. 39 | RUN pip3 install --upgrade pip wheel setuptools>=78.1.1 setuptools_scm>=8 40 | RUN pip3 install --upgrade urllib3==1.26.16 # temporary to avoid https://github.com/urllib3/urllib3/issues/2168 (TODO remove when the AL2 image updates to support OpenSSL 1.1.1+) 41 | # In new CMake 4, compatibility with CMake < 3.5 has been removed. 42 | # Unpin CMake when arrow version shipped with lambda layers is updated. 43 | RUN pip3 install --upgrade cmake==3.31.6 44 | RUN pip3 install --upgrade six cython hypothesis poetry 45 | ENV PIP_NO_BINARY="numpy,pandas" 46 | RUN poetry config virtualenvs.create false --local && poetry install --no-root --only main 47 | 48 | RUN rm -f pyproject.toml poetry.lock 49 | 50 | ENTRYPOINT ["/bin/sh"] 51 | -------------------------------------------------------------------------------- /building/lambda/Dockerfile.al2023: -------------------------------------------------------------------------------- 1 | ARG base_image 2 | ARG python_version=base 3 | 4 | FROM ${base_image} AS base 5 | 6 | RUN dnf install -y \ 7 | boost-devel \ 8 | jemalloc-devel \ 9 | libxml2-devel \ 10 | libxslt-devel \ 11 | bison \ 12 | make \ 13 | gcc \ 14 | gcc-c++ \ 15 | flex \ 16 | autoconf \ 17 | zip \ 18 | git \ 19 | ninja-build 20 | 21 | WORKDIR /root 22 | 23 | FROM ${python_version} 24 | COPY pyproject.toml poetry.lock ./ 25 | 26 | # Setuptools is a build dependency of arrow and runtime dependency of some of our dependencies (mainly redshift-connector). 27 | # Remove when arrow version shipped with lambda layers and dependencies are updated. 28 | RUN pip3 install --upgrade pip wheel setuptools>=78.1.1 setuptools_scm>=8 29 | # In new CMake 4, compatibility with CMake < 3.5 has been removed. 30 | # Unpin CMake when arrow version shipped with lambda layers is updated. 
31 | RUN pip3 install --upgrade cmake==3.31.6 32 | RUN pip3 install --upgrade six cython hypothesis poetry 33 | 34 | ENV PIP_NO_BINARY="numpy,pandas" 35 | RUN poetry config virtualenvs.create false --local && poetry install --no-root --only main 36 | 37 | RUN rm -f pyproject.toml poetry.lock 38 | 39 | ENTRYPOINT ["/bin/sh"] 40 | -------------------------------------------------------------------------------- /building/lambda/build-docker-images.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | cp ../../pyproject.toml . 5 | cp ../../poetry.lock . 6 | 7 | export DOCKER_BUILDKIT=1 8 | 9 | PYTHON_VERSION=${1:-ALL} 10 | 11 | # Python 3.9 12 | if [[ $PYTHON_VERSION == "ALL" || $PYTHON_VERSION == "3.9" ]] 13 | then 14 | docker build \ 15 | --pull \ 16 | --tag awswrangler-build-py39 \ 17 | --build-arg base_image=public.ecr.aws/lambda/python:3.9 \ 18 | . 19 | fi 20 | 21 | # Python 3.10 22 | if [[ $PYTHON_VERSION == "ALL" || $PYTHON_VERSION == "3.10" ]] 23 | then 24 | docker build \ 25 | --pull \ 26 | --tag awswrangler-build-py310 \ 27 | --build-arg base_image=public.ecr.aws/lambda/python:3.10 \ 28 | . 29 | fi 30 | 31 | # Python 3.11 32 | if [[ $PYTHON_VERSION == "ALL" || $PYTHON_VERSION == "3.11" ]] 33 | then 34 | docker build \ 35 | --pull \ 36 | --tag awswrangler-build-py311 \ 37 | --build-arg base_image=public.ecr.aws/lambda/python:3.11 \ 38 | . 39 | fi 40 | 41 | # Python 3.12 42 | if [[ $PYTHON_VERSION == "ALL" || $PYTHON_VERSION == "3.12" ]] 43 | then 44 | docker build \ 45 | --pull \ 46 | --tag awswrangler-build-py312 \ 47 | --build-arg base_image=public.ecr.aws/lambda/python:3.12 \ 48 | --file Dockerfile.al2023 \ 49 | . 50 | fi 51 | 52 | # Python 3.13 53 | if [[ $PYTHON_VERSION == "ALL" || $PYTHON_VERSION == "3.13" ]] 54 | then 55 | docker build \ 56 | --pull \ 57 | --tag awswrangler-build-py313 \ 58 | --build-arg base_image=public.ecr.aws/lambda/python:3.13 \ 59 | --file Dockerfile.al2023 \ 60 | . 
61 | fi 62 | 63 | rm -rf pyproject.toml poetry.lock 64 | -------------------------------------------------------------------------------- /building/lambda/build-lambda-layer.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | FILENAME="awswrangler-layer-${1}.zip" 5 | NINJA=${2} 6 | 7 | pushd /aws-sdk-pandas 8 | rm -rf python dist/pyarrow_files "dist/${FILENAME}" "${FILENAME}" 9 | popd 10 | 11 | rm -rf dist arrow 12 | 13 | export ARROW_HOME=$(pwd)/dist 14 | export ARROW_VERSION=20.0.0 15 | export LD_LIBRARY_PATH=$(pwd)/dist/lib:$LD_LIBRARY_PATH 16 | export CMAKE_PREFIX_PATH=$ARROW_HOME:$CMAKE_PREFIX_PATH 17 | export SETUPTOOLS_SCM_PRETEND_VERSION=$ARROW_VERSION 18 | 19 | git clone \ 20 | --depth 1 \ 21 | --branch "apache-arrow-${ARROW_VERSION}" \ 22 | --single-branch \ 23 | https://github.com/apache/arrow.git 24 | 25 | mkdir $ARROW_HOME 26 | mkdir arrow/cpp/build 27 | pushd arrow/cpp/build 28 | 29 | cmake \ 30 | -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ 31 | -DCMAKE_INSTALL_LIBDIR=lib \ 32 | -DARROW_PYTHON=ON \ 33 | -DARROW_PARQUET=ON \ 34 | -DARROW_DATASET=ON \ 35 | -DARROW_WITH_SNAPPY=ON \ 36 | -DARROW_WITH_ZLIB=ON \ 37 | -DARROW_FLIGHT=OFF \ 38 | -DARROW_GANDIVA=OFF \ 39 | -DARROW_ORC=OFF \ 40 | -DARROW_CSV=ON \ 41 | -DARROW_JSON=ON \ 42 | -DARROW_COMPUTE=ON \ 43 | -DARROW_FILESYSTEM=ON \ 44 | -DARROW_PLASMA=OFF \ 45 | -DARROW_WITH_BZ2=OFF \ 46 | -DARROW_WITH_ZSTD=OFF \ 47 | -DARROW_WITH_LZ4=OFF \ 48 | -DARROW_WITH_BROTLI=OFF \ 49 | -DARROW_BUILD_TESTS=OFF \ 50 | -GNinja \ 51 | .. 52 | 53 | eval $NINJA 54 | eval "${NINJA} install" 55 | 56 | popd 57 | 58 | pushd arrow/python 59 | 60 | export CMAKE_PREFIX_PATH=${ARROW_HOME}${CMAKE_PREFIX_PATH:+:${CMAKE_PREFIX_PATH}} 61 | export ARROW_PRE_0_15_IPC_FORMAT=0 62 | export PYARROW_WITH_HDFS=0 63 | export PYARROW_WITH_FLIGHT=0 64 | export PYARROW_WITH_GANDIVA=0 65 | export PYARROW_WITH_ORC=0 66 | export PYARROW_WITH_CUDA=0 67 | export PYARROW_WITH_PLASMA=0 68 | export PYARROW_WITH_PARQUET=1 69 | export PYARROW_WITH_DATASET=1 70 | export PYARROW_WITH_FILESYSTEM=1 71 | export PYARROW_WITH_CSV=1 72 | export PYARROW_WITH_JSON=1 73 | export PYARROW_WITH_COMPUTE=1 74 | 75 | python3 setup.py build_ext \ 76 | --build-type=release \ 77 | --bundle-arrow-cpp \ 78 | bdist_wheel 79 | 80 | pip3 install dist/pyarrow-*.whl -t /aws-sdk-pandas/dist/pyarrow_files 81 | 82 | popd 83 | 84 | pushd /aws-sdk-pandas 85 | 86 | pip3 install . --no-binary numpy,pandas -t ./python ".[redshift,mysql,postgres,gremlin,opensearch,openpyxl]" 87 | 88 | rm -rf python/pyarrow* 89 | rm -rf python/boto* 90 | rm -rf python/urllib3* 91 | rm -rf python/s3transfer* 92 | 93 | cp -r /aws-sdk-pandas/dist/pyarrow_files/pyarrow* python/ 94 | 95 | # Removing nonessential files 96 | find python -name '*.so' -type f -exec strip "{}" \; 97 | find python -wholename "*/tests/*" -type f -delete 98 | find python -regex '^.*\(__pycache__\|\.py[co]\)$' -delete 99 | 100 | zip -r9 "${FILENAME}" ./python 101 | mv "${FILENAME}" dist/ 102 | 103 | rm -rf python dist/pyarrow_files "${FILENAME}" 104 | 105 | popd 106 | 107 | rm -rf dist arrow 108 | -------------------------------------------------------------------------------- /building/publish.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | pushd .. 
5 | rm -fr dist 6 | poetry publish --build 7 | rm -fr dist 8 | -------------------------------------------------------------------------------- /building/update-glue-lib.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | pushd .. 5 | rm -fr awswrangler.zip 6 | 7 | # Zip the library 8 | git archive HEAD:awswrangler --format zip --prefix awswrangler/awswrangler/ --output awswrangler.zip 9 | 10 | # Upload the Zip file 11 | s3_location=$(aws cloudformation describe-stacks --stack-name aws-sdk-pandas-glueray --query "Stacks[0].Outputs[?OutputKey=='AWSSDKforpandasZIPLocation'].OutputValue" --output text) 12 | aws s3 cp awswrangler.zip $s3_location 13 | 14 | rm -fr awswrangler.zip 15 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/environment.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | dependencies: 4 | - python>=3 5 | - pandoc 6 | - ipykernel 7 | - pip 8 | - pip: 9 | - myst_parser 10 | - nbsphinx 11 | - nbsphinx-link 12 | - sphinx==7.1.2 13 | - sphinx-autodoc-typehints 14 | - sphinx_bootstrap_theme 15 | - sphinx-copybutton 16 | - IPython 17 | - .. 
18 | -------------------------------------------------------------------------------- /docs/source/_ext/copy_adr.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | from pathlib import Path 3 | 4 | 5 | def setup(app): 6 | file_dir = Path(__file__).parent 7 | 8 | source_dir = file_dir.joinpath("../../../adr").resolve() 9 | destination_dir = file_dir.joinpath("../adr/").resolve() 10 | 11 | for file in source_dir.glob("*.md"): 12 | shutil.copy(file, destination_dir) 13 | -------------------------------------------------------------------------------- /docs/source/_ext/copy_tutorials.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | 4 | 5 | def setup(app): 6 | file_dir = Path(__file__).parent 7 | for f in file_dir.joinpath("../../../tutorials").glob("*.ipynb"): 8 | with open(file_dir.joinpath(f"../tutorials/{f.stem}.nblink"), "w") as output_file: 9 | nb_link = {"path": f"../../../tutorials/{f.name}", "extra-media": ["../../../tutorials/_static"]} 10 | json.dump(nb_link, output_file) 11 | -------------------------------------------------------------------------------- /docs/source/_static/aws_lambda_managed_layer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/docs/source/_static/aws_lambda_managed_layer.png -------------------------------------------------------------------------------- /docs/source/_static/css/max_width.css: -------------------------------------------------------------------------------- 1 | div.body { 2 | max-width: 90%; 3 | } 4 | -------------------------------------------------------------------------------- /docs/source/_static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/docs/source/_static/favicon.ico -------------------------------------------------------------------------------- /docs/source/_static/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/docs/source/_static/logo.png -------------------------------------------------------------------------------- /docs/source/_static/logo2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/docs/source/_static/logo2.png -------------------------------------------------------------------------------- /docs/source/_static/logo_transparent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/docs/source/_static/logo_transparent.png -------------------------------------------------------------------------------- /docs/source/_static/logo_transparent_small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/docs/source/_static/logo_transparent_small.png -------------------------------------------------------------------------------- /docs/source/_static/ssm_public_parameters.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/docs/source/_static/ssm_public_parameters.png -------------------------------------------------------------------------------- /docs/source/_templates/globaltoc.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/docs/source/_templates/globaltoc.html -------------------------------------------------------------------------------- /docs/source/_templates/typed-dict-template.rst: -------------------------------------------------------------------------------- 1 | {{ objname }} 2 | {{ underline }} 3 | 4 | .. currentmodule:: {{ module }} 5 | 6 | .. autoclass:: {{ objname }} 7 | :show-inheritance: 8 | 9 | {% block attributes_summary %} 10 | {% if attributes %} 11 | 12 | .. rubric:: Attributes 13 | 14 | .. autosummary:: 15 | {% for item in attributes %} 16 | ~{{ name }}.{{ item }} 17 | {%- endfor %} 18 | 19 | {% endif %} 20 | {% endblock %} 21 | 22 | {% block methods_documentation %} 23 | {% if methods %} 24 | 25 | .. rubric:: Attributes Documentation 26 | 27 | {% for item in attributes %} 28 | .. autoattribute:: {{ item }} 29 | {%- endfor %} 30 | 31 | {% endif %} 32 | {% endblock %} 33 | -------------------------------------------------------------------------------- /docs/source/about.rst: -------------------------------------------------------------------------------- 1 | What is AWS SDK for pandas? 2 | ============================ 3 | 4 | An `AWS Professional Service `_ `open source `_ Python initiative that extends the power of the `pandas `_ library to AWS, connecting **DataFrames** and AWS data & analytics services. 5 | 6 | Easy integration with Athena, Glue, Redshift, Timestream, OpenSearch, Neptune, QuickSight, Chime, CloudWatch Logs, 7 | DynamoDB, EMR, Secrets Manager, PostgreSQL, MySQL, SQL Server and S3 (Parquet, CSV, JSON and Excel). 8 | 9 | Built on top of other open-source projects like `Pandas `_, `Apache Arrow `_ and `Boto3 `_, it offers abstracted functions to execute your usual ETL tasks like loading/unloading data from **Data Lakes**, **Data Warehouses** and **Databases**, even `at scale `_. 10 | 11 | Check our `tutorials `_ or the `list of functionalities `_. 12 | -------------------------------------------------------------------------------- /docs/source/adr.rst: -------------------------------------------------------------------------------- 1 | Architectural Decision Records 2 | ============================== 3 | 4 | A collection of records for "architecturally significant" decisions: 5 | those that affect the structure, non-functional characteristics, dependencies, interfaces, or construction techniques. 6 | 7 | These decisions are made by the team that maintains *AWS SDK for pandas*. 8 | However, suggestions can be submitted by any contributor via issues or pull requests. 9 | 10 | .. note:: You can also find all ADRs on `GitHub `_. 11 | 12 | ..
toctree:: 13 | :maxdepth: 1 14 | :glob: 15 | 16 | adr/* 17 | -------------------------------------------------------------------------------- /docs/source/adr/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | An `AWS Professional Service `_ open source initiative | aws-proserve-opensource@amazon.com 2 | 3 | Quick Start 4 | ----------- 5 | 6 | >>> pip install awswrangler 7 | 8 | >>> # Optional modules are installed with: 9 | >>> pip install 'awswrangler[redshift]' 10 | 11 | .. code-block:: py3 12 | 13 | import awswrangler as wr 14 | import pandas as pd 15 | from datetime import datetime 16 | 17 | df = pd.DataFrame({"id": [1, 2], "value": ["foo", "boo"]}) 18 | 19 | # Storing data on the Data Lake 20 | wr.s3.to_parquet( 21 | df=df, 22 | path="s3://bucket/dataset/", 23 | dataset=True, 24 | database="my_db", 25 | table="my_table" 26 | ) 27 | 28 | # Retrieving the data directly from Amazon S3 29 | df = wr.s3.read_parquet("s3://bucket/dataset/", dataset=True) 30 | 31 | # Retrieving the data from Amazon Athena 32 | df = wr.athena.read_sql_query("SELECT * FROM my_table", database="my_db") 33 | 34 | # Get a Redshift connection from the Glue Catalog and retrieve data from Redshift Spectrum 35 | con = wr.redshift.connect("my-glue-connection") 36 | df = wr.redshift.read_sql_query("SELECT * FROM external_schema.my_table", con=con) 37 | con.close() 38 | 39 | # Amazon Timestream Write 40 | df = pd.DataFrame({ 41 | "time": [datetime.now(), datetime.now()], 42 | "my_dimension": ["foo", "boo"], 43 | "measure": [1.0, 1.1], 44 | }) 45 | rejected_records = wr.timestream.write(df, 46 | database="sampleDB", 47 | table="sampleTable", 48 | time_col="time", 49 | measure_col="measure", 50 | dimensions_cols=["my_dimension"], 51 | ) 52 | 53 | # Amazon Timestream Query 54 | wr.timestream.query(""" 55 | SELECT time, measure_value::double, my_dimension 56 | FROM "sampleDB"."sampleTable" ORDER BY time DESC LIMIT 3 57 | """) 58 | 59 | Read The Docs 60 | ------------- 61 | 62 | .. toctree:: 63 | :maxdepth: 2 64 | 65 | about 66 | install 67 | scale 68 | tutorials 69 | adr 70 | api 71 | Community Resources 72 | Logging 73 | Who uses AWS SDK for pandas? 74 | License 75 | Contributing 76 | 77 | .. image:: https://d3tiqpr4kkkomd.cloudfront.net/img/pixel.png?asset=RIXAH6KDSYAI1HHEBLTY 78 | :align: left 79 | -------------------------------------------------------------------------------- /docs/source/tutorials.rst: -------------------------------------------------------------------------------- 1 | Tutorials 2 | ========= 3 | 4 | .. note:: You can also find all Tutorial Notebooks on `GitHub `_. 5 | 6 | .. toctree:: 7 | :maxdepth: 1 8 | :glob: 9 | 10 | tutorials/* 11 | -------------------------------------------------------------------------------- /docs/source/tutorials/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /fix.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | ruff format . 5 | ruff check --fix .
-------------------------------------------------------------------------------- /test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | versions=${1:-ALL} 3 | posargs=${2:-32} 4 | SECONDS=0 5 | 6 | set -e 7 | 8 | mkdir -p test-reports 9 | tox -e ${versions} -- ${posargs} 10 | if [ $versions = "ALL" ]; then 11 | coverage html --directory coverage 12 | rm -rf .coverage* Running 2> /dev/null 13 | fi 14 | 15 | duration=$SECONDS 16 | echo "$(($duration / 60)) minutes and $(($duration % 60)) seconds elapsed." 17 | -------------------------------------------------------------------------------- /test_infra/app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | 4 | from aws_cdk import App, Environment 5 | from stacks.base_stack import BaseStack 6 | from stacks.cleanrooms_stack import CleanRoomsStack 7 | from stacks.databases_stack import DatabasesStack 8 | from stacks.glueray_stack import GlueRayStack 9 | from stacks.opensearch_stack import OpenSearchStack 10 | 11 | app = App() 12 | 13 | env = {"env": Environment(account=os.environ["CDK_DEFAULT_ACCOUNT"], region=os.environ["CDK_DEFAULT_REGION"])} 14 | 15 | base = BaseStack( 16 | app, 17 | "aws-sdk-pandas-base", 18 | **env, 19 | ) 20 | 21 | DatabasesStack( 22 | app, 23 | "aws-sdk-pandas-databases", 24 | base.get_vpc, 25 | base.get_bucket, 26 | base.get_key, 27 | **env, 28 | ) 29 | 30 | OpenSearchStack( 31 | app, 32 | "aws-sdk-pandas-opensearch", 33 | base.get_vpc, 34 | base.get_bucket, 35 | base.get_key, 36 | **env, 37 | ) 38 | 39 | GlueRayStack( 40 | app, 41 | "aws-sdk-pandas-glueray", 42 | base.get_bucket, 43 | **env, 44 | ) 45 | 46 | CleanRoomsStack( 47 | app, 48 | "aws-sdk-pandas-cleanrooms", 49 | base.get_bucket, 50 | **env, 51 | ) 52 | 53 | app.synth() 54 | -------------------------------------------------------------------------------- /test_infra/cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "python3 app.py", 3 | "watch": { 4 | "include": [ 5 | "**" 6 | ], 7 | "exclude": [ 8 | "README.md", 9 | "cdk*.json", 10 | "requirements*.txt", 11 | "source.bat", 12 | "**/__init__.py", 13 | "python/__pycache__", 14 | "tests" 15 | ] 16 | }, 17 | "context": { 18 | "@aws-cdk/aws-apigateway:usagePlanKeyOrderInsensitiveId": true, 19 | "@aws-cdk/core:stackRelativeExports": true, 20 | "@aws-cdk/aws-rds:lowercaseDbIdentifier": true, 21 | "@aws-cdk/aws-lambda:recognizeVersionProps": true, 22 | "@aws-cdk/aws-cloudfront:defaultSecurityPolicyTLSv1.2_2021": true, 23 | "@aws-cdk-containers/ecs-service-extensions:enableDefaultLogDriver": true, 24 | "@aws-cdk/aws-ec2:uniqueImdsv2TemplateName": true, 25 | "@aws-cdk/aws-iam:minimizePolicies": true, 26 | "@aws-cdk/core:target-partitions": [ 27 | "aws", 28 | "aws-cn" 29 | ], 30 | "databases": { 31 | "redshift": true, 32 | "postgresql": true, 33 | "mysql": true, 34 | "sqlserver": false, 35 | "oracle": false, 36 | "neptune": false 37 | }, 38 | "network": "public" 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /test_infra/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "awswrangler - test infrastructure" 3 | version = "3.12.0" 4 | description = "CDK test infrastructure for AWS SDK for pandas" 5 | authors = ["Amazon Web Services"] 6 | license = "Apache License 2.0" 7 | 8 | 
[tool.poetry.dependencies] 9 | python = ">=3.9, <4.0" 10 | "aws-cdk-lib" = "^2.188.0" 11 | "aws-cdk.aws-glue-alpha" = "^2.188.0a0" 12 | "aws-cdk.aws-neptune-alpha" = "^2.188.0a0" 13 | "aws-cdk.aws-redshift-alpha" = "^2.188.0a0" 14 | -------------------------------------------------------------------------------- /test_infra/scripts/delete-stack.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | STACK=${1} 4 | 5 | pushd .. 6 | cdk destroy aws-sdk-pandas-${STACK} 7 | popd -------------------------------------------------------------------------------- /test_infra/scripts/deploy-stack.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | STACK=${1} 4 | 5 | pushd .. 6 | cdk bootstrap 7 | cdk deploy aws-sdk-pandas-${STACK} 8 | popd -------------------------------------------------------------------------------- /test_infra/scripts/security-group-databases-add-local-ip.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | # Get my current IP address 5 | LOCALIP=`host myip.opendns.com resolver1.opendns.com | grep myip | awk '{print $4}'` 6 | 7 | # Get security group ID 8 | SGID=`aws cloudformation describe-stacks --stack-name aws-sdk-pandas-databases --query "Stacks[0].Outputs[?OutputKey=='DatabaseSecurityGroupId'].OutputValue" --output text` 9 | 10 | # Update Security Group with local ip 11 | aws ec2 authorize-security-group-ingress \ 12 | --group-id ${SGID} \ 13 | --protocol all \ 14 | --port -1 \ 15 | --cidr ${LOCALIP}/32 16 | -------------------------------------------------------------------------------- /test_infra/scripts/security-group-databases-check.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | # Get security group ID 5 | SGID=`aws cloudformation describe-stacks --stack-name aws-sdk-pandas-databases --query "Stacks[0].Outputs[?OutputKey=='DatabaseSecurityGroupId'].OutputValue" --output text` 6 | 7 | # Check to see current setting 8 | aws ec2 describe-security-groups --group-id ${SGID} 9 | -------------------------------------------------------------------------------- /test_infra/source.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem The sole purpose of this script is to make the command 4 | rem 5 | rem source .venv/bin/activate 6 | rem 7 | rem (which activates a Python virtualenv on Linux or Mac OS X) work on Windows. 8 | rem On Windows, this command just runs this batch file (the argument is ignored). 9 | rem 10 | rem Now we don't need to document a Windows command for activating a virtualenv. 
11 | 12 | echo Executing .venv\Scripts\activate.bat for you 13 | .venv\Scripts\activate.bat 14 | -------------------------------------------------------------------------------- /test_infra/stacks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/test_infra/stacks/__init__.py -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/tests/__init__.py -------------------------------------------------------------------------------- /tests/benchmark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/tests/benchmark/__init__.py -------------------------------------------------------------------------------- /tests/glue_scripts/ray_read_small_parquet.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import ray 4 | 5 | import awswrangler as wr 6 | 7 | paths = wr.s3.list_objects(f"s3://{os.environ['data-gen-bucket']}/parquet/small/partitioned/") 8 | ray.data.read_parquet_bulk(paths=paths, override_num_blocks=1000).to_modin() 9 | -------------------------------------------------------------------------------- /tests/glue_scripts/wrangler_blog_simple.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import awswrangler as wr 4 | 5 | workgroup_name = os.environ["athena-workgroup"] 6 | output_path = os.environ["output-path"] 7 | glue_database = os.environ["glue-database"] 8 | glue_table = os.environ["glue-table"] 9 | 10 | # Read 1.5 Gb Parquet data 11 | df = wr.s3.read_parquet(path="s3://ursa-labs-taxi-data/2017/") 12 | 13 | # Drop vendor_id column 14 | df.drop("vendor_id", axis=1, inplace=True) 15 | 16 | # Filter trips over 1 mile 17 | df1 = df[df["trip_distance"] > 1] 18 | 19 | # Write partitioned trips to S3 in Parquet format 20 | wr.s3.to_parquet( 21 | df1, 22 | path=f"{output_path}output/{glue_table}/", 23 | partition_cols=["passenger_count", "payment_type"], 24 | dataset=True, 25 | database=glue_database, 26 | table=glue_table, 27 | ) 28 | 29 | # Read the data back to a modin df via Athena 30 | df1_athena = wr.athena.read_sql_query( 31 | f"SELECT * FROM {glue_table}", 32 | database=glue_database, 33 | ctas_approach=False, 34 | unload_approach=True, 35 | workgroup=workgroup_name, 36 | s3_output=f"{output_path}unload/{glue_table}/", 37 | ) 38 | 39 | # Delete table (required due to LF) 40 | wr.catalog.delete_table_if_exists(database=glue_database, table=glue_table) 41 | -------------------------------------------------------------------------------- /tests/glue_scripts/wrangler_read_small_parquet.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import awswrangler as wr 4 | 5 | wr.s3.read_parquet( 6 | path=f"s3://{os.environ['data-gen-bucket']}/parquet/small/partitioned/", 7 | ray_args={"override_num_blocks": 1000, "bulk_read": True}, 8 | ) 9 | -------------------------------------------------------------------------------- /tests/glue_scripts/wrangler_write_partitioned_parquet.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | 3 | import awswrangler as wr 4 | 5 | df = wr.s3.read_parquet( 6 | path=f"s3://{os.environ['data-gen-bucket']}/parquet/medium/partitioned/", 7 | ray_args={"override_num_blocks": 1000}, 8 | ) 9 | 10 | wr.s3.to_parquet( 11 | df=df, 12 | path=os.environ["output-path"], 13 | dataset=True, 14 | partition_cols=["payment_type", "passenger_count"], 15 | ) 16 | -------------------------------------------------------------------------------- /tests/load/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/tests/load/__init__.py -------------------------------------------------------------------------------- /tests/load/conftest.py: -------------------------------------------------------------------------------- 1 | import modin.pandas as pd 2 | import pytest 3 | import ray 4 | from pyarrow import csv 5 | 6 | import awswrangler as wr 7 | 8 | 9 | @pytest.fixture(scope="function") 10 | def df_timestream() -> pd.DataFrame: 11 | # Data frame with 126_000 rows 12 | return ( 13 | ray.data.read_csv( 14 | "https://raw.githubusercontent.com/awslabs/amazon-timestream-tools/mainline/sample_apps/data/sample.csv", 15 | **{ 16 | "read_options": csv.ReadOptions( 17 | column_names=[ 18 | "ignore0", 19 | "region", 20 | "ignore1", 21 | "az", 22 | "ignore2", 23 | "hostname", 24 | "measure_kind", 25 | "measure", 26 | "ignore3", 27 | "ignore4", 28 | "ignore5", 29 | ] 30 | ) 31 | }, 32 | ) 33 | .to_modin() 34 | .loc[:, ["region", "az", "hostname", "measure_kind", "measure"]] 35 | ) 36 | 37 | 38 | @pytest.fixture(scope="function") 39 | def df_s() -> pd.DataFrame: 40 | # Data frame with 100000 rows 41 | return wr.s3.read_parquet(path="s3://ursa-labs-taxi-data/2010/02/data.parquet") 42 | 43 | 44 | @pytest.fixture(scope="function") 45 | def df_xl() -> pd.DataFrame: 46 | # Data frame with 8759874 rows 47 | return wr.s3.read_parquet(path="s3://ursa-labs-taxi-data/2018/01/data.parquet") 48 | 49 | 50 | @pytest.fixture(scope="function") 51 | def big_modin_df() -> pd.DataFrame: 52 | pandas_refs = ray.data.range(100_000).to_pandas_refs() 53 | dataset = ray.data.from_pandas_refs(pandas_refs) 54 | 55 | frame = dataset.to_modin() 56 | frame["foo"] = frame.id * 2 57 | frame["bar"] = frame.id % 2 58 | 59 | return frame 60 | -------------------------------------------------------------------------------- /tests/load/test_dynamodb.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import random 4 | from typing import Any 5 | 6 | import boto3 7 | import modin.pandas as pd 8 | import pytest 9 | import ray 10 | 11 | import awswrangler as wr 12 | 13 | from .._utils import ExecutionTimer 14 | 15 | 16 | def _generate_item(id: int) -> dict[str, Any]: 17 | return { 18 | "id": str(id), 19 | "year": random.randint(1923, 2023), 20 | "title": f"{random.randrange(16**6):06x}", 21 | } 22 | 23 | 24 | def _fill_dynamodb_table(table_name: str, num_objects: int) -> None: 25 | dynamodb_resource = boto3.resource("dynamodb") 26 | table = dynamodb_resource.Table(table_name) 27 | 28 | with table.batch_writer() as writer: 29 | for i in range(num_objects): 30 | item = _generate_item(i) 31 | writer.put_item(Item=item) 32 | 33 | 34 | def create_big_modin_df(table_size: int, num_blocks: int | None) -> pd.DataFrame: 35 | pandas_refs = 
ray.data.range(table_size).to_pandas_refs() 36 | dataset = ray.data.from_pandas_refs(pandas_refs) 37 | 38 | if num_blocks: 39 | dataset = dataset.repartition(num_blocks=num_blocks) 40 | 41 | frame = dataset.to_modin() 42 | frame["foo"] = frame.id * 2 43 | frame["bar"] = frame.id % 2 44 | 45 | return frame 46 | 47 | 48 | @pytest.mark.parametrize( 49 | "params", 50 | [ 51 | { 52 | "KeySchema": [{"AttributeName": "id", "KeyType": "HASH"}, {"AttributeName": "year", "KeyType": "RANGE"}], 53 | "AttributeDefinitions": [ 54 | {"AttributeName": "id", "AttributeType": "S"}, 55 | {"AttributeName": "year", "AttributeType": "N"}, 56 | ], 57 | } 58 | ], 59 | ) 60 | def test_dynamodb_read(params: dict[str, Any], dynamodb_table: str, request: pytest.FixtureRequest) -> None: 61 | benchmark_time = 30 62 | num_objects = 50_000 63 | 64 | _fill_dynamodb_table(dynamodb_table, num_objects) 65 | 66 | with ExecutionTimer(request) as timer: 67 | frame = wr.dynamodb.read_items(table_name=dynamodb_table, allow_full_scan=True) 68 | 69 | assert len(frame) == num_objects 70 | assert timer.elapsed_time < benchmark_time 71 | 72 | 73 | @pytest.mark.parametrize( 74 | "params", 75 | [ 76 | { 77 | "KeySchema": [{"AttributeName": "id", "KeyType": "HASH"}], 78 | "AttributeDefinitions": [ 79 | {"AttributeName": "id", "AttributeType": "N"}, 80 | ], 81 | } 82 | ], 83 | ) 84 | @pytest.mark.parametrize("num_blocks", [2, 4, 8, None]) 85 | def test_dynamodb_write( 86 | params: dict[str, Any], 87 | num_blocks: int, 88 | dynamodb_table: str, 89 | request: pytest.FixtureRequest, 90 | ) -> None: 91 | benchmark_time = 30 92 | big_modin_df = create_big_modin_df(25_000, num_blocks) 93 | 94 | with ExecutionTimer(request) as timer: 95 | wr.dynamodb.put_df(df=big_modin_df, table_name=dynamodb_table, use_threads=4) 96 | 97 | assert timer.elapsed_time < benchmark_time 98 | 99 | df_out = wr.dynamodb.read_items(dynamodb_table, allow_full_scan=True) 100 | assert len(df_out) == len(big_modin_df) 101 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/tests/unit/__init__.py -------------------------------------------------------------------------------- /tests/unit/test_athena_geospatial.py: -------------------------------------------------------------------------------- 1 | import geopandas 2 | import pandas as pd 3 | import shapely 4 | 5 | import awswrangler as wr 6 | 7 | 8 | def test_athena_geospatial(path, glue_table, glue_database): 9 | df = wr.athena.read_sql_query( 10 | """ 11 | SELECT 12 | 1 AS value 13 | , ST_Point(-121.7602, 46.8527) AS point 14 | , ST_LineFromText('LINESTRING(1 2, 3 4)') AS line 15 | , ST_Polygon('POLYGON ((1 1, 1 4, 4 4, 4 1))') AS polygon 16 | , ST_Polygon('POLYGON EMPTY') AS polygon_empty 17 | """, 18 | database=glue_database, 19 | ctas_approach=False, 20 | ) 21 | 22 | assert isinstance(df, geopandas.GeoDataFrame) 23 | 24 | assert isinstance(df["value"], pd.Series) 25 | assert isinstance(df["point"], geopandas.GeoSeries) 26 | assert isinstance(df["line"], geopandas.GeoSeries) 27 | assert isinstance(df["polygon"], geopandas.GeoSeries) 28 | assert isinstance(df["polygon_empty"], geopandas.GeoSeries) 29 | 30 | assert isinstance(df["point"][0], shapely.geometry.point.Point) 31 | assert isinstance(df["line"][0], shapely.geometry.linestring.LineString) 32 | assert isinstance(df["polygon"][0], 
shapely.geometry.polygon.Polygon) 33 | assert isinstance(df["polygon_empty"][0], shapely.geometry.polygon.Polygon) 34 | -------------------------------------------------------------------------------- /tests/unit/test_athena_spark.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import awswrangler as wr 4 | from tests._utils import create_workgroup 5 | 6 | 7 | @pytest.fixture(scope="session") 8 | def athena_spark_execution_role_arn(cloudformation_outputs): 9 | return cloudformation_outputs["AthenaSparkExecutionRoleArn"] 10 | 11 | 12 | @pytest.fixture(scope="session") 13 | def workgroup_spark(bucket, kms_key, athena_spark_execution_role_arn): 14 | return create_workgroup( 15 | wkg_name="aws_sdk_pandas_spark", 16 | config={ 17 | "EngineVersion": { 18 | "SelectedEngineVersion": "PySpark engine version 3", 19 | }, 20 | "ExecutionRole": athena_spark_execution_role_arn, 21 | "ResultConfiguration": {"OutputLocation": f"s3://{bucket}/athena_workgroup_spark/"}, 22 | }, 23 | ) 24 | 25 | 26 | @pytest.mark.parametrize( 27 | "code", 28 | [ 29 | "print(spark)", 30 | """ 31 | input_path = "s3://athena-examples-us-east-1/notebooks/yellow_tripdata_2016-01.parquet" 32 | output_path = "$PATH" 33 | 34 | taxi_df = spark.read.format("parquet").load(input_path) 35 | 36 | taxi_passenger_counts = taxi_df.groupBy("VendorID", "passenger_count").count() 37 | taxi_passenger_counts.coalesce(1).write.mode('overwrite').csv(output_path) 38 | """, 39 | ], 40 | ) 41 | def test_athena_spark_calculation(code, path, workgroup_spark): 42 | code = code.replace("$PATH", path) 43 | 44 | result = wr.athena.run_spark_calculation( 45 | code=code, 46 | workgroup=workgroup_spark, 47 | ) 48 | 49 | assert result["Status"]["State"] == "COMPLETED" 50 | 51 | 52 | @pytest.mark.parametrize( 53 | "code", 54 | [ 55 | """ 56 | output_path = "$PATH" 57 | 58 | data = spark.range(0, 5) 59 | data.write.format("delta").save(output_path) 60 | """, 61 | ], 62 | ) 63 | def test_athena_spark_calculation_with_spark_properties(code, path, workgroup_spark): 64 | code = code.replace("$PATH", path) 65 | 66 | result = wr.athena.run_spark_calculation( 67 | code=code, 68 | workgroup=workgroup_spark, 69 | spark_properties={ 70 | "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog", 71 | "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension", 72 | }, 73 | ) 74 | assert result["Status"]["State"] == "COMPLETED" 75 | -------------------------------------------------------------------------------- /tests/unit/test_chime.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import pytest 4 | 5 | import awswrangler as wr 6 | 7 | logging.getLogger("awswrangler").setLevel(logging.DEBUG) 8 | 9 | 10 | def test_chime_bad_input(): 11 | with pytest.raises(ValueError): 12 | result = wr.chime.post_message(message=None, webhook=None) 13 | assert result is None 14 | -------------------------------------------------------------------------------- /tests/unit/test_cleanrooms.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import awswrangler as wr 4 | import awswrangler.pandas as pd 5 | 6 | from .._utils import is_ray_modin 7 | 8 | pytestmark = pytest.mark.distributed 9 | 10 | 11 | @pytest.fixture() 12 | def data(bucket: str, cleanrooms_glue_database_name: str) -> None: 13 | df_purchases = pd.DataFrame( 14 | { 15 | "purchase_id": list(range(100, 109)), 16 | 
"user_id": [1, 2, 3, 1, 2, 3, 4, 5, 6], 17 | "sale_value": [2.2, 1.1, 6.2, 2.3, 7.8, 9.9, 7.3, 9.7, 0.7], 18 | } 19 | ) 20 | wr.s3.to_parquet( 21 | df_purchases, 22 | f"s3://{bucket}/purchases/", 23 | dataset=True, 24 | database=cleanrooms_glue_database_name, 25 | table="purchases", 26 | mode="overwrite", 27 | ) 28 | 29 | df_users = pd.DataFrame( 30 | { 31 | "user_id": list(range(1, 9)), 32 | "city": ["LA", "NYC", "Chicago", "NYC", "NYC", "LA", "Seattle", "Seattle"], 33 | } 34 | ) 35 | wr.s3.to_parquet( 36 | df_users, 37 | f"s3://{bucket}/users/", 38 | dataset=True, 39 | database=cleanrooms_glue_database_name, 40 | table="users", 41 | mode="overwrite", 42 | ) 43 | 44 | df_custom = pd.DataFrame( 45 | { 46 | "a": list(range(1, 9)), 47 | "b": ["A", "A", "B", "C", "C", "C", "D", "E"], 48 | } 49 | ) 50 | wr.s3.to_parquet( 51 | df_custom, 52 | f"s3://{bucket}/custom/", 53 | dataset=True, 54 | database=cleanrooms_glue_database_name, 55 | table="custom", 56 | mode="overwrite", 57 | ) 58 | 59 | 60 | @pytest.mark.xfail( 61 | is_ray_modin, raises=AssertionError, reason="Upgrade from pyarrow 16.1 to 17 causes AssertionError in Modin" 62 | ) 63 | def test_read_sql_query( 64 | data: None, 65 | cleanrooms_membership_id: str, 66 | cleanrooms_analysis_template_arn: str, 67 | bucket: str, 68 | ): 69 | sql = """SELECT city, AVG(p.sale_value) 70 | FROM users u 71 | INNER JOIN purchases p ON u.user_id = p.user_id 72 | GROUP BY city 73 | """ 74 | chunksize = 2 75 | df_chunked = wr.cleanrooms.read_sql_query( 76 | sql=sql, 77 | membership_id=cleanrooms_membership_id, 78 | output_bucket=bucket, 79 | output_prefix="results", 80 | chunksize=chunksize, 81 | keep_files=False, 82 | ) 83 | for df in df_chunked: 84 | assert df.shape == (chunksize, 2) 85 | 86 | sql = """SELECT COUNT(p.purchase_id), SUM(p.sale_value), city 87 | FROM users u 88 | INNER JOIN purchases p ON u.user_id = p.user_id 89 | GROUP BY city 90 | """ 91 | df = wr.cleanrooms.read_sql_query( 92 | sql=sql, 93 | membership_id=cleanrooms_membership_id, 94 | output_bucket=bucket, 95 | output_prefix="results", 96 | keep_files=False, 97 | ) 98 | assert df.shape == (2, 3) 99 | 100 | df = wr.cleanrooms.read_sql_query( 101 | analysis_template_arn=cleanrooms_analysis_template_arn, 102 | params={"param1": "C"}, 103 | membership_id=cleanrooms_membership_id, 104 | output_bucket=bucket, 105 | output_prefix="results", 106 | keep_files=False, 107 | ) 108 | assert df.shape == (3, 1) 109 | -------------------------------------------------------------------------------- /tests/unit/test_distributed.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from importlib import reload 3 | from types import ModuleType 4 | from typing import Iterator 5 | 6 | import pytest 7 | 8 | from .._utils import is_ray_modin 9 | 10 | logging.getLogger("awswrangler").setLevel(logging.DEBUG) 11 | 12 | pytestmark = pytest.mark.distributed 13 | 14 | 15 | @pytest.fixture(scope="function") 16 | def wr() -> Iterator[ModuleType]: 17 | import awswrangler 18 | 19 | awswrangler.engine.__class__._engine = None 20 | awswrangler.engine.__class__._initialized_engine = None 21 | awswrangler.engine.__class__._registry.clear() 22 | 23 | yield reload(awswrangler) 24 | 25 | # Reset for future tests 26 | awswrangler.engine.set(awswrangler.engine.get_installed().value) 27 | awswrangler.memory_format.set(awswrangler.memory_format.get_installed().value) 28 | 29 | 30 | @pytest.mark.skipif(condition=not is_ray_modin, reason="ray not available") 31 | def 
test_engine_initialization(wr: ModuleType, path: str) -> None: 32 | assert wr.engine.is_initialized() 33 | 34 | 35 | @pytest.mark.skipif(condition=not is_ray_modin, reason="ray not available") 36 | def test_engine_python(wr: ModuleType) -> None: 37 | from awswrangler._distributed import EngineEnum 38 | from awswrangler.s3._write_parquet import _to_parquet 39 | 40 | assert wr.engine.get_installed() == EngineEnum.RAY 41 | assert wr.engine.get() == EngineEnum.RAY 42 | 43 | wr.engine.set(EngineEnum.PYTHON.value) 44 | 45 | assert wr.engine.get() == EngineEnum.PYTHON 46 | 47 | assert not wr.engine.dispatch_func(_to_parquet).__name__.endswith("distributed") 48 | 49 | 50 | @pytest.mark.skipif(condition=not is_ray_modin, reason="ray not available") 51 | def test_engine_ray(wr: ModuleType) -> None: 52 | from awswrangler._distributed import EngineEnum 53 | from awswrangler.s3._write_parquet import _to_parquet 54 | 55 | assert wr.engine.get_installed() == EngineEnum.RAY 56 | assert wr.engine.get() == EngineEnum.RAY 57 | 58 | assert wr.engine._registry 59 | assert wr.engine.dispatch_func(_to_parquet).__name__.endswith("distributed") 60 | assert not wr.engine.dispatch_func(_to_parquet, "python").__name__.endswith("distributed") 61 | 62 | 63 | @pytest.mark.skipif(condition=is_ray_modin, reason="ray is installed") 64 | def test_engine_python_without_ray_installed(wr: ModuleType) -> None: 65 | from awswrangler._distributed import EngineEnum 66 | from awswrangler.s3._write_parquet import _to_parquet 67 | 68 | assert wr.engine.get_installed() == EngineEnum.PYTHON 69 | assert wr.engine.get() == EngineEnum.PYTHON 70 | 71 | assert not wr.engine.dispatch_func(_to_parquet).__name__.endswith("distributed") 72 | 73 | 74 | @pytest.mark.skipif(condition=not is_ray_modin, reason="ray not available") 75 | def test_engine_switch(wr: ModuleType) -> None: 76 | from modin.pandas import DataFrame as ModinDataFrame 77 | from pandas import DataFrame as PandasDataFrame 78 | 79 | assert wr.engine.get_installed() == wr.EngineEnum.RAY 80 | assert wr.memory_format.get_installed() == wr.MemoryFormatEnum.MODIN 81 | 82 | assert wr.engine.get() == wr.EngineEnum.RAY 83 | assert wr.memory_format.get() == wr.MemoryFormatEnum.MODIN 84 | assert wr.pandas.DataFrame == ModinDataFrame 85 | 86 | wr.engine.set("python") 87 | wr.memory_format.set("pandas") 88 | 89 | assert wr.engine.get() == wr.EngineEnum.PYTHON 90 | assert wr.memory_format.get() == wr.MemoryFormatEnum.PANDAS 91 | assert wr.pandas.DataFrame == PandasDataFrame 92 | -------------------------------------------------------------------------------- /tests/unit/test_glue.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import pytest 4 | 5 | import awswrangler as wr 6 | import awswrangler.pandas as pd 7 | 8 | logging.getLogger("awswrangler").setLevel(logging.DEBUG) 9 | 10 | pytestmark = pytest.mark.distributed 11 | 12 | 13 | def test_parquet_crawler_columns(path): 14 | df = pd.DataFrame({"c0": [0, 1], "c1": [2, 3]}) 15 | wr.s3.to_parquet(df, path, dataset=True, mode="overwrite") 16 | df = pd.DataFrame({"c1": [2, 3], "c0": [0, 1]}) 17 | wr.s3.to_parquet(df, path, dataset=True, mode="append") 18 | first_schema = wr.s3.read_parquet_metadata(path=path)[0] 19 | for _ in range(10): 20 | schema = wr.s3.read_parquet_metadata(path=path)[0] 21 | assert list(schema.keys()) == list(first_schema.keys()) 22 | -------------------------------------------------------------------------------- /tests/unit/test_metadata.py: 
-------------------------------------------------------------------------------- 1 | import awswrangler as wr 2 | 3 | 4 | def test_metadata(): 5 | assert wr.__version__ == "3.12.0" 6 | assert wr.__title__ == "awswrangler" 7 | assert wr.__description__ == "Pandas on AWS." 8 | assert wr.__license__ == "Apache License 2.0" 9 | -------------------------------------------------------------------------------- /tests/unit/test_s3_excel.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import pytest 4 | 5 | import awswrangler as wr 6 | import awswrangler.pandas as pd 7 | 8 | logging.getLogger("awswrangler").setLevel(logging.DEBUG) 9 | 10 | 11 | @pytest.mark.parametrize("ext", ["xlsx", "xlsm", "xls", "odf"]) 12 | @pytest.mark.parametrize("use_threads", [True, False, 2]) 13 | def test_excel(path, ext, use_threads): 14 | df = pd.DataFrame({"c0": [1, 2, 3], "c1": ["foo", "boo", "bar"]}) 15 | file_path = f"{path}0.{ext}" 16 | pandas_kwargs = {} 17 | 18 | with pytest.raises(wr.exceptions.InvalidArgument): 19 | wr.s3.to_excel(df, file_path, use_threads=use_threads, index=False, pandas_kwargs=pandas_kwargs) 20 | 21 | wr.s3.to_excel(df, file_path, use_threads=use_threads, index=False, **pandas_kwargs) 22 | 23 | with pytest.raises(wr.exceptions.InvalidArgument): 24 | wr.s3.read_excel(file_path, use_threads=use_threads, pandas_kwargs=pandas_kwargs) 25 | 26 | df2 = wr.s3.read_excel(file_path, use_threads=use_threads, **pandas_kwargs) 27 | assert df.equals(df2) 28 | 29 | 30 | def test_read_xlsx_versioned(path) -> None: 31 | path_file = f"{path}0.xlsx" 32 | dfs = [pd.DataFrame({"c0": [0, 1, 2], "c1": [3, 4, 5]}), pd.DataFrame({"c0": [3, 4, 5], "c1": [6, 7, 8]})] 33 | pandas_kwargs = {} 34 | for df in dfs: 35 | wr.s3.to_excel(df=df, path=path_file, index=False, **pandas_kwargs) 36 | version_id = wr.s3.describe_objects(path=path_file)[path_file]["VersionId"] 37 | df_temp = wr.s3.read_excel(path_file, version_id=version_id, **pandas_kwargs) 38 | assert df_temp.equals(df) 39 | assert version_id == wr.s3.describe_objects(path=path_file, version_id=version_id)[path_file]["VersionId"] 40 | -------------------------------------------------------------------------------- /tests/unit/test_s3_wait.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import pytest 4 | 5 | import awswrangler as wr 6 | import awswrangler.pandas as pd 7 | 8 | logging.getLogger("awswrangler").setLevel(logging.DEBUG) 9 | 10 | pytestmark = pytest.mark.distributed 11 | 12 | 13 | @pytest.mark.parametrize("use_threads", [True, False]) 14 | def test_wait_object_exists_single_file(path: str, use_threads: bool) -> None: 15 | df = pd.DataFrame({"FooBoo": [1, 2, 3]}) 16 | file_path = f"{path}data.csv" 17 | 18 | wr.s3.to_csv(df, file_path) 19 | 20 | wr.s3.wait_objects_exist(paths=[file_path], use_threads=use_threads) 21 | 22 | 23 | @pytest.mark.parametrize("use_threads", [True, False]) 24 | def test_wait_object_exists_multiple_files(path: str, use_threads: bool) -> None: 25 | df = pd.DataFrame({"FooBoo": [1, 2, 3]}) 26 | 27 | file_paths = [f"{path}data.csv", f"{path}data2.csv", f"{path}data3.csv"] 28 | for file_path in file_paths: 29 | wr.s3.to_csv(df, file_path) 30 | 31 | wr.s3.wait_objects_exist(paths=file_paths, use_threads=use_threads) 32 | 33 | 34 | @pytest.mark.parametrize("use_threads", [True, False]) 35 | def test_wait_object_not_exists(path: str, use_threads: bool) -> None: 36 | wr.s3.wait_objects_not_exist(paths=[path], 
use_threads=use_threads) 37 | 38 | 39 | @pytest.mark.parametrize("use_threads", [True, False]) 40 | @pytest.mark.timeout(30) 41 | def test_wait_object_timeout(path: str, use_threads: bool) -> None: 42 | with pytest.raises(wr.exceptions.NoFilesFound): 43 | wr.s3.wait_objects_exist( 44 | paths=[path], 45 | use_threads=use_threads, 46 | delay=0.5, 47 | max_attempts=3, 48 | ) 49 | 50 | 51 | @pytest.mark.parametrize("use_threads", [True, False]) 52 | def test_wait_object_exists_empty_list(use_threads: bool) -> None: 53 | wr.s3.wait_objects_exist(paths=[]) 54 | -------------------------------------------------------------------------------- /tests/unit/test_session.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import boto3 5 | 6 | import awswrangler as wr 7 | 8 | logging.getLogger("awswrangler").setLevel(logging.DEBUG) 9 | 10 | 11 | def test_default_session(): 12 | boto3.setup_default_session(region_name="us-east-1") 13 | assert wr._utils.ensure_session().region_name == "us-east-1" 14 | boto3.setup_default_session(region_name="us-east-2") 15 | assert wr._utils.ensure_session().region_name == "us-east-2" 16 | boto3.setup_default_session(region_name="us-west-1") 17 | assert wr._utils.ensure_session().region_name == "us-west-1" 18 | boto3.setup_default_session(region_name=os.environ.get("AWS_DEFAULT_REGION", "us-west-2")) 19 | assert wr._utils.ensure_session().region_name == os.environ.get("AWS_DEFAULT_REGION", "us-west-2") 20 | -------------------------------------------------------------------------------- /tests/unit/test_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import pytest 5 | 6 | from awswrangler._utils import ensure_cpu_count, get_even_chunks_sizes 7 | 8 | logging.getLogger("awswrangler").setLevel(logging.DEBUG) 9 | 10 | 11 | @pytest.mark.parametrize( 12 | "total_size,chunk_size,upper_bound,result", 13 | [ 14 | (10, 4, True, (4, 3, 3)), 15 | (2, 3, True, (2,)), 16 | (1, 1, True, (1,)), 17 | (2, 1, True, (1, 1)), 18 | (11, 4, True, (4, 4, 3)), 19 | (1_001, 500, True, (334, 334, 333)), 20 | (1_002, 500, True, (334, 334, 334)), 21 | (10, 4, False, (5, 5)), 22 | (1, 1, False, (1,)), 23 | (2, 1, False, (1, 1)), 24 | (11, 4, False, (6, 5)), 25 | (1_001, 500, False, (501, 500)), 26 | (1_002, 500, False, (501, 501)), 27 | ], 28 | ) 29 | def test_get_even_chunks_sizes(total_size, chunk_size, upper_bound, result): 30 | assert get_even_chunks_sizes(total_size, chunk_size, upper_bound) == result 31 | 32 | 33 | @pytest.mark.parametrize("use_threads,result", [(True, os.cpu_count()), (False, 1), (-1, 1), (1, 1), (5, 5)]) 34 | def test_ensure_cpu_count(use_threads, result): 35 | assert ensure_cpu_count(use_threads=use_threads) == result 36 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py{39,310,311,312,313} 3 | isolated_build = True 4 | 5 | [testenv] 6 | passenv = 7 | AWS_PROFILE 8 | AWS_DEFAULT_REGION 9 | AWS_ACCESS_KEY_ID 10 | AWS_SECRET_ACCESS_KEY 11 | AWS_SESSION_TOKEN 12 | setenv = 13 | COV_FAIL_UNDER = 87.00 14 | allowlist_externals = 15 | pytest 16 | poetry 17 | commands_pre = 18 | poetry install --no-root --sync --extras "deltalake gremlin mysql opencypher opensearch oracle postgres redshift sparql sqlserver geopandas" 19 | commands = 20 | pytest -n {posargs} -s -v --timeout=300 
--reruns=2 --reruns-delay=15 \ 21 | --cov=awswrangler --cov-report=xml --cov-report term-missing --cov-branch \ 22 | --cov-fail-under={env:COV_FAIL_UNDER} \ 23 | --dist load --maxschedchunk 2 \ 24 | --junitxml=test-reports/junit.xml --log-file=test-reports/logs.txt tests/unit 25 | 26 | [testenv:py{39,310,311,312,313}-distributed] 27 | passenv = 28 | AWS_PROFILE 29 | AWS_DEFAULT_REGION 30 | AWS_ACCESS_KEY_ID 31 | AWS_SECRET_ACCESS_KEY 32 | AWS_SESSION_TOKEN 33 | setenv = 34 | COV_FAIL_UNDER = 74.00 35 | WR_CPU_COUNT = 16 36 | allowlist_externals = poetry 37 | commands_pre = 38 | poetry install --no-root --sync --all-extras 39 | commands = 40 | {[testenv]commands} 41 | -------------------------------------------------------------------------------- /tutorials/020 - Spark Table Interoperability.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "[![AWS SDK for pandas](_static/logo.png \"AWS SDK for pandas\")](https://github.com/aws/aws-sdk-pandas)\n", 8 | "\n", 9 | "# 20 - Spark Table Interoperability\n", 10 | "\n", 11 | "[awswrangler](https://github.com/aws/aws-sdk-pandas) has no difficulty inserting into, overwriting, or otherwise interacting with a table created by Apache Spark.\n", 12 | "\n", 13 | "But if you want to do the opposite (Spark interacting with a table created by awswrangler), be aware that awswrangler follows the Hive format, so you must be explicit when using Spark's `saveAsTable` method:" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "spark_df.write.format(\"hive\").saveAsTable(\"database.table\")" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "Or just move forward using the `insertInto` alternative:" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "spark_df.write.insertInto(\"database.table\")" 39 | ] 40 | } 41 | ], 42 | "metadata": { 43 | "kernelspec": { 44 | "display_name": "Python 3.9.14", 45 | "language": "python", 46 | "name": "python3" 47 | }, 48 | "language_info": { 49 | "codemirror_mode": { 50 | "name": "ipython", 51 | "version": 3 52 | }, 53 | "file_extension": ".py", 54 | "mimetype": "text/x-python", 55 | "name": "python", 56 | "nbconvert_exporter": "python", 57 | "pygments_lexer": "ipython3", 58 | "version": "3.9.14" 59 | }, 60 | "pycharm": { 61 | "stem_cell": { 62 | "cell_type": "raw", 63 | "metadata": { 64 | "collapsed": false 65 | }, 66 | "source": [] 67 | } 68 | } 69 | }, 70 | "nbformat": 4, 71 | "nbformat_minor": 4 72 | } 73 | -------------------------------------------------------------------------------- /tutorials/_static/glue_catalog_table_products.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/tutorials/_static/glue_catalog_table_products.png -------------------------------------------------------------------------------- /tutorials/_static/glue_catalog_version_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/tutorials/_static/glue_catalog_version_0.png
-------------------------------------------------------------------------------- /tutorials/_static/glue_catalog_version_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/tutorials/_static/glue_catalog_version_1.png -------------------------------------------------------------------------------- /tutorials/_static/glue_is_create.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/tutorials/_static/glue_is_create.png -------------------------------------------------------------------------------- /tutorials/_static/glue_is_setup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/tutorials/_static/glue_is_setup.png -------------------------------------------------------------------------------- /tutorials/_static/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-sdk-pandas/8754e6a53004cfda7627afbc59bf467bf500a005/tutorials/_static/logo.png -------------------------------------------------------------------------------- /validate.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | ruff format --check . 5 | ruff check . 6 | mypy --install-types --non-interactive awswrangler 7 | doc8 --ignore-path docs/source/stubs --max-line-length 120 docs/source 8 | poetry check --lock 9 | --------------------------------------------------------------------------------
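Note on the root-level helper scripts (fix.sh, test.sh, validate.sh): together they form the local development loop. A minimal sketch of how they might be chained from the repository root — the py311 environment name and the worker count of 8 are illustrative values rather than project defaults, and AWS credentials are assumed to be available in the environment, since the tox environments pass the AWS_* variables through to the test run:

./fix.sh           # ruff format + ruff check --fix (rewrites files in place)
./validate.sh      # read-only checks: ruff, mypy, doc8 and poetry check --lock
./test.sh py311 8  # tox -e py311 -- 8, i.e. pytest -n 8 against tests/unit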