├── .gitignore ├── AUTHORS ├── CHANGELOG.md ├── LICENSE.txt ├── MANIFEST.in ├── README.md ├── capability_automated_ingest_filesystem_scanner.jpg ├── capability_automated_ingest_landing_zone.jpg ├── docker ├── demo │ ├── README.md │ ├── compose.yaml │ ├── ingest_celery_workers │ │ └── Dockerfile │ ├── irods_catalog │ │ ├── Dockerfile │ │ └── init-user-db.sh │ └── irods_catalog_provider │ │ ├── Dockerfile │ │ ├── entrypoint.sh │ │ └── setup.input └── ingest-test │ ├── README.md │ ├── docker-compose.yml │ ├── icat │ ├── Dockerfile │ └── postgres_init.sh │ ├── icommands.env │ ├── provider │ ├── Dockerfile │ ├── db_commands.txt │ ├── irods_4.2_provider.input │ ├── irods_4.3_provider.input │ └── start_provider.sh │ ├── run_tests.sh │ └── test │ ├── Dockerfile │ ├── Dockerfile.pure │ ├── irods_environment.json │ └── run_tests.sh ├── irods_capability_automated_ingest ├── __init__.py ├── celery.py ├── char_map_util.py ├── core.py ├── custom_event_handler.py ├── examples │ ├── __init__.py │ ├── append.py │ ├── append_non_leaf_non_root_with_resc_name.py │ ├── append_root_with_resc_name.py │ ├── append_with_resc_name.py │ ├── coll_create_pre_and_post.py │ ├── coll_modify_pre_and_post.py │ ├── data_obj_create_pre_and_post.py │ ├── data_obj_modify_pre_and_post.py │ ├── metadata.py │ ├── no_op.py │ ├── no_retry.py │ ├── post_job.py │ ├── pre_job.py │ ├── put.py │ ├── put_non_leaf_non_root_with_resc_name.py │ ├── put_root_with_resc_name.py │ ├── put_using_char_map.py │ ├── put_with_resc_name.py │ ├── register.py │ ├── register_non_leaf_non_root_with_resc_name.py │ ├── register_root_with_resc_name.py │ ├── register_using_char_map.py │ ├── register_with_peps.py │ ├── register_with_resc_name.py │ ├── replica_root_with_resc_name.py │ ├── replica_with_non_leaf_non_root_resc_name.py │ ├── replica_with_resc_name.py │ ├── retry.py │ ├── statistics.py │ ├── sync.py │ ├── sync_non_leaf_non_root_with_resc_name.py │ ├── sync_retry.py │ ├── sync_root_with_resc_name.py │ ├── sync_with_resc_name.py │ └── timeout.py ├── irods │ ├── __init__.py │ ├── filesystem.py │ ├── irods_utils.py │ └── s3_bucket.py ├── irods_sync.py ├── redis_key.py ├── redis_utils.py ├── sync_actions.py ├── sync_job.py ├── sync_logging.py ├── task_queue.py ├── tasks │ ├── __init__.py │ ├── delete_tasks.py │ ├── filesystem_tasks.py │ ├── irods_task.py │ └── s3_bucket_tasks.py ├── test │ ├── __init__.py │ ├── test_delete_modes.py │ ├── test_irods_sync.py │ ├── test_lib.py │ └── test_s3_bucket_scan.py ├── utils.py └── version.py ├── profile ├── README.md ├── profile.css ├── profile.html ├── profile.js └── profile.py ├── requirements.txt ├── setup.cfg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *egg-info/ 3 | cscope.* 4 | tags 5 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Terrell Russell maintains this project for the iRODS Consortium. 2 | 3 | Hao Xu wrote the first implementation. 4 | Antoine de Torcy wrote the prototype. 5 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 
4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), 6 | and this project **only** adheres to the following _(as defined at [Semantic Versioning](https://semver.org/spec/v2.0.0.html))_: 7 | 8 | > Given a version number MAJOR.MINOR.PATCH, increment the: 9 | > 10 | > - MAJOR version when you make incompatible API changes 11 | > - MINOR version when you add functionality in a backward compatible manner 12 | > - PATCH version when you make backward compatible bug fixes 13 | 14 | ## [0.6.0] - 2024-10-14 15 | 16 | This release adds the ability to delete data objects and collections from iRODS which DO NOT exist in the source being ingested. The feature exposes two new events and an event handler method for controlling the delete mode. 17 | 18 | ### Changed 19 | 20 | - Improve documentation (#23, #137, #140, #150, #183, #214, #257, #289). 21 | - Display more information about jobs in `list` subcommand output (#91). 22 | - Improve testing, Docker demo, and code formatting (#100, #132, #180). 23 | - Replace Celery application and tasks (#211). 24 | - Remove /tmp mount directory from Docker test harness (#235). 25 | - Bump PRC dependency version to <3.0.0 (#263). 26 | - Refactor and clean up code (#180, #272, #274). 27 | - Bump Celery dependency version to <6.0.0 (#266). 28 | - Improve handling of irods_session (#269). 29 | - Disable mingling for Celery works in tests (#280). 30 | - Replace use of "sync" with "tasks" in Celery tasks names (#281). 31 | 32 | ### Removed 33 | 34 | - Remove --append_json option (#60). 35 | - Remove unnecessary directories and files (#245, #246, #247, #248, #262). 36 | 37 | ### Fixed 38 | 39 | - Handle KeyboardInterrupt for `watch` subcommand (#93). 40 | - Use logical path for redis_lock key in create_dirs (#124). 41 | - Do not allow data transfers to redirect by default (#276). 42 | - Do not add unreadable files to sync chunks (#277). 43 | - Restore syncs from S3 bucket to iRODS (#285). 44 | - Fix periodic task name (#293). 45 | 46 | ### Added 47 | 48 | - Add Delete mode (#48, #261, #288). 49 | - Track start time of sync jobs (#92). 50 | - Track jobs stopped by "irods_sync stop" (#210). 51 | - Add Docker Compose project for testing (#244). 52 | 53 | ## [v0.5.0] - 2024-07-17 54 | 55 | This release adds more functionality when scanning an S3 bucket 56 | and updates the testing harness and a number of dependencies. 57 | 58 | Note: The signatures for all pre/post event handler methods 59 | (e.g. `pre_data_obj_create`) have been changed to include an 60 | `*args` parameter. 
Any existing event handler files will need 61 | to be updated by inserting the new parameter just before the 62 | `**options` keyword argument: 63 | 64 | ```diff 65 | @staticmethod 66 | - def pre_data_obj_create(hdlr_mod, logger, session, meta, **options): 67 | + def pre_data_obj_create(hdlr_mod, logger, session, meta, *args, **options): 68 | ``` 69 | 70 | - [#180] Add tags to gitignore 71 | - [#219] Add tests for pre/post event handler methods 72 | - [#219] Add *args to all example pre/post event handler methods 73 | - [#180] Rename 'syncer' to 'scanner_instance' 74 | - Revert "[#219] Add 'op' and 'scanner' to meta" 75 | - Revert "[#180] Rename 'syncer' to 'scanner_instance'" 76 | - Bump certifi from 2023.7.22 to 2024.7.4 77 | - [#180] Rename 'syncer' to 'scanner_instance' 78 | - [#219] Add 'op' and 'scanner' to meta 79 | - [#222] Use %-formatting in log statement 80 | - Bump urllib3 from 1.26.18 to 1.26.19 81 | - [#232] Update deployment instructions in README 82 | - [#216] Remove non-Compose test instructions 83 | - [#174] Update Redis instructions in README 84 | - Bump werkzeug from 2.3.8 to 3.0.3 85 | - Bump jinja2 from 3.1.3 to 3.1.4 86 | - Bump certifi from 2022.12.7 to 2023.7.22 87 | - Bump urllib3 from 1.26.5 to 1.26.18 88 | - Bump jinja2 from 2.11.3 to 3.1.3 89 | - Bump flask from 1.0.2 to 2.2.5 90 | - Bump werkzeug from 2.2.3 to 2.3.8 91 | - Bump redis from 2.10.6 to 4.4.4 92 | - [#215] Fix test failures 93 | - [#220] Update test environment 94 | - [#180] Update supported Python versions 95 | - [#180] black formatter - no functional changes 96 | - [#212] changed REGISTER to REGISTER_SYNC 97 | - [#207] multi read and write from S3 to iRODS for put, putsync 98 | - [#129] Added functionality for PUT, PUT_SYNC with S3 via Minio 99 | - [#129] put_sync functionality for data in S3 100 | 101 | ## [v0.4.2] - 2023-06-26 102 | 103 | This release fixes the exclude and post_job behavior 104 | and updates two dependencies. 105 | 106 | - [#200] Add --exclude_file_type test 107 | - [#201] Amend test for post_job 108 | - [#195] apply CELERY_BROKER_URL env var globally to tests 109 | - [#198] update to Python 3.11 in docker test suite 110 | - [#201] Fix job done condition 111 | - [#200] Fix exclude_file_name/exclude_file_type 112 | - [#200] Add test for --exclude_file_name 113 | - Bump certifi from 2018.11.29 to 2022.12.7 114 | - Bump werkzeug from 0.15.3 to 2.2.3 115 | 116 | ## [v0.4.1] - 2023-03-26 117 | 118 | This release fixes an exit code bug and adds a 119 | character_map event handler method. 120 | 121 | - [#188] eliminate exit call in check_event_handler 122 | - [#40][#166] tests work for unicodeEncodeError and char_map put/register 123 | - [#166] implement object path character remapping (with AVU hints) 124 | - [#180] add .gitignore 125 | - [#177] Fix wrong exit code with --synchronous option 126 | 127 | ## [v0.4.0] - 2022-02-24 128 | 129 | This release abstracts the scanners, eases deployment 130 | by putting the event handler in redis, provides better 131 | SSL support, and now requires Python 3.7+. 
132 | 133 | - [#171] Un-skip tests with resolved issues 134 | - [#167] Bump versions in setup.py and test image 135 | - [#170] Fix tests to use event_handler files 136 | - Bump celery from 4.2.1 to 5.2.2 137 | - Bump urllib3 from 1.24.2 to 1.26.5 138 | - Bump jinja2 from 2.10 to 2.11.3 139 | - [#102] event_handler goes into redis 140 | - [#159] add performance benchmark test harness 141 | - [#147][#157] Allow running workers with env only 142 | - [#156] modified test to use resc_hier string 143 | - [#155] added helper for unicode errors and renamed variables 144 | - [#110] Add several interfaces for refactor 145 | - [irods/python-irodsclient#237] load certificate into ssl context 146 | - fixed the parsing of the S3 region parameter 147 | - Bump werkzeug from 0.14.1 to 0.15.3 148 | - [#125] Add non-SSL connection option for S3 149 | - [#86][#117] Test suite cleanup + docker image 150 | - Correct README.md for docker instructions 151 | - [#109] Update docker steps for Celery 152 | - [#114] Remove zone hint check 153 | - [#90] Honor CELERY_BROKER_URL when present 154 | 155 | ## [v0.3.8] - 2019-11-12 156 | 157 | This release fixes handling of stopped periodic jobs 158 | 159 | - [#103] revoke scheduled celery restart jobs on stop 160 | 161 | ## [v0.3.7] - 2019-08-27 162 | 163 | This release fixes a prefix handling bug when scanning S3. 164 | 165 | - [#98] Preserve trailing slash for S3 prefix 166 | 167 | ## [v0.3.6] - 2019-08-14 168 | 169 | This release fixes a path registration bug when scanning 170 | S3 and updates a dependency. 171 | 172 | - Bump urllib3 from 1.24.1 to 1.24.2 173 | - [#95] Replaced lstrip with index and offset 174 | 175 | ## [v0.3.5] - 2019-04-10 176 | 177 | This release adds support for non utf-8 filenames 178 | and tests for code coverage. 179 | 180 | - [#88] Limit Celery version 181 | - [#63] make easier to test against a non-default zone 182 | - [#63] Add more UnicodeEncodeError tests 183 | - [#51] Add tests for event handler PEPs 184 | - [#31] Handle invalid zone name in target coll 185 | - [#31] Add test for invalid zone name 186 | - [#76] Add max redis version and requirements.txt 187 | - [#40] Handle UnicodeEncodeError filenames for PUT 188 | - [#40] Add tests for non-encodeable filename 189 | - [#78] Add documentation around VM overcommitting 190 | 191 | ## [v0.3.4] - 2018-11-15 192 | 193 | - [#76] Pin redis version to 2.10.6 194 | 195 | ## [v0.3.3] - 2018-10-27 196 | 197 | - [#75] Honor SSL parameters in irods_environment.json 198 | 199 | ## [v0.3.2] - 2018-09-25 200 | 201 | - [#69] Don't follow symlinks to dirs 202 | 203 | ## [v0.3.1] - 2018-09-20 204 | 205 | - [#49] Fix S3 syncing dir and registering folder 206 | 207 | ## [v0.3.0] - 2018-09-19 208 | 209 | This release adds support for scanning S3 in addition to 210 | locally mounted filesystems. To improve performance, a 211 | default Celery worker will now work on 50 files, rather than 1. 
212 | 213 | - [#49] Add support for scanning S3 214 | - [#51] Fix policy points for syncing directories 215 | - [#52] Remove list_dir option 216 | 217 | ## [v0.2.2] - 2018-09-10 218 | 219 | - [#50] fixed invocation used for collection events 220 | 221 | ## [v0.2.1] - 2018-09-06 222 | 223 | - [#45] check permission before enqueueing a file/dir 224 | - [#46] add missing scandir dependency 225 | - [#47] only call cancel if timer is instantiated 226 | 227 | ## [v0.2.0] - 2018-09-03 228 | 229 | - Swap queueing technology to Celery from RedisQueue 230 | - Handles non-utf8-encodeable filenames 231 | - Allows filetype/filename/directory exclusions 232 | - Adds performance profiler 233 | - Adds a NO_OP operation 234 | 235 | ## [v0.1.0] - 2018-05-11 236 | 237 | - Initial release 238 | - Python3 required 239 | - Includes five operations 240 | - Includes logging 241 | - Nascent support for Docker, Kubernetes, and Helm 242 | 243 | [Unreleased]: https://github.com/irods/irods_capability_automated_ingest/compare/v0.3.8...HEAD 244 | [v0.3.8]: https://github.com/irods/irods_capability_automated_ingest/compare/v0.3.7...v0.3.8 245 | [v0.3.7]: https://github.com/irods/irods_capability_automated_ingest/compare/v0.3.6...v0.3.7 246 | [v0.3.6]: https://github.com/irods/irods_capability_automated_ingest/compare/v0.3.5...v0.3.6 247 | [v0.3.5]: https://github.com/irods/irods_capability_automated_ingest/compare/v0.3.4...v0.3.5 248 | [v0.3.4]: https://github.com/irods/irods_capability_automated_ingest/compare/v0.3.3...v0.3.4 249 | [v0.3.3]: https://github.com/irods/irods_capability_automated_ingest/compare/v0.3.2...v0.3.3 250 | [v0.3.2]: https://github.com/irods/irods_capability_automated_ingest/compare/v0.3.1...v0.3.2 251 | [v0.3.1]: https://github.com/irods/irods_capability_automated_ingest/compare/v0.3.0...v0.3.1 252 | [v0.3.0]: https://github.com/irods/irods_capability_automated_ingest/compare/v0.2.2...v0.3.0 253 | [v0.2.2]: https://github.com/irods/irods_capability_automated_ingest/compare/v0.2.1...v0.2.2 254 | [v0.2.1]: https://github.com/irods/irods_capability_automated_ingest/compare/v0.2.0...v0.2.1 255 | [v0.2.0]: https://github.com/irods/irods_capability_automated_ingest/compare/v0.1.0...v0.2.0 256 | [v0.1.0]: https://github.com/irods/irods_capability_automated_ingest/compare/11f9825df721a19dd25dad70aa94e5aa73d1d941...v0.1.0 257 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018, The University of North Carolina at Chapel Hill 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 5 | 6 | - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 7 | 8 | - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 9 | 10 | - Neither the name of the University of North Carolina at Chapel Hill nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
11 | 12 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 13 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include AUTHORS CHANGELOG.md LICENSE.txt README.md -------------------------------------------------------------------------------- /capability_automated_ingest_filesystem_scanner.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irods/irods_capability_automated_ingest/da87d89acab7136ac5610195b0cc5e5abd983795/capability_automated_ingest_filesystem_scanner.jpg -------------------------------------------------------------------------------- /capability_automated_ingest_landing_zone.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irods/irods_capability_automated_ingest/da87d89acab7136ac5610195b0cc5e5abd983795/capability_automated_ingest_landing_zone.jpg -------------------------------------------------------------------------------- /docker/demo/README.md: -------------------------------------------------------------------------------- 1 | # Ingest Demo Compose Project 2 | 3 | **DO NOT USE THIS IN PRODUCTION!!** 4 | 5 | Use this Compose project to test out the ingest tool. There is a Docker volume shared between the iRODS service and the ingest workers that can be used for testing scans. There is another shared volume used to host the Minio storage. 6 | 7 | It's easiest to try out scanning things from the `ingest-celery-workers` service instance. 8 | 9 | ## Build 10 | 11 | ``` 12 | docker compose build 13 | ``` 14 | 15 | The `ingest-celery-workers` service has a build argument that allows for controlling the version of the ingest package. Here's how to use it: 16 | 17 | ``` 18 | docker compose build --build-arg IRODS_AUTOMATED_INGEST_PIP_PACKAGE=git+https://github.com/irods/irods_capability_automated_ingest@main 19 | ``` 20 | 21 | This will clone the specified git repository and checkout the commit-ish specified. You could also specify a released version: 22 | 23 | ``` 24 | docker compose build --build-arg IRODS_AUTOMATED_INGEST_PIP_PACKAGE=irods-capability-automated-ingest==0.4.2 25 | ``` 26 | 27 | If no `--build-arg` is specified, the default build will install the latest released version of the package from PyPI. 
The following is equivalent to not specifying a `--build-arg` when building the project: 28 | ``` 29 | docker compose build --build-arg IRODS_AUTOMATED_INGEST_PIP_PACKAGE=irods-capability-automated-ingest 30 | ``` 31 | 32 | ## Running the project 33 | 34 | This demo simply starts the services and leaves them running with the expectation that commands will be issued to them either through `docker exec` or via client requests to the various endpoints. 35 | 36 | It is a simple project, so starting and stopping it are straightforward. 37 | 38 | To bring the project up: 39 | 40 | ``` 41 | docker compose up 42 | ``` 43 | 44 | To bring the project down: 45 | 46 | ``` 47 | docker compose down 48 | ``` 49 | 50 | The other `docker compose` commands (`start`, `stop`, `restart`, etc.) should work as expected, as well. 51 | 52 | If you wish to adjust the Celery concurrency, modify the Compose YAML file to adjust the `command` run by the `ingest-celery-workers` service: 53 | ```yaml 54 | command: ["-c", "2"] # Adjust the "2" value to whatever concurrency you want 55 | ``` 56 | The `command` can only be adjusted before the container is created, so if you wish to adjust the concurrency after the project is already up, you will need to recreate the `ingest-celery-workers` service instance containers. 57 | 58 | ## Scanning an S3 bucket 59 | 60 | Change the port exposed by the `minio` service, if needed, so that the MinIO Console can be accessed. The MinIO server is being run with access key `irods` and secret key `irodsadmin`. The place from which the job is launched should have a keypair file with these credentials: 61 | ``` 62 | irods 63 | irodsadmin 64 | ``` 65 | 66 | To perform a basic scan of an S3 bucket called, for example, `ingest-test-bucket`, run something like the following: 67 | 68 | ``` 69 | python3 -m irods_capability_automated_ingest.irods_sync start \ 70 | /ingest-test-bucket \ 71 | /tempZone/home/rods/ingest-test-bucket \ 72 | --s3_keypair /path/to/s3keypair.txt \ 73 | --s3_endpoint_domain minio:19000 \ 74 | --s3_insecure_connection \ 75 | --synchronous \ 76 | --progress 77 | ``` 78 | 79 | It's easiest to try out scanning things from the `ingest-celery-workers` service instance. 80 | 81 | ## Performance testing 82 | 83 | While using Docker is not going to get you the best possible performance numbers, it can be useful for benchmarking certain tasks in a reproducible environment. 84 | 85 | This section will describe some interesting things you can do to test out various configurations for performance. 86 | 87 | ### Celery configuration 88 | 89 | As mentioned in other sections, the `concurrency` configuration can be changed before container creation for the `ingest-celery-workers` service by overriding the `command` in the Docker Compose YAML file. This affects the number of Celery workers in a given service instance. 90 | 91 | Celery has a number of other configurations for the workers which can help with performance: [https://docs.celeryq.dev/en/stable/userguide/configuration.html#worker](https://docs.celeryq.dev/en/stable/userguide/configuration.html#worker) 92 | 93 | ### Docker Compose service scaling 94 | 95 | The `ingest-celery-workers` service can be "scaled up" using the `--scale` option of `docker compose up`. 
The default scale is 1 service instance, but the scale can be adjusted like this: 96 | ```bash 97 | docker compose up --scale ingest-celery-workers=4 # replace 4 with desired number of instances 98 | ``` 99 | The above line will spawn 4 instances (containers) of the `ingest-celery-workers` service with each instance having a `concurrency` of whatever has been configured. With the default configuration, this would be 2, for a total of 8 workers across the 4 containers. This can even be done when the project is already up to scale the number of instances up without affecting the existing containers. This can of course be used to scale *down* the number of instances as well. 100 | 101 | ### Network manipulation with Traffic Control (`tc`) 102 | 103 | `tc` can be used to simulate network delays and other networking conditions that may not ordinarily be present. See the `tc` documentation for more information: [https://linux.die.net/man/8/tc](https://linux.die.net/man/8/tc) 104 | 105 | Network traffic manipulation requires enabling the additional capability `NET_ADMIN` in the target containers. Remember that "additional capabilities" can only be added at container creation. This can be done a number of different ways, but the simplest way for this project is to add the following `cap_add` stanza to the `ingest-celery-workers` service in the Docker Compose YAML file: 106 | ```yaml 107 | cap_add: 108 | - NET_ADMIN 109 | ``` 110 | 111 | Here are some useful commands to try executing inside the `ingest-celery-workers` service instance containers for manipulating network traffic: 112 | ```bash 113 | tc qdisc add dev eth0 root netem delay 100ms # to add rule 114 | tc qdisc show dev eth0 # to show rules 115 | tc qdisc del dev eth0 root netem # to delete rule 116 | ``` 117 | Note: In order to run `tc`, the proper package must be installed in the container(s) in which the command will be running. For most Linux distributions, this is `iproute2`. 118 | -------------------------------------------------------------------------------- /docker/demo/compose.yaml: -------------------------------------------------------------------------------- 1 | name: irods-ingest-demo 2 | 3 | services: 4 | redis: 5 | image: redis:7 6 | 7 | irods-catalog: 8 | build: 9 | context: irods_catalog 10 | environment: 11 | - POSTGRES_PASSWORD=testpassword 12 | 13 | irods-catalog-provider: 14 | build: 15 | context: irods_catalog_provider 16 | healthcheck: 17 | test: ["CMD", "su", "-", "irods", "-c", "./irodsctl status | grep Process"] 18 | interval: 10s 19 | timeout: 10s 20 | retries: 3 21 | start_period: 20s 22 | start_interval: 10s 23 | volumes: 24 | - shared_volume:/data/ufs 25 | depends_on: 26 | irods-catalog: 27 | condition: service_started 28 | 29 | ingest-celery-workers: 30 | build: 31 | context: ingest_celery_workers 32 | environment: 33 | - CELERY_BROKER_URL=redis://redis:6379/0 34 | - IRODS_PORT=1247 35 | - IRODS_HOST=irods-catalog-provider 36 | - IRODS_USER_NAME=rods 37 | - IRODS_ZONE_NAME=tempZone 38 | - IRODS_PASSWORD=rods 39 | volumes: 40 | - shared_volume:/data/ufs 41 | depends_on: 42 | redis: 43 | condition: service_started 44 | irods-catalog-provider: 45 | condition: service_healthy 46 | command: ["-c", "2", "--loglevel", "INFO", "-n", "ingest-demo"] # Configure Celery options here. Note: Only takes effect at container creation. 47 | 48 | minio: 49 | image: minio/minio:RELEASE.2024-09-13T20-26-02Z 50 | ports: 51 | - "19000:19000" # This is the port to use for issuing S3 requests. 
52 | - "19001:19001" # Change this port, if needed, to access the MinIO console webpage. 53 | command: minio server /data/minio-s3 54 | environment: 55 | MINIO_ROOT_USER: irods 56 | MINIO_ROOT_PASSWORD: irodsadmin 57 | MINIO_ADDRESS: ":19000" 58 | MINIO_CONSOLE_ADDRESS: ":19001" 59 | 60 | volumes: 61 | shared_volume: 62 | -------------------------------------------------------------------------------- /docker/demo/ingest_celery_workers/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11 2 | 3 | ARG IRODS_AUTOMATED_INGEST_PIP_PACKAGE="irods-capability-automated-ingest" 4 | 5 | RUN pip install ${IRODS_AUTOMATED_INGEST_PIP_PACKAGE} 6 | 7 | ENTRYPOINT ["celery", "-A", "irods_capability_automated_ingest", "worker", "-Q", "restart,path,file"] 8 | 9 | # Override the command at runtime to adjust Celery concurrency and other options. 10 | CMD ["-c", "2"] 11 | -------------------------------------------------------------------------------- /docker/demo/irods_catalog/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM postgres:14 2 | 3 | COPY init-user-db.sh /docker-entrypoint-initdb.d/init-user-db.sh 4 | -------------------------------------------------------------------------------- /docker/demo/irods_catalog/init-user-db.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Adapted from "Initialization script" in documentation for official Postgres dockerhub: 4 | # https://hub.docker.com/_/postgres/ 5 | set -e 6 | 7 | psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" --dbname "$POSTGRES_DB" <<-EOSQL 8 | CREATE DATABASE "ICAT"; 9 | CREATE USER irods WITH PASSWORD 'testpassword'; 10 | GRANT ALL PRIVILEGES ON DATABASE "ICAT" to irods; 11 | EOSQL 12 | -------------------------------------------------------------------------------- /docker/demo/irods_catalog_provider/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:22.04 2 | 3 | ENV DEBIAN_FRONTEND=noninteractive 4 | 5 | RUN apt-get update && \ 6 | apt-get install -y \ 7 | apt-transport-https \ 8 | gnupg \ 9 | wget \ 10 | && \ 11 | apt-get clean && \ 12 | rm -rf /var/lib/apt/lists/* /tmp/* 13 | 14 | RUN wget -qO - https://packages.irods.org/irods-signing-key.asc | apt-key add - && \ 15 | echo "deb [arch=amd64] https://packages.irods.org/apt/ jammy main" | tee /etc/apt/sources.list.d/renci-irods.list 16 | 17 | RUN apt-get update && \ 18 | apt-get install -y \ 19 | libcurl4-gnutls-dev \ 20 | python3 \ 21 | python3-distro \ 22 | python3-jsonschema \ 23 | python3-pip \ 24 | python3-psutil \ 25 | python3-requests \ 26 | rsyslog \ 27 | unixodbc \ 28 | && \ 29 | apt-get clean && \ 30 | rm -rf /var/lib/apt/lists/* /tmp/* 31 | 32 | RUN apt-get update && \ 33 | apt-get install -y \ 34 | irods-database-plugin-postgres \ 35 | irods-runtime \ 36 | irods-server \ 37 | && \ 38 | apt-get clean && \ 39 | rm -rf /var/lib/apt/lists/* /tmp/* 40 | 41 | COPY setup.input / 42 | RUN mv /setup.input /irods_setup.input 43 | 44 | COPY entrypoint.sh / 45 | RUN chmod u+x /entrypoint.sh 46 | ENTRYPOINT ["/entrypoint.sh"] 47 | -------------------------------------------------------------------------------- /docker/demo/irods_catalog_provider/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash -e 2 | 3 | catalog_db_hostname=irods-catalog 4 | 5 | echo "Waiting for iRODS catalog database to be ready" 6 | 7 | until pg_isready -h ${catalog_db_hostname} -d ICAT -U irods -q 8 | do 9 | sleep 1 10 | done 11 | 12 | echo "iRODS catalog database is ready" 13 | 14 | setup_input_file=/irods_setup.input 15 | 16 | if [ -e "${setup_input_file}" ]; then 17 | echo "Running iRODS setup" 18 | python3 /var/lib/irods/scripts/setup_irods.py < "${setup_input_file}" 19 | rm /irods_setup.input 20 | fi 21 | 22 | echo "Starting server" 23 | 24 | cd /usr/sbin 25 | su irods -c 'bash -c "./irodsServer -u"' 26 | -------------------------------------------------------------------------------- /docker/demo/irods_catalog_provider/setup.input: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | irods-catalog 6 | 5432 7 | ICAT 8 | irods 9 | y 10 | testpassword 11 | 12 | y 13 | demoResc 14 | 15 | tempZone 16 | 1247 17 | 20000 18 | 20199 19 | 1248 20 | 21 | rods 22 | y 23 | TEMPORARY_ZONE_KEY 24 | 32_byte_server_negotiation_key__ 25 | 32_byte_server_control_plane_key 26 | rods 27 | 28 | 29 | -------------------------------------------------------------------------------- /docker/ingest-test/README.md: -------------------------------------------------------------------------------- 1 | # How to run the test suite using docker-compose 2 | 3 | ## Step 1: Build the images 4 | 5 | Run the following to build the required images: 6 | ``` 7 | docker compose build 8 | ``` 9 | When testing against an alternative version of iRODS, there are three variables in the `docker-compose.yml` file which must be changed prior to the build step. For example, if testing against iRODS 4.3.3: 10 | ``` 11 | irods-catalog-provider: 12 | build: 13 | args: 14 | irods_version: 4.3.3-0~jammy 15 | irods_version_major_minor: 4.3 16 | py_version: 3 17 | ``` 18 | Note that, depending on whether the iRODS major/minor version is 4.2 or 4.3, the `py_version` takes on the possible values `""` or `"3"`, respectively. 19 | 20 | ## Step 2: Run the project 21 | 22 | Bring up the docker-compose project and the test suite will run on its own: 23 | ``` 24 | docker compose --env-file icommands.env up 25 | ``` 26 | The test suite is one of the services of the docker-compose project, so it will run on its own. The container is tied to the tests running, so it will exit once completed. 27 | The `--env-file` option is required in order to correctly configure the environment for the tests. 
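To run only a subset of the tests, override `TEST_CASE`, which is passed through to the test container (see `icommands.env` and the `ingest-test` service in `docker-compose.yml`). As a sketch, assuming that values set in the shell take precedence over those in `icommands.env`:
```
TEST_CASE=irods_capability_automated_ingest.test.test_irods_sync docker compose --env-file icommands.env up
```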
28 | 29 | ## Step 3: Bring down the project 30 | 31 | The project is not made to come down by itself (yet), so it has to be brought down after each run: 32 | ``` 33 | docker compose down 34 | ``` 35 | -------------------------------------------------------------------------------- /docker/ingest-test/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | some-redis: 5 | image: redis 6 | hostname: redis 7 | networks: 8 | default: 9 | aliases: 10 | - redis 11 | 12 | icat: 13 | build: 14 | context: icat 15 | args: 16 | postgres_password: testpassword 17 | hostname: catalog.example.org 18 | networks: 19 | default: 20 | aliases: 21 | - catalog.example.org 22 | 23 | irods-catalog-provider: 24 | build: 25 | context: provider 26 | args: 27 | irods_version: 4.3.3-0~jammy 28 | irods_version_major_minor: 4.3 29 | py_version: 3 30 | hostname: icat.example.org 31 | networks: 32 | default: 33 | aliases: 34 | - icat.example.org 35 | volumes: 36 | - shared_volume:/data/ufs 37 | depends_on: 38 | - icat 39 | 40 | ingest-test: 41 | build: 42 | context: test 43 | environment: 44 | - "PIP_PACKAGE" 45 | - "TEST_CASE" 46 | - "IRODS_PORT" 47 | - "IRODS_HOST" 48 | - "IRODS_USER_NAME" 49 | - "IRODS_ZONE_NAME" 50 | - "IRODS_ENVIRONMENT_FILE" 51 | - "IRODS_PASSWORD" 52 | volumes: 53 | - shared_volume:/data/ufs 54 | depends_on: 55 | - some-redis 56 | - irods-catalog-provider 57 | 58 | minio: 59 | image: minio/minio:RELEASE.2024-09-13T20-26-02Z 60 | ports: 61 | - "19000:19000" # This is the port to use for issuing S3 requests. 62 | - "19001:19001" # Change this port, if needed, to access the MinIO console webpage. 63 | command: minio server /data/minio-s3 64 | environment: 65 | MINIO_ROOT_USER: irods 66 | MINIO_ROOT_PASSWORD: irodsadmin 67 | MINIO_ADDRESS: ":19000" 68 | MINIO_CONSOLE_ADDRESS: ":19001" 69 | 70 | volumes: 71 | shared_volume: 72 | -------------------------------------------------------------------------------- /docker/ingest-test/icat/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM postgres:11 2 | 3 | ARG postgres_password 4 | ENV POSTGRES_PASSWORD ${postgres_password} 5 | 6 | COPY postgres_init.sh /docker-entrypoint-initdb.d/ 7 | -------------------------------------------------------------------------------- /docker/ingest-test/icat/postgres_init.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" --dbname "$POSTGRES_DB" <<-EOSQL 5 | CREATE USER irods with password 'testpassword'; 6 | CREATE DATABASE "ICAT"; 7 | GRANT ALL PRIVILEGES ON DATABASE "ICAT" TO irods; 8 | EOSQL 9 | -------------------------------------------------------------------------------- /docker/ingest-test/icommands.env: -------------------------------------------------------------------------------- 1 | IRODS_PORT=1247 2 | IRODS_HOST=icat.example.org 3 | IRODS_USER_NAME=rods 4 | IRODS_ZONE_NAME=tempZone 5 | IRODS_ENVIRONMENT_FILE=/irods_environment.json 6 | IRODS_PASSWORD=rods 7 | PIP_PACKAGE=git+https://github.com/irods/irods_capability_automated_ingest@main 8 | TEST_CASE="irods_capability_automated_ingest.test.test_s3_bucket_scan irods_capability_automated_ingest.test.test_delete_modes irods_capability_automated_ingest.test.test_irods_sync" 9 | -------------------------------------------------------------------------------- /docker/ingest-test/provider/Dockerfile: 
-------------------------------------------------------------------------------- 1 | # 2 | # iRODS Provider Image. 3 | # 4 | FROM ubuntu:22.04 5 | ARG irods_version 6 | ARG irods_version_major_minor 7 | ARG py_version 8 | ENV PY_VERSION="${py_version}" 9 | 10 | ENV DEBIAN_FRONTEND=noninteractive 11 | 12 | RUN apt-get update && \ 13 | apt-get install -y \ 14 | apt-transport-https \ 15 | gnupg \ 16 | wget \ 17 | && \ 18 | apt-get clean && \ 19 | rm -rf /var/lib/apt/lists/* /tmp/* 20 | 21 | # Install pre-requisites 22 | RUN wget -qO - https://packages.irods.org/irods-signing-key.asc | apt-key add - && \ 23 | echo "deb [arch=amd64] https://packages.irods.org/apt/ jammy main" | tee /etc/apt/sources.list.d/renci-irods.list 24 | 25 | RUN apt-get update && \ 26 | apt-get install -y \ 27 | libcurl4-gnutls-dev \ 28 | python3 \ 29 | python3-distro \ 30 | python3-jsonschema \ 31 | python3-pip \ 32 | python3-psutil \ 33 | python3-requests \ 34 | rsyslog \ 35 | unixodbc \ 36 | && \ 37 | apt-get clean && \ 38 | rm -rf /var/lib/apt/lists/* /tmp/* 39 | 40 | RUN wget -qO - https://packages.irods.org/irods-signing-key.asc | apt-key add -; \ 41 | echo "deb [arch=amd64] https://packages.irods.org/apt/ $(lsb_release -sc) main" | tee /etc/apt/sources.list.d/renci-irods.list; \ 42 | apt-get update && \ 43 | apt-get install -y \ 44 | irods-runtime=${irods_version} \ 45 | irods-icommands=${irods_version} \ 46 | irods-server=${irods_version} \ 47 | irods-database-plugin-postgres=${irods_version} 48 | 49 | # Set command to execute when launching the container. 50 | COPY --chmod=755 start_provider.sh / 51 | COPY irods_${irods_version_major_minor}_provider.input /irods_provider.input 52 | ENTRYPOINT ["./start_provider.sh"] 53 | -------------------------------------------------------------------------------- /docker/ingest-test/provider/db_commands.txt: -------------------------------------------------------------------------------- 1 | CREATE DATABASE "ICAT"; 2 | CREATE USER irods WITH PASSWORD 'testpassword'; 3 | GRANT ALL PRIVILEGES ON DATABASE "ICAT" to irods; 4 | \q 5 | -------------------------------------------------------------------------------- /docker/ingest-test/provider/irods_4.2_provider.input: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | catalog.example.org 6 | 5432 7 | ICAT 8 | irods 9 | y 10 | testpassword 11 | 12 | tempZone 13 | 1247 14 | 20000 15 | 20199 16 | 1248 17 | 18 | rods 19 | y 20 | TEMPORARY_ZONE_KEY 21 | 32_byte_server_negotiation_key__ 22 | 32_byte_server_control_plane_key 23 | rods 24 | 25 | 26 | -------------------------------------------------------------------------------- /docker/ingest-test/provider/irods_4.3_provider.input: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | catalog.example.org 6 | 5432 7 | ICAT 8 | irods 9 | y 10 | testpassword 11 | 12 | y 13 | demoResc 14 | /var/lib/irods/Vault 15 | tempZone 16 | 1247 17 | 20000 18 | 20199 19 | 1248 20 | 21 | rods 22 | y 23 | TEMPORARY_ZONE_KEY 24 | 32_byte_server_negotiation_key__ 25 | 32_byte_server_control_plane_key 26 | rods 27 | 28 | 29 | -------------------------------------------------------------------------------- /docker/ingest-test/provider/start_provider.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Start the Postgres database. 
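# (The database itself runs in the separate catalog container; this loop only waits for it to accept connections.)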
4 | counter=0 5 | until pg_isready -h catalog.example.org -d ICAT -U irods -q 6 | do 7 | sleep 1 8 | ((counter += 1)) 9 | done 10 | echo Postgres took approximately $counter seconds to fully start ... 11 | 12 | # Set up iRODS if not already done 13 | if [ ! -e /var/lib/irods/setup_complete ] 14 | then 15 | python${PY_VERSION} /var/lib/irods/scripts/setup_irods.py < /irods_provider.input 16 | fi 17 | 18 | # run the server 19 | su - irods -c "/var/lib/irods/irodsctl restart" 20 | 21 | touch /var/lib/irods/setup_complete 22 | 23 | # Keep container running if the test fails. 24 | tail -f /dev/null 25 | # Is this better? sleep 2147483647d 26 | 27 | -------------------------------------------------------------------------------- /docker/ingest-test/run_tests.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | set -e 4 | 5 | docker-compose --env-file icommands.env up & 6 | 7 | until [ $(docker container inspect -f '{{.State.Status}}' ingest-test_ingest-test_1) ]; do 8 | #echo "waiting for container to exist" 9 | sleep 1 10 | done 11 | 12 | while [ ! $(docker container inspect -f '{{.State.Status}}' ingest-test_ingest-test_1) == "running" ]; do 13 | #echo "waiting for container to run" 14 | sleep 1 15 | done 16 | 17 | #echo "test container is up" 18 | 19 | while [ $(docker container inspect -f '{{.State.Status}}' ingest-test_ingest-test_1) == "running" ]; do 20 | #echo "waiting for tests to finish" 21 | sleep 1 22 | done 23 | 24 | docker-compose down 25 | 26 | -------------------------------------------------------------------------------- /docker/ingest-test/test/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11 2 | 3 | RUN apt update && apt install -y netcat-traditional 4 | 5 | COPY irods_environment.json / 6 | 7 | ENV TEST_CASE=${TEST_CASE} 8 | 9 | COPY run_tests.sh / 10 | RUN chmod u+x /run_tests.sh 11 | ENTRYPOINT ["./run_tests.sh"] 12 | -------------------------------------------------------------------------------- /docker/ingest-test/test/Dockerfile.pure: -------------------------------------------------------------------------------- 1 | FROM python:3.5 2 | 3 | ARG PIP_PACKAGE="irods-capability-automated-ingest" 4 | 5 | RUN pip install ${PIP_PACKAGE} 6 | 7 | COPY irods_environment.json / 8 | 9 | ENV TEST_CASE=${TEST_CASE} 10 | 11 | ENTRYPOINT python -m unittest ${TEST_CASE:-irods_capability_automated_ingest.test.test_irods_sync} 12 | 13 | #FROM ingest:latest 14 | #ENV TEST_CASE=${TEST_CASE} 15 | #ENTRYPOINT python -m unittest ${TEST_CASE:-irods_capability_automated_ingest.test.test_irods_sync} 16 | -------------------------------------------------------------------------------- /docker/ingest-test/test/irods_environment.json: -------------------------------------------------------------------------------- 1 | { 2 | "irods_host": "icat.example.org", 3 | "irods_port": 1247, 4 | "irods_user": "rods", 5 | "irods_zone_name": "tempZone" 6 | } 7 | -------------------------------------------------------------------------------- /docker/ingest-test/test/run_tests.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash -ex 2 | 3 | pip install ${PIP_PACKAGE} 4 | 5 | # Wait until the provider is up and accepting connections. 
6 | until nc -z icat.example.org 1247; do 7 | sleep 1 8 | done 9 | 10 | sleep 10 11 | 12 | python -m unittest -v ${TEST_CASE} 13 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irods/irods_capability_automated_ingest/da87d89acab7136ac5610195b0cc5e5abd983795/irods_capability_automated_ingest/__init__.py -------------------------------------------------------------------------------- /irods_capability_automated_ingest/celery.py: -------------------------------------------------------------------------------- 1 | from . import custom_event_handler, sync_logging 2 | 3 | from celery import Celery 4 | from celery.signals import task_prerun, task_postrun 5 | 6 | import traceback 7 | 8 | app = Celery("irods_capability_automated_ingest") 9 | 10 | app.conf.update( 11 | include=[ 12 | "irods_capability_automated_ingest.tasks.delete_tasks", 13 | "irods_capability_automated_ingest.tasks.filesystem_tasks", 14 | "irods_capability_automated_ingest.tasks.s3_bucket_tasks", 15 | ] 16 | ) 17 | 18 | 19 | @task_prerun.connect() 20 | def task_prerun(task_id=None, task=None, args=None, kwargs=None, **kw): 21 | meta = args[0] 22 | if meta["profile"]: 23 | config = meta["config"] 24 | profile_log = config.get("profile") 25 | logger = sync_logging.get_sync_logger(profile_log) 26 | logger.info( 27 | "task_prerun", 28 | event_id=task_id, 29 | event_name=task.name, 30 | path=meta.get("path"), 31 | target=meta.get("target"), 32 | hostname=task.request.hostname, 33 | index=current_process().index, 34 | ) 35 | 36 | 37 | @task_postrun.connect() 38 | def task_postrun( 39 | task_id=None, task=None, args=None, kwargs=None, retval=None, state=None, **kw 40 | ): 41 | meta = args[0] 42 | if meta["profile"]: 43 | config = meta["config"] 44 | profile_log = config.get("profile") 45 | logger = sync_logging.get_sync_logger(profile_log) 46 | logger.info( 47 | "task_postrun", 48 | event_id=task_id, 49 | event_name=task.name, 50 | path=meta.get("path"), 51 | target=meta.get("target"), 52 | hostname=task.request.hostname, 53 | index=current_process().index, 54 | state=state, 55 | ) 56 | 57 | 58 | class RestartTask(app.Task): 59 | def on_failure(self, exc, task_id, args, kwargs, einfo): 60 | meta = args[0] 61 | config = meta["config"] 62 | job_name = meta["job_name"] 63 | logger = sync_logging.get_sync_logger(config["log"]) 64 | logger.error( 65 | "failed_restart", 66 | path=meta["path"], 67 | job_name=job_name, 68 | task_id=task_id, 69 | exc=exc, 70 | einfo=einfo, 71 | traceback=traceback.extract_tb(exc.__traceback__), 72 | ) 73 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/char_map_util.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import hashlib 3 | import logging 4 | import re 5 | import collections 6 | import string 7 | import functools 8 | 9 | __all__ = ["translate_path"] 10 | _regex = re.compile("") 11 | 12 | # use as a key in the character mapping to ensure a regex matches to exactly one character 13 | 14 | 15 | def _re_wrapper(regex): 16 | return lambda ch: type(ch) is str and len(ch) == 1 and regex.match(ch) 17 | 18 | 19 | _string_replace = lambda _string, _map: _string.translate( 20 | {ord(k): v for k, v in _map.items() if v is not None} 21 | ) 22 | 23 | _logger = logging.getLogger("char_map_util") 24 | 
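# Candidate separator and radix-digit characters for building the suffix that encodes remapped characters; the first call to _update_Allowed() filters each set down to the characters the active character map leaves unchanged.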
25 | SEPARATORS = [c for c in "-~_"] 26 | Allowed = [ 27 | {}, 28 | { 29 | "separators": "".join(SEPARATORS), 30 | "radixchars": string.digits + string.ascii_letters, 31 | "punctuation": "".join( 32 | sorted(set(string.punctuation) - set(["/"] + SEPARATORS)) 33 | ), 34 | }, 35 | ] 36 | 37 | 38 | def _allowed_in_string(s, map_fn): 39 | s_new = translate_string(s, map_fn) 40 | return "".join(a for a, b in zip(s, s_new) if a == b) 41 | 42 | 43 | class InvalidUsage(Exception): 44 | pass 45 | 46 | 47 | def _update_Allowed(map_fn=None): 48 | if len(Allowed) == 2: 49 | if map_fn is None: 50 | raise InvalidUsage( 51 | "The first call to this function needs a dictionary in map_fn" 52 | ) 53 | d = Allowed.pop() 54 | Allowed[0].update((k, _allowed_in_string(v, map_fn)) for k, v in d.items()) 55 | return Allowed[0] 56 | 57 | 58 | def _allowed_of_type(key, map_fn=None): 59 | return _update_Allowed(map_fn)[key] 60 | 61 | 62 | _fb_hash = hashlib.sha224 63 | _fb_obj = _fb_hash().digest() 64 | 65 | 66 | def _fallback(name=None): 67 | if name is None: 68 | return _fb_obj 69 | else: 70 | h = _fb_hash() 71 | h.update(name.encode("utf8")) 72 | return h.digest() 73 | 74 | 75 | _change_encoding_test = lambda c: c 76 | _change_encoding_default = lambda c: ( 77 | chr(c).encode("utf8") if type(c) is int else c.encode("utf8") 78 | ) 79 | 80 | 81 | # must be called after first use of _encoded_differences() 82 | def _diffs_encoded_to_suffix(diff_bytes, rxarray=None): 83 | if not diff_bytes: 84 | return "" 85 | number = functools.reduce((lambda a, b: (a << 8) | b), diff_bytes) 86 | radixrep = _update_Allowed()["separators"][:1] 87 | if rxarray is None: 88 | rxarray = _update_Allowed()["radixchars"] 89 | L = len(rxarray) 90 | while number: 91 | number, mod = divmod(number, L) 92 | radixrep += rxarray[mod] 93 | return radixrep 94 | 95 | 96 | def translate_string(s, mp): 97 | if not isinstance(mp, dict): 98 | mp = collections.OrderedDict(mp) 99 | for key, value in mp.items(): 100 | if isinstance(key, tuple): 101 | s = _string_replace(s, {k: value for k in key}) 102 | elif isinstance(key, _regex.__class__): 103 | s = key.sub(value, s) 104 | elif isinstance(key, str): 105 | s = _string_replace(s, {key: value}) 106 | elif callable(key): 107 | s = "".join(value if key(c) else c for c in s) 108 | return s 109 | 110 | 111 | def _encoded_differences(filename, MapFn=None, xfunc=_change_encoding_default): 112 | rx = _allowed_of_type("radixchars", map_fn=MapFn) 113 | newname = translate_string(filename, MapFn) 114 | gen = ( 115 | (tuple(xfunc(_) for _ in a), b) 116 | for a, b in zip(enumerate(filename), newname) 117 | if a[1] != b 118 | ) 119 | MaxBytes = len(_fallback()) 120 | encoded_change = b"" 121 | if xfunc is _change_encoding_test: 122 | return list(gen) 123 | # Generate suffix from encoded changes or the constant length SHA2 digest, whichever is shorter. 
124 | while True: 125 | try: 126 | g = next(gen) 127 | except StopIteration: 128 | break 129 | encoded_change += b"".join(g[0]) 130 | if len(encoded_change) >= MaxBytes: 131 | _logger.warning("Using SHA2 for {filename=}") 132 | return newname, _fallback(filename) 133 | return newname, encoded_change 134 | 135 | 136 | def translate_path_element(filename, map_fn, use_suffix=True): 137 | newname, enc_diffs = _encoded_differences(filename, map_fn) 138 | if use_suffix: 139 | suffix = _diffs_encoded_to_suffix(enc_diffs) 140 | return newname + suffix 141 | else: 142 | return newname 143 | 144 | 145 | def translate_path(path, mp, translate_function=translate_path_element): 146 | t_elem = [] 147 | for el in path.split("/"): 148 | if el == "": 149 | if not t_elem: 150 | t_elem.append("") 151 | continue 152 | new_el = translate_function(el, mp) 153 | t_elem.append(new_el) 154 | return "/".join(t_elem) 155 | 156 | 157 | if __name__ == "__main__": 158 | # Demonstration 159 | map_fn = ( 160 | [("!", "~", "0", "1"), "_"], # map your choice of things to an underscore 161 | [re.compile("[\u0100-\U00101fff]"), "~"], 162 | ) # map all non-ascii unicode to a tilde 163 | m = _update_Allowed(map_fn) 164 | import pprint 165 | 166 | pprint.pprint(m) 167 | newname, enc_diffs = _encoded_differences( 168 | "#041!2~93#041!2\u00ff9\U00101010Z", map_fn 169 | ) 170 | suffix = _diffs_encoded_to_suffix(enc_diffs) 171 | print(f"newname={newname}\nsuffix={suffix}") 172 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/core.py: -------------------------------------------------------------------------------- 1 | class Core(object): 2 | @classmethod 3 | def on_data_obj_create(cls, func, *args, **options): 4 | if hasattr(cls, "pre_data_obj_create"): 5 | cls.pre_data_obj_create(*args, **options) 6 | 7 | func(*args, **options) 8 | 9 | if hasattr(cls, "post_data_obj_create"): 10 | cls.post_data_obj_create(*args, **options) 11 | 12 | @classmethod 13 | def on_data_obj_modify(cls, func, *args, **options): 14 | if hasattr(cls, "pre_data_obj_modify"): 15 | cls.pre_data_obj_modify(*args, **options) 16 | 17 | func(*args, **options) 18 | 19 | if hasattr(cls, "post_data_obj_modify"): 20 | cls.post_data_obj_modify(*args, **options) 21 | 22 | @classmethod 23 | def on_data_obj_delete(cls, func, *args, **options): 24 | if hasattr(cls, "pre_data_obj_delete"): 25 | cls.pre_data_obj_delete(*args, **options) 26 | 27 | func(*args, **options) 28 | 29 | if hasattr(cls, "post_data_obj_delete"): 30 | cls.post_data_obj_delete(*args, **options) 31 | 32 | @classmethod 33 | def on_coll_create(cls, func, *args, **options): 34 | if hasattr(cls, "pre_coll_create"): 35 | cls.pre_coll_create(*args, **options) 36 | 37 | func(*args, **options) 38 | 39 | if hasattr(cls, "post_coll_create"): 40 | cls.post_coll_create(*args, **options) 41 | 42 | @classmethod 43 | def on_coll_modify(cls, func, *args, **options): 44 | if hasattr(cls, "pre_coll_modify"): 45 | cls.pre_coll_modify(*args, **options) 46 | 47 | func(*args, **options) 48 | 49 | if hasattr(cls, "post_coll_modify"): 50 | cls.post_coll_modify(*args, **options) 51 | 52 | @classmethod 53 | def on_coll_delete(cls, func, *args, **options): 54 | if hasattr(cls, "pre_coll_delete"): 55 | cls.pre_coll_delete(*args, **options) 56 | 57 | func(*args, **options) 58 | 59 | if hasattr(cls, "post_coll_delete"): 60 | cls.post_coll_delete(*args, **options) 61 | -------------------------------------------------------------------------------- 
/irods_capability_automated_ingest/custom_event_handler.py: -------------------------------------------------------------------------------- 1 | from .redis_key import redis_key_handle 2 | from .redis_utils import get_redis 3 | 4 | import importlib 5 | import os.path 6 | import sys 7 | 8 | 9 | class custom_event_handler(object): 10 | def __init__(self, meta): 11 | self.meta = meta.copy() 12 | self.logger = self.meta["config"]["log"] 13 | 14 | def get_module(self, rtn_mod_and_class=False): # get_ev_handler_class or something 15 | r = get_redis(self.meta["config"]) 16 | key = "event_handler" 17 | 18 | job_name = self.meta["job_name"] 19 | 20 | # reconstructing redis key from meta 21 | event_handler_key_str = self.meta["event_handler_key"] 22 | event_handler_split = event_handler_key_str.split(":/") 23 | event_handler_key = redis_key_handle( 24 | r, event_handler_split[0], event_handler_split[1] 25 | ) 26 | 27 | content_string = event_handler_key.get_value() 28 | 29 | # getting uuid for file construction 30 | event_handler_str = event_handler_key.get_key().split("::") 31 | uuid_ = event_handler_str[1] 32 | 33 | eh_file_name = "event_handler" + job_name.replace(".", "__") + uuid_ 34 | eh_path = "/tmp/" + eh_file_name + ".py" 35 | 36 | # if the file does not already exist, create new file 37 | if not (os.path.isfile(eh_path)): 38 | with open(eh_path, "w") as eh: 39 | eh.write(content_string.decode("utf-8")) 40 | 41 | # import event_handler module 42 | if "/tmp" not in sys.path: 43 | sys.path.insert(0, "/tmp") 44 | mod = importlib.import_module(eh_file_name) 45 | if mod is None: 46 | return (None, None) if rtn_mod_and_class else None 47 | 48 | cls = getattr(mod, key, None) 49 | if rtn_mod_and_class: 50 | return (mod, cls) 51 | 52 | return cls 53 | 54 | def hasattr(self, attr): 55 | module = self.get_module() 56 | return module is not None and hasattr(module, attr) 57 | 58 | def call(self, hdlr, logger, func, *args, **options): 59 | (mod, cls) = self.get_module(rtn_mod_and_class=True) 60 | args = (mod,) + tuple(args) 61 | 62 | if self.hasattr(hdlr): 63 | logger.debug( 64 | "calling [" 65 | + hdlr 66 | + "] in event handler: args = " 67 | + str(args) 68 | + ", options = " 69 | + str(options) 70 | ) 71 | getattr(cls, hdlr)(func, *args, **options) 72 | else: 73 | func(*args, **options) 74 | 75 | # attribute getters 76 | def max_retries(self): 77 | if self.hasattr("max_retries"): 78 | module = self.get_module() 79 | return module.max_retries(module, self.logger, self.meta) 80 | return 0 81 | 82 | def timeout(self): 83 | if self.hasattr("timeout"): 84 | module = self.get_module() 85 | return module.timeout(module, self.logger, self.meta) 86 | return 3600 87 | 88 | def delay(self, retries): 89 | if self.hasattr("delay"): 90 | module = self.get_module() 91 | return module.delay(module, self.logger, self.meta, retries) 92 | return 0 93 | 94 | def operation(self, session, **options): 95 | if self.hasattr("operation"): 96 | return self.get_module().operation(session, self.meta, **options) 97 | 98 | from .utils import Operation 99 | 100 | return Operation.REGISTER_SYNC 101 | # return None 102 | 103 | def to_resource(self, session, **options): 104 | if self.hasattr("to_resource"): 105 | return self.get_module().to_resource(session, self.meta, **options) 106 | return None 107 | 108 | def target_path(self, session, **options): 109 | if self.hasattr("target_path"): 110 | return self.get_module().target_path(session, self.meta, **options) 111 | return None 112 | 113 | def delete_mode(self): 114 | if 
self.hasattr("delete_mode"): 115 | return self.get_module().delete_mode(self.meta) 116 | 117 | from .utils import DeleteMode 118 | 119 | return DeleteMode.DO_NOT_DELETE 120 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irods/irods_capability_automated_ingest/da87d89acab7136ac5610195b0cc5e5abd983795/irods_capability_automated_ingest/examples/__init__.py -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/append.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def operation(session, meta, **options): 8 | return Operation.PUT_APPEND 9 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/append_non_leaf_non_root_with_resc_name.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def to_resource(session, meta, **options): 8 | return "regiResc2" 9 | 10 | @staticmethod 11 | def operation(session, meta, **options): 12 | return Operation.PUT_APPEND 13 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/append_root_with_resc_name.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def to_resource(session, meta, **options): 8 | return "regiResc2Root" 9 | 10 | @staticmethod 11 | def operation(session, meta, **options): 12 | return Operation.PUT_APPEND 13 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/append_with_resc_name.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def to_resource(session, meta, **options): 8 | return "regiResc2a" 9 | 10 | @staticmethod 11 | def operation(session, meta, **options): 12 | return Operation.PUT_APPEND 13 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/coll_create_pre_and_post.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from irods_capability_automated_ingest.core import Core 4 | from irods_capability_automated_ingest.utils import Operation 5 | 6 | OPERATION = Operation.REGISTER_SYNC 7 | 8 | 9 | class event_handler(Core): 10 | @staticmethod 11 | def operation(session, meta, **options): 12 | return OPERATION 13 | 14 | @staticmethod 15 | def pre_coll_create(hdlr_mod, logger, session, meta, *args, **options): 16 | created_collection = meta["target"] 17 | 
parent_of_created_collection = "/".join(created_collection.split("/")[:-1]) 18 | 19 | attribute = "pre_coll_create" 20 | value = created_collection 21 | unit = OPERATION.name 22 | 23 | coll = session.collections.get(parent_of_created_collection) 24 | coll.metadata.add(attribute, value, unit) 25 | 26 | @staticmethod 27 | def post_coll_create(hdlr_mod, logger, session, meta, *args, **options): 28 | created_collection = meta["target"] 29 | parent_of_created_collection = "/".join(created_collection.split("/")[:-1]) 30 | 31 | attribute = "post_coll_create" 32 | value = created_collection 33 | unit = OPERATION.name 34 | 35 | coll = session.collections.get(parent_of_created_collection) 36 | coll.metadata.add(attribute, value, unit) 37 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/coll_modify_pre_and_post.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from irods_capability_automated_ingest.core import Core 4 | from irods_capability_automated_ingest.utils import Operation 5 | 6 | OPERATION = Operation.REGISTER_SYNC 7 | 8 | 9 | class event_handler(Core): 10 | @staticmethod 11 | def operation(session, meta, **options): 12 | return OPERATION 13 | 14 | @staticmethod 15 | def pre_coll_modify(hdlr_mod, logger, session, meta, *args, **options): 16 | modified_collection = meta["target"] 17 | 18 | attribute = "pre_coll_modify" 19 | value = meta["job_name"] 20 | unit = OPERATION.name 21 | 22 | coll = session.collections.get(modified_collection) 23 | coll.metadata.add(attribute, value, unit) 24 | 25 | @staticmethod 26 | def post_coll_modify(hdlr_mod, logger, session, meta, *args, **options): 27 | modified_collection = meta["target"] 28 | 29 | attribute = "post_coll_modify" 30 | value = meta["job_name"] 31 | unit = OPERATION.name 32 | 33 | coll = session.collections.get(modified_collection) 34 | coll.metadata.add(attribute, value, unit) 35 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/data_obj_create_pre_and_post.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from irods_capability_automated_ingest.core import Core 4 | from irods_capability_automated_ingest.utils import Operation 5 | 6 | OPERATION = Operation.REGISTER_SYNC 7 | 8 | 9 | class event_handler(Core): 10 | @staticmethod 11 | def operation(session, meta, **options): 12 | return OPERATION 13 | 14 | @staticmethod 15 | def pre_data_obj_create(hdlr_mod, logger, session, meta, *args, **options): 16 | created_data_object_path = meta["target"] 17 | parent_collection_of_created_data_object = "/".join( 18 | created_data_object_path.split("/")[:-1] 19 | ) 20 | 21 | attribute = "pre_data_obj_create" 22 | value = created_data_object_path 23 | unit = OPERATION.name 24 | 25 | coll = session.collections.get(parent_collection_of_created_data_object) 26 | coll.metadata.add(attribute, value, unit) 27 | 28 | @staticmethod 29 | def post_data_obj_create(hdlr_mod, logger, session, meta, *args, **options): 30 | created_data_object_path = meta["target"] 31 | parent_collection_of_created_data_object = "/".join( 32 | created_data_object_path.split("/")[:-1] 33 | ) 34 | 35 | attribute = "post_data_obj_create" 36 | value = created_data_object_path 37 | unit = OPERATION.name 38 | 39 | coll = session.collections.get(parent_collection_of_created_data_object) 40 | coll.metadata.add(attribute, value, unit) 
41 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/data_obj_modify_pre_and_post.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from irods_capability_automated_ingest.core import Core 4 | from irods_capability_automated_ingest.utils import Operation 5 | 6 | OPERATION = Operation.REGISTER_SYNC 7 | 8 | 9 | class event_handler(Core): 10 | @staticmethod 11 | def operation(session, meta, **options): 12 | return OPERATION 13 | 14 | @staticmethod 15 | def pre_data_obj_modify(hdlr_mod, logger, session, meta, *args, **options): 16 | created_data_object_path = meta["target"] 17 | parent_collection_of_created_data_object = "/".join( 18 | created_data_object_path.split("/")[:-1] 19 | ) 20 | 21 | attribute = "pre_data_obj_modify" 22 | value = created_data_object_path 23 | unit = OPERATION.name 24 | 25 | coll = session.collections.get(parent_collection_of_created_data_object) 26 | coll.metadata.add(attribute, value, unit) 27 | 28 | @staticmethod 29 | def post_data_obj_modify(hdlr_mod, logger, session, meta, *args, **options): 30 | created_data_object_path = meta["target"] 31 | parent_collection_of_created_data_object = "/".join( 32 | created_data_object_path.split("/")[:-1] 33 | ) 34 | 35 | attribute = "post_data_obj_modify" 36 | value = created_data_object_path 37 | unit = OPERATION.name 38 | 39 | coll = session.collections.get(parent_collection_of_created_data_object) 40 | coll.metadata.add(attribute, value, unit) 41 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/metadata.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | from irods.meta import iRODSMeta 4 | import os 5 | 6 | filesystem_mode = "filesystem::mode" 7 | 8 | 9 | class event_handler(Core): 10 | @staticmethod 11 | def post_data_obj_create(hdlr_mod, logger, session, meta, **options): 12 | target = meta["target"] 13 | path = meta["path"] 14 | s = os.stat(path) 15 | mode = s.st_mode 16 | 17 | obj = session.data_objects.get(target) 18 | obj.metadata.add(filesystem_mode, str(mode), "") 19 | 20 | @staticmethod 21 | def post_data_obj_modify(hdlr_mod, logger, session, meta, **options): 22 | target = meta["target"] 23 | path = meta["path"] 24 | s = os.stat(path) 25 | mode = s.st_mode 26 | obj = session.data_objects.get(target) 27 | obj.metadata[filesystem_mode] = iRODSMeta(filesystem_mode, str(mode)) 28 | 29 | @staticmethod 30 | def operation(session, meta, **options): 31 | return Operation.REGISTER_SYNC 32 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/no_op.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def operation(session, meta, **options): 8 | return Operation.NO_OP 9 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/no_retry.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from 
irods_capability_automated_ingest.utils import Operation 3 | from irods_capability_automated_ingest.redis_utils import get_redis 4 | 5 | 6 | class event_handler(Core): 7 | @staticmethod 8 | def operation(session, meta, **options): 9 | return Operation.NO_OP 10 | 11 | @staticmethod 12 | def pre_data_obj_create(hdlr_mod, logger, session, meta, *args, **options): 13 | target = meta["target"] 14 | path = meta["path"] 15 | 16 | r = get_redis(meta["config"]) 17 | failures = r.get("failures:" + path) 18 | if failures is None: 19 | failures = 0 20 | 21 | r.incr("failures:" + path) 22 | 23 | if failures == 0: 24 | raise RuntimeError("no failures") 25 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/post_job.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def operation(session, meta, **options): 8 | return Operation.NO_OP 9 | 10 | @staticmethod 11 | def post_job(hdlr_mod, logger, meta): 12 | # Amend here for testing so that we can ensure that post_job executes once per job. 13 | with open("/tmp/a", "a") as f: 14 | f.write("post_job") 15 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/pre_job.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def operation(session, meta, **options): 8 | return Operation.NO_OP 9 | 10 | @staticmethod 11 | def pre_job(hdlr_mod, logger, meta): 12 | with open("/tmp/a", "w") as f: 13 | f.write("pre_job") 14 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/put.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def operation(session, meta, **options): 8 | return Operation.PUT 9 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/put_non_leaf_non_root_with_resc_name.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def to_resource(session, meta, **options): 8 | return "regiResc2" 9 | 10 | @staticmethod 11 | def operation(session, meta, **options): 12 | return Operation.PUT 13 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/put_root_with_resc_name.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def to_resource(session, meta, **options): 8 | return "regiResc2Root" 9 | 10 | @staticmethod 11 | def 
operation(session, meta, **options): 12 | return Operation.PUT 13 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/put_using_char_map.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | import re 4 | 5 | 6 | class event_handler(Core): 7 | re_non_alphanum = re.compile("[^a-zA-Z0-9]") 8 | 9 | @staticmethod 10 | def character_map(): 11 | return { 12 | event_handler.re_non_alphanum: "_" 13 | } # map any non-ascii or non-alphanumeric 14 | # character to '_' 15 | 16 | @staticmethod 17 | def operation(session, meta, **options): 18 | return Operation.PUT 19 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/put_with_resc_name.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def to_resource(session, meta, **options): 8 | return "regiResc2a" 9 | 10 | @staticmethod 11 | def operation(session, meta, **options): 12 | return Operation.PUT 13 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/register.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def operation(session, meta, **options): 8 | return Operation.REGISTER_SYNC 9 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/register_non_leaf_non_root_with_resc_name.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def to_resource(session, meta, **options): 8 | return "regiResc2" 9 | 10 | @staticmethod 11 | def operation(session, meta, **options): 12 | return Operation.REGISTER_SYNC 13 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/register_root_with_resc_name.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def to_resource(session, meta, **options): 8 | return "regiResc2Root" 9 | 10 | @staticmethod 11 | def operation(session, meta, **options): 12 | return Operation.REGISTER_SYNC 13 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/register_using_char_map.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | import re 4 | 5 | 6 | class event_handler(Core): 7 | re_non_alphanum = re.compile("[^a-zA-Z0-9]") 8 | 
9 | @staticmethod 10 | def character_map(): 11 | return { 12 | event_handler.re_non_alphanum: "_" 13 | } # map any non-ascii or non-alphanumeric 14 | # character to '_' 15 | 16 | @staticmethod 17 | def operation(session, meta, **options): 18 | return Operation.REGISTER_SYNC 19 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/register_with_peps.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def operation(session, meta, **options): 8 | return Operation.REGISTER_SYNC 9 | 10 | @staticmethod 11 | def pre_data_obj_create(hdlr_mod, logger, session, meta, *args, **options): 12 | logical_path = meta["target"] 13 | logger.info("pre_data_obj_create:[" + logical_path + "]") 14 | 15 | @staticmethod 16 | def post_data_obj_create(hdlr_mod, logger, session, meta, *args, **options): 17 | logical_path = meta["target"] 18 | logger.info("post_data_obj_create:[" + logical_path + "]") 19 | 20 | @staticmethod 21 | def pre_coll_create(hdlr_mod, logger, session, meta, *args, **options): 22 | logical_path = meta["target"] 23 | logger.info("pre_coll_create:[" + logical_path + "]") 24 | 25 | @staticmethod 26 | def post_coll_create(hdlr_mod, logger, session, meta, *args, **options): 27 | logical_path = meta["target"] 28 | logger.info("post_coll_create:[" + logical_path + "]") 29 | 30 | @staticmethod 31 | def pre_data_obj_modify(hdlr_mod, logger, session, meta, *args, **options): 32 | logical_path = meta["target"] 33 | logger.info("pre_data_obj_modify:[" + logical_path + "]") 34 | 35 | @staticmethod 36 | def post_data_obj_modify(hdlr_mod, logger, session, meta, *args, **options): 37 | logical_path = meta["target"] 38 | logger.info("post_data_obj_modify:[" + logical_path + "]") 39 | 40 | @staticmethod 41 | def pre_coll_modify(hdlr_mod, logger, session, meta, *args, **options): 42 | logical_path = meta["target"] 43 | logger.info("pre_coll_modify:[" + logical_path + "]") 44 | 45 | @staticmethod 46 | def post_coll_modify(hdlr_mod, logger, session, meta, *args, **options): 47 | logical_path = meta["target"] 48 | logger.info("post_coll_modify:[" + logical_path + "]") 49 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/register_with_resc_name.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def to_resource(session, meta, **options): 8 | return "regiResc2a" 9 | 10 | @staticmethod 11 | def operation(session, meta, **options): 12 | return Operation.REGISTER_SYNC 13 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/replica_root_with_resc_name.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def operation(session, meta, **options): 8 | return Operation.REGISTER_AS_REPLICA_SYNC 9 | 10 | @staticmethod 11 | def to_resource(session, meta, 
**options): 12 | return "regiResc2Root" 13 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/replica_with_non_leaf_non_root_resc_name.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def operation(session, meta, **options): 8 | return Operation.REGISTER_AS_REPLICA_SYNC 9 | 10 | @staticmethod 11 | def to_resource(session, meta, **options): 12 | return "regiResc2" 13 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/replica_with_resc_name.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def operation(session, meta, **options): 8 | return Operation.REGISTER_AS_REPLICA_SYNC 9 | 10 | @staticmethod 11 | def to_resource(session, meta, **options): 12 | return "regiResc2a" 13 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/retry.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | from irods_capability_automated_ingest.redis_utils import get_redis 4 | 5 | 6 | class event_handler(Core): 7 | @staticmethod 8 | def max_retries(hdlr_mod, logger, meta): 9 | return 3 10 | 11 | @staticmethod 12 | def delay(hdlr_mod, logger, meta, retries): 13 | return 0 14 | 15 | @staticmethod 16 | def operation(session, meta, **options): 17 | return Operation.NO_OP 18 | 19 | @staticmethod 20 | def pre_data_obj_create(hdlr_mod, logger, session, meta, *args, **options): 21 | path = meta["path"] 22 | target = meta["target"] 23 | 24 | r = get_redis(meta["config"]) 25 | failures = r.get("failures:" + path) 26 | if failures is None: 27 | failures = 0 28 | 29 | r.incr("failures:" + path) 30 | 31 | if failures == 0: 32 | raise RuntimeError("no failures") 33 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/statistics.py: -------------------------------------------------------------------------------- 1 | from .. 
import redis_key 2 | from ..redis_utils import get_redis 3 | from irods_capability_automated_ingest.core import Core 4 | from irods_capability_automated_ingest.utils import Operation 5 | 6 | import time 7 | 8 | 9 | class event_handler(Core): 10 | @staticmethod 11 | def operation(session, meta, **options): 12 | return Operation.NO_OP 13 | 14 | @staticmethod 15 | def pre_job(hdlr_mod, logger, meta): 16 | job_name = meta["job_name"] 17 | config = meta["config"] 18 | r = get_redis(config) 19 | 20 | t0 = time.time() 21 | t0_key_handle = redis_key.redis_key_handle(r, "t0", job_name) 22 | t0_key_handle.set_value(t0) 23 | 24 | @staticmethod 25 | def post_job(hdlr_mod, logger, meta): 26 | job_name = meta["job_name"] 27 | config = meta["config"] 28 | t1 = time.time() 29 | r = get_redis(config) 30 | t0_key_handle = redis_key.redis_key_handle(r, "t0", job_name) 31 | t0 = t0_key_handle.get_value() 32 | t0_key_handle.reset() 33 | failures = redis_key.failures_key_handle(r, job_name) 34 | retries = redis_key.retries_key_handle(r, job_name) 35 | logger.info( 36 | "post_job", 37 | job_name=job_name, 38 | failures=failures.get_value(), 39 | retries=retries.get_value(), 40 | time_elasped=t1 - t0, 41 | ) 42 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/sync.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def operation(session, meta, **options): 8 | return Operation.PUT_SYNC 9 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/sync_non_leaf_non_root_with_resc_name.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def to_resource(session, meta, **options): 8 | return "regiResc2" 9 | 10 | @staticmethod 11 | def operation(session, meta, **options): 12 | return Operation.PUT_SYNC 13 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/sync_retry.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def max_retries(hdlr_mod, logger, meta): 8 | return 3 9 | 10 | @staticmethod 11 | def operation(session, meta, **options): 12 | return Operation.PUT_SYNC 13 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/sync_root_with_resc_name.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def to_resource(session, meta, **options): 8 | return "regiResc2Root" 9 | 10 | @staticmethod 11 | def operation(session, meta, **options): 12 | return Operation.PUT_SYNC 13 | -------------------------------------------------------------------------------- 
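The example handlers in this directory each exercise a single hook. As an illustrative sketch only (not one of the repository's example files), a handler could also opt in to the delete behaviour by pairing `operation` with the `delete_mode(meta)` method that `custom_event_handler.delete_mode()` invokes earlier in this package; the choice of `DeleteMode.UNREGISTER` below is an assumption for demonstration, not a recommended default:

    from irods_capability_automated_ingest.core import Core
    from irods_capability_automated_ingest.utils import DeleteMode, Operation


    class event_handler(Core):
        @staticmethod
        def operation(session, meta, **options):
            return Operation.REGISTER_SYNC

        @staticmethod
        def delete_mode(meta):
            # Illustrative choice: unregister catalog entries whose source no longer exists.
            # DO_NOT_DELETE (the default), TRASH, and NO_TRASH are the other values
            # dispatched by get_delete_function() in irods/irods_utils.py.
            return DeleteMode.UNREGISTER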
/irods_capability_automated_ingest/examples/sync_with_resc_name.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def to_resource(session, meta, **options): 8 | return "regiResc2a" 9 | 10 | @staticmethod 11 | def operation(session, meta, **options): 12 | return Operation.PUT_SYNC 13 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/timeout.py: -------------------------------------------------------------------------------- 1 | import time 2 | from irods_capability_automated_ingest.core import Core 3 | from irods_capability_automated_ingest.utils import Operation 4 | 5 | 6 | class event_handler(Core): 7 | @staticmethod 8 | def operation(session, meta, **options): 9 | return Operation.NO_OP 10 | 11 | @staticmethod 12 | def timeout(hdlr_mod, logger, meta): 13 | return 1 14 | 15 | @staticmethod 16 | def pre_data_obj_create(hdlr_mod, logger, session, meta, *args, **options): 17 | time.sleep(2) 18 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/irods/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irods/irods_capability_automated_ingest/da87d89acab7136ac5610195b0cc5e5abd983795/irods_capability_automated_ingest/irods/__init__.py -------------------------------------------------------------------------------- /irods_capability_automated_ingest/irods/filesystem.py: -------------------------------------------------------------------------------- 1 | from . import irods_utils 2 | from .. 
import custom_event_handler 3 | from ..utils import Operation 4 | 5 | from irods.models import Resource, DataObject, Collection 6 | import irods.keywords as kw 7 | 8 | import base64 9 | import os 10 | 11 | 12 | def append_to_data_object( 13 | session, logger, source_physical_path, destination_logical_path, **options 14 | ): 15 | BUFFER_SIZE = 1024 16 | logger.info( 17 | f"appending object {source_physical_path} from local filesystem, options = {options}" 18 | ) 19 | tsize = irods_utils.size(session, destination_logical_path) 20 | destination_fd = session.data_objects.open(destination_logical_path, "a", **options) 21 | destination_fd.seek(tsize) 22 | with open(source_physical_path, "rb") as source_fd: 23 | source_fd.seek(tsize) 24 | while True: 25 | buf = source_fd.read(BUFFER_SIZE) 26 | if buf == b"": 27 | break 28 | destination_fd.write(buf) 29 | destination_fd.close() 30 | logger.info("succeeded", task="irods_FSappend_file", path=source_physical_path) 31 | 32 | 33 | def register_file(hdlr_mod, logger, session, meta, **options): 34 | dest_dataobj_logical_fullpath = meta["target"] 35 | source_physical_fullpath = meta["path"] 36 | b64_path_str = meta.get("b64_path_str") 37 | 38 | event_handler = custom_event_handler.custom_event_handler(meta) 39 | if event_handler is None: 40 | phypath_to_register_in_catalog = None 41 | else: 42 | phypath_to_register_in_catalog = event_handler.target_path(session, **options) 43 | if phypath_to_register_in_catalog is None: 44 | if b64_path_str is not None and "unicode_error_filename" in meta: 45 | phypath_to_register_in_catalog = os.path.join( 46 | source_physical_fullpath, meta["unicode_error_filename"] 47 | ) 48 | else: 49 | phypath_to_register_in_catalog = source_physical_fullpath 50 | 51 | resc_name = event_handler.to_resource(session, **options) 52 | if resc_name is not None: 53 | options["destRescName"] = resc_name 54 | 55 | if b64_path_str is not None: 56 | source_physical_fullpath = base64.b64decode(b64_path_str) 57 | 58 | options[kw.DATA_SIZE_KW] = str(meta["size"]) 59 | options[kw.DATA_MODIFY_KW] = str(int(meta["mtime"])) 60 | 61 | logger.info( 62 | "registering object {}, source_physical_fullpath: {}, options = {}".format( 63 | dest_dataobj_logical_fullpath, source_physical_fullpath, options 64 | ) 65 | ) 66 | session.data_objects.register( 67 | phypath_to_register_in_catalog, dest_dataobj_logical_fullpath, **options 68 | ) 69 | 70 | logger.info("succeeded", task="irods_register_file", path=source_physical_fullpath) 71 | 72 | irods_utils.annotate_metadata_for_special_data_objs( 73 | meta, session, source_physical_fullpath, dest_dataobj_logical_fullpath 74 | ) 75 | 76 | 77 | def upload_file(hdlr_mod, logger, session, meta, op, **options): 78 | """ 79 | Function called from sync_irods.sync_file and sync_irods.upload_file for local files 80 | 81 | If called from sync_irods.sync_file, determines if it should be an append operation, or a put. 
82 | If called from sync_irods.upload_file, simply puts the file into iRODS 83 | 84 | op: the type of operation (one of Operation.PUT, Operation.PUT_APPEND, Operation.PUT_SYNC) 85 | """ 86 | # TODO: Check for op here 87 | 88 | dest_dataobj_logical_fullpath = meta["target"] 89 | source_physical_fullpath = meta["path"] 90 | b64_path_str = meta.get("b64_path_str") 91 | event_handler = custom_event_handler.custom_event_handler(meta) 92 | resc_name = event_handler.to_resource(session, **options) 93 | if resc_name is not None: 94 | options["destRescName"] = resc_name 95 | 96 | if b64_path_str is not None: 97 | source_physical_fullpath = base64.b64decode(b64_path_str) 98 | 99 | dest_dataobj_logical_fullpath = meta["target"] 100 | source_physical_fullpath = meta["path"] 101 | b64_path_str = meta.get("b64_path_str") 102 | if b64_path_str is not None: 103 | source_physical_fullpath = base64.b64decode(b64_path_str) 104 | 105 | logger.info( 106 | f"uploading object {source_physical_fullpath} from local filesystem, options = {options}" 107 | ) 108 | session.data_objects.put( 109 | source_physical_fullpath, dest_dataobj_logical_fullpath, **options 110 | ) 111 | logger.info("succeeded", task="irods_FSupload_file", path=source_physical_fullpath) 112 | 113 | irods_utils.annotate_metadata_for_special_data_objs( 114 | meta, session, source_physical_fullpath, dest_dataobj_logical_fullpath 115 | ) 116 | 117 | 118 | def no_op(hdlr_mod, logger, session, meta, **options): 119 | pass 120 | 121 | 122 | def sync_file(hdlr_mod, logger, session, meta, op, **options): 123 | dest_dataobj_logical_fullpath = meta["target"] 124 | source_physical_fullpath = meta["path"] 125 | b64_path_str = meta.get("b64_path_str") 126 | 127 | event_handler = custom_event_handler.custom_event_handler(meta) 128 | resc_name = event_handler.to_resource(session, **options) 129 | if resc_name is not None: 130 | options["destRescName"] = resc_name 131 | 132 | if b64_path_str is not None: 133 | source_physical_fullpath = base64.b64decode(b64_path_str) 134 | 135 | logger.info( 136 | "syncing object %s, options = %s" % (dest_dataobj_logical_fullpath, options) 137 | ) 138 | 139 | # TODO: Issue #208 - This is incorrect -- where is the register function ?? 140 | # Investigate behavior of sync_file when op is None 141 | if op is None: 142 | op = Operation.REGISTER_SYNC 143 | 144 | # PUT_APPEND with existing file. sync_file assumes the file already exists. 
145 | if op == Operation.PUT_APPEND: 146 | append_to_data_object( 147 | session, 148 | logger, 149 | source_physical_fullpath, 150 | dest_dataobj_logical_fullpath, 151 | **options, 152 | ) 153 | 154 | # If data object doesn't exist, just do a standard put 155 | # If data object does exist, we know op=PUT_SYNC, and we re-copy whole file again, so it is fine also 156 | else: 157 | logger.info( 158 | f"uploading object {source_physical_fullpath} from local filesystem, options = {options}" 159 | ) 160 | session.data_objects.put( 161 | source_physical_fullpath, dest_dataobj_logical_fullpath, **options 162 | ) 163 | logger.info( 164 | "succeeded", task="irods_FSupload_file", path=source_physical_fullpath 165 | ) 166 | 167 | 168 | def update_metadata(hdlr_mod, logger, session, meta, **options): 169 | dest_dataobj_logical_fullpath = meta["target"] 170 | source_physical_fullpath = meta["path"] 171 | event_handler = custom_event_handler.custom_event_handler(meta) 172 | phypath_to_register_in_catalog = event_handler.target_path(session, **options) 173 | b64_path_str = meta.get("b64_path_str") 174 | if phypath_to_register_in_catalog is None: 175 | if b64_path_str is not None and "unicode_error_filename" in meta: 176 | # Append generated filename to truncated fullpath because it failed to encode 177 | # TODO(#250): This will not work on Windows. 178 | phypath_to_register_in_catalog = os.path.join( 179 | source_physical_fullpath, meta["unicode_error_filename"] 180 | ) 181 | else: 182 | phypath_to_register_in_catalog = source_physical_fullpath 183 | 184 | if b64_path_str is not None: 185 | source_physical_fullpath = base64.b64decode(b64_path_str) 186 | 187 | size = int(meta["size"]) 188 | mtime = int(meta["mtime"]) 189 | logger.info( 190 | f"updating object: {dest_dataobj_logical_fullpath}, options = {options}" 191 | ) 192 | 193 | data_obj_info = {"objPath": dest_dataobj_logical_fullpath} 194 | 195 | outdated_repl_nums = [] 196 | found = False 197 | 198 | resc_name = event_handler.to_resource(session, **options) 199 | if resc_name is None: 200 | found = True 201 | else: 202 | for row in session.query( 203 | Resource.name, DataObject.path, DataObject.replica_number 204 | ).filter( 205 | # TODO(#250): This will not work on Windows. 
206 | DataObject.name == os.path.basename(dest_dataobj_logical_fullpath), 207 | Collection.name == os.path.dirname(dest_dataobj_logical_fullpath), 208 | ): 209 | if row[DataObject.path] == phypath_to_register_in_catalog: 210 | if irods_utils.child_of(session, row[Resource.name], resc_name): 211 | found = True 212 | repl_num = row[DataObject.replica_number] 213 | data_obj_info["replNum"] = repl_num 214 | continue 215 | 216 | if not found: 217 | if b64_path_str is not None: 218 | logger.error( 219 | "updating object: wrong resource or path, " 220 | "dest_dataobj_logical_fullpath = {}, phypath_to_register_in_catalog = {}, options = {}".format( 221 | dest_dataobj_logical_fullpath, 222 | phypath_to_register_in_catalog, 223 | str(options), 224 | ) 225 | ) 226 | else: 227 | logger.error( 228 | "updating object: wrong resource or path, " 229 | "dest_dataobj_logical_fullpath = {}, source_physical_fullpath = {}, " 230 | "phypath_to_register_in_catalog = {}, options = {}".format( 231 | dest_dataobj_logical_fullpath, 232 | source_physical_fullpath, 233 | phypath_to_register_in_catalog, 234 | str(options), 235 | ) 236 | ) 237 | raise Exception("wrong resource or path") 238 | 239 | session.data_objects.modDataObjMeta( 240 | data_obj_info, 241 | {"dataSize": size, "dataModify": mtime, "allReplStatus": 1}, 242 | **options, 243 | ) 244 | 245 | if b64_path_str is not None: 246 | logger.info( 247 | "succeeded", 248 | task="irods_update_metadata", 249 | path=phypath_to_register_in_catalog, 250 | ) 251 | else: 252 | logger.info( 253 | "succeeded", task="irods_update_metadata", path=source_physical_fullpath 254 | ) 255 | 256 | 257 | def sync_file_meta(hdlr_mod, logger, session, meta, **options): 258 | pass 259 | 260 | 261 | def sync_data_from_file(hdlr_mod, meta, logger, content, **options): 262 | target = meta["target"] 263 | path = meta["path"] 264 | 265 | event_handler = custom_event_handler.custom_event_handler(meta) 266 | session = irods_utils.irods_session( 267 | event_handler.get_module(), meta, logger, **options 268 | ) 269 | 270 | if meta.get("initial_ingest"): 271 | # If the initial_ingest option has been specified, no checking is done for the existence 272 | # of the logical path for performance reasons. If the option is specified and the logical 273 | # path exists, an error will occur; this behavior is expected. 274 | exists = False 275 | else: 276 | exists = session.data_objects.exists(target) 277 | if not exists and session.collections.exists(target): 278 | raise Exception(f"sync: cannot sync file {path} to collection {target}") 279 | 280 | op = event_handler.operation(session, **options) 281 | 282 | if op == Operation.NO_OP: 283 | if not exists: 284 | event_handler.call( 285 | "on_data_obj_create", logger, no_op, logger, session, meta, **options 286 | ) 287 | else: 288 | event_handler.call( 289 | "on_data_obj_modify", logger, no_op, logger, session, meta, **options 290 | ) 291 | else: 292 | # allow_redirect will cause PRC to establish a direct connection between the client and the server hosting the 293 | # resource to which the data is being uploaded. This can cause problems if the hostnames being used in the 294 | # client environment and the hostname used for the "location" of the resource differ despite referring to the 295 | # same host. As such, we set the allow_redirect option to False in order to prevent this redirect. 
296 | options["allow_redirect"] = False 297 | 298 | createRepl = False 299 | if op is None: 300 | op = Operation.REGISTER_SYNC 301 | elif exists and op == Operation.REGISTER_AS_REPLICA_SYNC: 302 | resc_name = event_handler.to_resource(session, **options) 303 | if resc_name is None: 304 | raise Exception("no resource name defined") 305 | 306 | found = False 307 | foundPath = False 308 | for replica in session.data_objects.get(target).replicas: 309 | if irods_utils.child_of(session, replica.resource_name, resc_name): 310 | found = True 311 | if replica.path == path: 312 | foundPath = True 313 | if not found: 314 | createRepl = True 315 | elif not foundPath: 316 | raise Exception( 317 | f"Data object [{target}] has at least one replica under resource [{resc_name}], but none of the replicas have physical paths which match [{path}]." 318 | ) 319 | 320 | put = op in [Operation.PUT, Operation.PUT_SYNC, Operation.PUT_APPEND] 321 | 322 | if not exists: 323 | meta2 = meta.copy() 324 | # TODO(#250): This will not work on Windows. 325 | meta2["target"] = os.path.dirname(target) 326 | if "b64_path_str" not in meta2: 327 | # TODO: This will not work on Windows. 328 | meta2["path"] = os.path.dirname(path) 329 | irods_utils.create_dirs(logger, session, meta2, **options) 330 | if put: 331 | event_handler.call( 332 | "on_data_obj_create", 333 | logger, 334 | upload_file, 335 | logger, 336 | session, 337 | meta, 338 | op, 339 | **options, 340 | ) 341 | else: 342 | event_handler.call( 343 | "on_data_obj_create", 344 | logger, 345 | register_file, 346 | logger, 347 | session, 348 | meta, 349 | **options, 350 | ) 351 | elif createRepl: 352 | options["regRepl"] = "" 353 | 354 | event_handler.call( 355 | "on_data_obj_create", 356 | logger, 357 | register_file, 358 | logger, 359 | session, 360 | meta, 361 | **options, 362 | ) 363 | elif content: 364 | if put: 365 | if Operation.PUT == op: 366 | logger.debug( 367 | f"PUT operation will ignore existing data object [{meta['target']}]" 368 | ) 369 | else: 370 | # PUT_SYNC and PUT_APPEND sync data on existing data objects. 371 | event_handler.call( 372 | "on_data_obj_modify", 373 | logger, 374 | sync_file, 375 | logger, 376 | session, 377 | meta, 378 | op, 379 | **options, 380 | ) 381 | else: 382 | event_handler.call( 383 | "on_data_obj_modify", 384 | logger, 385 | update_metadata, 386 | logger, 387 | session, 388 | meta, 389 | **options, 390 | ) 391 | else: 392 | event_handler.call( 393 | "on_data_obj_modify", 394 | logger, 395 | sync_file_meta, 396 | logger, 397 | session, 398 | meta, 399 | **options, 400 | ) 401 | 402 | irods_utils.start_timer() 403 | 404 | 405 | def sync_metadata_from_file(hdlr_mod, meta, logger, **options): 406 | sync_data_from_file(hdlr_mod, meta, logger, False, **options) 407 | 408 | 409 | def sync_dir_meta(hdlr_mod, logger, session, meta, **options): 410 | pass 411 | 412 | 413 | def sync_data_from_dir(hdlr_mod, meta, logger, content, **options): 414 | target = meta["target"] 415 | path = meta["path"] 416 | 417 | event_handler = custom_event_handler.custom_event_handler(meta) 418 | session = irods_utils.irods_session( 419 | event_handler.get_module(), meta, logger, **options 420 | ) 421 | exists = session.collections.exists(target) 422 | 423 | # TODO(#208): Should we default to REGISTER_SYNC? 
424 | op = event_handler.operation(session, **options) or Operation.REGISTER_SYNC 425 | if op == Operation.NO_OP: 426 | operation_name = "on_coll_modify" if exists else "on_coll_create" 427 | event_handler.call( 428 | operation_name, logger, no_op, logger, session, meta, **options 429 | ) 430 | else: 431 | if not exists: 432 | irods_utils.create_dirs(logger, session, meta, **options) 433 | else: 434 | event_handler.call( 435 | "on_coll_modify", 436 | logger, 437 | sync_dir_meta, 438 | logger, 439 | session, 440 | meta, 441 | **options, 442 | ) 443 | irods_utils.start_timer() 444 | 445 | 446 | def sync_metadata_from_dir(hdlr_mod, meta, logger, **options): 447 | sync_data_from_dir(hdlr_mod, meta, logger, False, **options) 448 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/irods/irods_utils.py: -------------------------------------------------------------------------------- 1 | from .. import custom_event_handler, sync_logging 2 | from ..redis_utils import get_redis 3 | from ..utils import DeleteMode, Operation 4 | 5 | from irods.exception import CollectionDoesNotExist, NetworkException 6 | from irods.models import Collection, DataObject, Resource 7 | from irods.session import iRODSSession 8 | 9 | import base64 10 | import json 11 | import os 12 | import redis_lock 13 | import ssl 14 | import threading 15 | 16 | irods_session_map = {} 17 | irods_session_timer_map = {} 18 | 19 | 20 | class disconnect_timer(object): 21 | def __init__(self, logger, interval, sess_map): 22 | self.logger = logger 23 | self.interval = interval 24 | self.timer = None 25 | self.sess_map = sess_map 26 | 27 | def callback(self): 28 | for k, v in self.sess_map.items(): 29 | self.logger.info("Cleaning up session [" + k + "]") 30 | v.cleanup() 31 | self.sess_map.clear() 32 | 33 | def cancel(self): 34 | if self.timer is not None: 35 | self.timer.cancel() 36 | 37 | def start(self): 38 | self.timer = threading.Timer(self.interval, self.callback) 39 | self.timer.start() 40 | 41 | 42 | def stop_timer(): 43 | for k, v in irods_session_timer_map.items(): 44 | v.cancel() 45 | 46 | 47 | def start_timer(): 48 | for k, v in irods_session_timer_map.items(): 49 | v.start() 50 | 51 | 52 | def irods_session(handler_module, meta, logger, **options): 53 | env_irods_host = os.environ.get("IRODS_HOST") 54 | env_irods_port = os.environ.get("IRODS_PORT") 55 | env_irods_user_name = os.environ.get("IRODS_USER_NAME") 56 | env_irods_zone_name = os.environ.get("IRODS_ZONE_NAME") 57 | env_irods_password = os.environ.get("IRODS_PASSWORD") 58 | 59 | env_file = os.environ.get("IRODS_ENVIRONMENT_FILE") 60 | 61 | kwargs = {} 62 | if all( 63 | [ 64 | env_irods_host, 65 | env_irods_port, 66 | env_irods_user_name, 67 | env_irods_zone_name, 68 | env_irods_password, 69 | ] 70 | ): 71 | kwargs["host"] = env_irods_host 72 | kwargs["port"] = env_irods_port 73 | kwargs["user"] = env_irods_user_name 74 | kwargs["zone"] = env_irods_zone_name 75 | kwargs["password"] = env_irods_password 76 | else: 77 | if not env_file: 78 | # TODO(#250): This will not work on Windows. 
79 | env_file = os.path.expanduser("~/.irods/irods_environment.json") 80 | 81 | kwargs["irods_env_file"] = env_file 82 | 83 | if hasattr(handler_module, "as_user"): 84 | client_zone, client_user = handler_module.as_user(meta, **options) 85 | kwargs["client_user"] = client_user 86 | kwargs["client_zone"] = client_zone 87 | 88 | key = json.dumps(kwargs) # todo add timestamp of env file to key 89 | 90 | if env_file: 91 | if not os.path.exists(env_file): 92 | raise FileNotFoundError( 93 | f"Specified iRODS client environment file [{env_file}] does not exist." 94 | ) 95 | 96 | with open(env_file) as irods_env: 97 | irods_env_as_json = json.load(irods_env) 98 | verify_server = irods_env_as_json.get("irods_ssl_verify_server") 99 | ca_file = irods_env_as_json.get("irods_ssl_ca_certificate_file") 100 | if verify_server and verify_server != "none" and ca_file: 101 | kwargs["ssl_context"] = ssl.create_default_context( 102 | purpose=ssl.Purpose.SERVER_AUTH, 103 | cafile=ca_file, 104 | capath=None, 105 | cadata=None, 106 | ) 107 | 108 | if key in irods_session_map: 109 | sess = irods_session_map.get(key) 110 | else: 111 | # TODO: #42 - pull out 10 into configuration 112 | for i in range(10): 113 | try: 114 | sess = iRODSSession(**kwargs) 115 | irods_session_map[key] = sess 116 | break 117 | except NetworkException: 118 | time.sleep(0.1) 119 | 120 | # =-=-=-=-=-=-=- 121 | # disconnect timer 122 | if key in irods_session_timer_map: 123 | timer = irods_session_timer_map[key] 124 | timer.cancel() 125 | irods_session_timer_map.pop(key, None) 126 | idle_sec = meta["idle_disconnect_seconds"] 127 | logger.info("iRODS Idle Time set to: " + str(idle_sec)) 128 | 129 | timer = disconnect_timer(logger, idle_sec, irods_session_map) 130 | irods_session_timer_map[key] = timer 131 | # =-=-=-=-=-=-=- 132 | 133 | return sess 134 | 135 | 136 | def validate_target_collection(meta, logger): 137 | # root cannot be the target collection 138 | destination_collection_logical_path = meta["target"] 139 | if destination_collection_logical_path == "/": 140 | raise Exception("Root may only contain collections which represent zones") 141 | 142 | 143 | def child_of(session, child_resc_name, resc_name): 144 | if child_resc_name == resc_name: 145 | return True 146 | else: 147 | while True: 148 | child_resc = session.resources.get(child_resc_name) 149 | parent_resc_id = child_resc.parent 150 | if parent_resc_id is None: 151 | break 152 | 153 | parent_resc_name = None 154 | for row in session.query(Resource.name).filter( 155 | Resource.id == parent_resc_id 156 | ): 157 | parent_resc_name = row[Resource.name] 158 | if parent_resc_name == resc_name: 159 | return True 160 | child_resc_name = parent_resc_name 161 | return False 162 | 163 | 164 | def create_dirs(logger, session, meta, **options): 165 | target = meta["target"] 166 | path = meta["path"] 167 | config = meta["config"] 168 | event_handler = custom_event_handler.custom_event_handler(meta) 169 | if target.startswith("/"): 170 | r = get_redis(config) 171 | if not session.collections.exists(target): 172 | with redis_lock.Lock(r, "create_dirs:" + target): 173 | if not session.collections.exists(target): 174 | meta2 = meta.copy() 175 | # TODO(#250): This will not work on Windows. 176 | meta2["target"] = os.path.dirname(target) 177 | meta2["path"] = os.path.dirname(path) 178 | # TODO: Does this need to happen after the create call? 
179 | create_dirs(logger, session, meta2, **options) 180 | 181 | event_handler.call( 182 | "on_coll_create", 183 | logger, 184 | create_dir, 185 | logger, 186 | session, 187 | meta, 188 | **options, 189 | ) 190 | else: 191 | raise Exception( 192 | "create_dirs: relative path; target:[" + target + "]; path:[" + path + "]" 193 | ) 194 | 195 | 196 | def create_dir(hdlr_mod, logger, session, meta, **options): 197 | target = meta["target"] 198 | path = meta["path"] 199 | logger.info("creating collection " + target) 200 | session.collections.create(target) 201 | 202 | 203 | def annotate_metadata_for_special_data_objs( 204 | meta, session, source_physical_fullpath, dest_dataobj_logical_fullpath 205 | ): 206 | def add_metadata_if_not_present(obj, key, val, unit=None): 207 | # TODO: If updating/syncing link items, we might want to update the readlink result... 208 | if key not in obj.metadata.keys(): 209 | obj.metadata.add(key, val, unit) 210 | 211 | b64_path_str = meta.get("b64_path_str") or meta.get("b64_path_str_charmap") 212 | if b64_path_str is not None: 213 | b64_reason = meta.get("b64_reason") 214 | if b64_reason in ("UnicodeEncodeError", "character_map"): 215 | add_metadata_if_not_present( 216 | session.data_objects.get(dest_dataobj_logical_fullpath), 217 | "irods::automated_ingest::{}".format(b64_reason), 218 | b64_path_str, 219 | "python3.base64.b64encode(full_path_of_source_file)", 220 | ) 221 | 222 | if meta["is_socket"]: 223 | add_metadata_if_not_present( 224 | session.data_objects.get(dest_dataobj_logical_fullpath), 225 | "socket_target", 226 | "socket", 227 | "automated_ingest", 228 | ) 229 | elif meta["is_link"]: 230 | add_metadata_if_not_present( 231 | session.data_objects.get(dest_dataobj_logical_fullpath), 232 | "link_target", 233 | os.path.join( 234 | os.path.dirname(source_physical_fullpath), 235 | os.readlink(source_physical_fullpath), 236 | ), 237 | "automated_ingest", 238 | ) 239 | 240 | 241 | def size(session, path, replica_num=None, resc_name=None): 242 | args = [ 243 | Collection.name == os.path.dirname(path), 244 | DataObject.name == os.path.basename(path), 245 | ] 246 | 247 | if replica_num is not None: 248 | args.append(DataObject.replica_number == replica_num) 249 | 250 | if resc_name is not None: 251 | args.append(DataObject.resource_name == resc_name) 252 | 253 | for row in session.query(DataObject.size).filter(*args): 254 | return int(row[DataObject.size]) 255 | 256 | 257 | def list_collection(meta, logger, logical_path): 258 | event_handler = custom_event_handler.custom_event_handler(meta) 259 | session = irods_session(event_handler.get_module(), meta, logger, **dict()) 260 | 261 | collection = session.collections.get(logical_path) 262 | 263 | return collection.subcollections, collection.data_objects 264 | 265 | 266 | def unregister_data_object(hdlr_mod, session, meta, **options): 267 | config = meta["config"] 268 | logging_config = config["log"] 269 | logger = sync_logging.get_sync_logger(logging_config) 270 | logger.debug(f"calling unregister for [{meta['target']}]") 271 | session.data_objects.unregister(meta["target"], **options) 272 | 273 | 274 | def trash_data_object(hdlr_mod, session, meta, **options): 275 | config = meta["config"] 276 | logging_config = config["log"] 277 | logger = sync_logging.get_sync_logger(logging_config) 278 | logger.debug(f"calling unlink (trash) for [{meta['target']}]") 279 | session.data_objects.unlink(meta["target"], **options) 280 | 281 | 282 | def unlink_data_object(hdlr_mod, session, meta, **options): 283 | config = 
meta["config"] 284 | logging_config = config["log"] 285 | logger = sync_logging.get_sync_logger(logging_config) 286 | logger.debug(f"calling unlink (no trash / force=True) for [{meta['target']}]") 287 | session.data_objects.unlink(meta["target"], force=True, **options) 288 | 289 | 290 | def get_delete_function(delete_mode): 291 | delete_mode_to_function = { 292 | DeleteMode.DO_NOT_DELETE: None, 293 | DeleteMode.UNREGISTER: unregister_data_object, 294 | DeleteMode.TRASH: trash_data_object, 295 | DeleteMode.NO_TRASH: unlink_data_object, 296 | } 297 | return delete_mode_to_function.get(delete_mode, None) 298 | 299 | 300 | def delete_data_object(hdlr_mod, meta, **options): 301 | logical_path = meta["target"] 302 | 303 | event_handler = custom_event_handler.custom_event_handler(meta) 304 | 305 | delete_mode = event_handler.delete_mode() 306 | if DeleteMode.DO_NOT_DELETE == delete_mode: 307 | # The event handler says "do not delete", so do not delete. 308 | return 309 | 310 | config = meta["config"] 311 | logging_config = config["log"] 312 | logger = sync_logging.get_sync_logger(logging_config) 313 | 314 | session = irods_session(event_handler.get_module(), meta, logger, **options) 315 | 316 | if not session.data_objects.exists(logical_path): 317 | # There is nothing to do if the data object does not exist. 318 | return 319 | 320 | delete_function = get_delete_function(delete_mode) 321 | if delete_function is None: 322 | raise RuntimeError(f"delete_mode [{delete_mode}] is not supported") 323 | 324 | event_handler.call( 325 | "on_data_obj_delete", logger, delete_function, session, meta, **options 326 | ) 327 | 328 | 329 | def unregister_collection(hdlr_mod, session, meta, **options): 330 | config = meta["config"] 331 | logging_config = config["log"] 332 | logger = sync_logging.get_sync_logger(logging_config) 333 | logger.debug(f"calling unregister for [{meta['target']}]") 334 | # We should only be removing an empty collection, so explicitly do not remove recursively or do a "force" remove. 335 | options["recurse"] = False 336 | options["force"] = False 337 | session.collections.unregister(meta["target"], **options) 338 | 339 | 340 | def delete_collection(hdlr_mod, meta, **options): 341 | logical_path = meta["target"] 342 | 343 | event_handler = custom_event_handler.custom_event_handler(meta) 344 | 345 | delete_mode = event_handler.delete_mode() 346 | if DeleteMode.DO_NOT_DELETE == delete_mode: 347 | # The event handler says "do not delete", so do not delete. 348 | return 349 | 350 | config = meta["config"] 351 | logging_config = config["log"] 352 | logger = sync_logging.get_sync_logger(logging_config) 353 | 354 | session = irods_session(event_handler.get_module(), meta, logger, **options) 355 | 356 | r = get_redis(config) 357 | with redis_lock.Lock(r, "delete_collection:" + logical_path): 358 | # This will raise CollectionDoesNotExist if logical_path does not exist. 359 | collection = session.collections.get(logical_path) 360 | 361 | if 0 != len(collection.data_objects) or 0 != len(collection.subcollections): 362 | logger.debug( 363 | f"Collection [{logical_path}] is not empty and will not be removed." 364 | ) 365 | return 366 | 367 | event_handler.call( 368 | "on_coll_delete", logger, unregister_collection, session, meta, **options 369 | ) 370 | 371 | # Attempt to remove the parent collection if it is found to be empty. 
372 | root_target_collection = meta["root_target_collection"] 373 | parent_collection = "/".join(logical_path.split("/")[:-1]) 374 | if parent_collection == root_target_collection: 375 | logger.info(f"Cannot remove root target collection [{root_target_collection}]") 376 | return 377 | with redis_lock.Lock(r, "delete_collection:" + parent_collection): 378 | # This will raise CollectionDoesNotExist if logical_path does not exist. 379 | collection = session.collections.get(parent_collection) 380 | if 0 != len(collection.data_objects) or 0 != len(collection.subcollections): 381 | logger.debug( 382 | f"Collection [{parent_collection}] is not empty and will not be removed." 383 | ) 384 | return 385 | event_handler.call( 386 | "on_coll_delete", logger, unregister_collection, session, meta, **options 387 | ) 388 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/irods_sync.py: -------------------------------------------------------------------------------- 1 | from uuid import uuid1 2 | from . import sync_actions 3 | import argparse 4 | import json 5 | import sys 6 | 7 | 8 | def get_config(args): 9 | return { 10 | "log": { 11 | "filename": getattr(args, "log_filename", None), 12 | "when": getattr(args, "log_when", None), 13 | "interval": getattr(args, "log_interval", None), 14 | "level": getattr(args, "log_level", None), 15 | }, 16 | "profile": { 17 | "filename": getattr(args, "profile_filename", None), 18 | "when": getattr(args, "profile_when", None), 19 | "interval": getattr(args, "profile_interval", None), 20 | "level": getattr(args, "profile_level", None), 21 | }, 22 | "redis": { 23 | "host": args.redis_host, 24 | "port": args.redis_port, 25 | "db": args.redis_db, 26 | }, 27 | } 28 | 29 | 30 | def get_celery_broker_info(): 31 | from os import environ 32 | 33 | env_url = environ["CELERY_BROKER_URL"] 34 | if env_url is None: 35 | host = "localhost" 36 | port = 6379 37 | db = 0 38 | else: 39 | url = env_url.split("://")[1].split(":") 40 | host = url[0] 41 | port = url[1].split("/")[0] 42 | db = url[1].split("/")[1] 43 | 44 | return host, port, db 45 | 46 | 47 | class character_map_argument_error(Exception): 48 | pass 49 | 50 | 51 | # Make sure, if a character_map method is defined for the given event handler, that it 52 | # returns a dictionary (or argument for construction of dictionary) appropriate within the 53 | # conventions laid out in the README. Also, within reason, check any characters explicitly 54 | # named for remapping. To satisfy the principle of least surprise, they should at least 55 | # be restricted to being strings of length one. 
56 | 57 | 58 | def check_event_handler(filename): 59 | namespace = {} 60 | if filename is not None: 61 | exec(open(filename, "r").read(), namespace, namespace) 62 | ev_hdlr_class = namespace["event_handler"] 63 | char_map_method = getattr(ev_hdlr_class, "character_map", None) 64 | error_message = "" 65 | if char_map_method: 66 | returned = char_map_method() 67 | try: 68 | char_mapper = dict(returned) 69 | except TypeError: 70 | error_message = "character_map() method must return a dict or iterable of key value tuples" 71 | raise character_map_argument_error(error_message) 72 | for key, value in char_mapper.items(): 73 | if ( 74 | isinstance(key, str) 75 | and len(key) > 1 76 | or isinstance(key, tuple) 77 | and any(len(s) > 1 for s in key) 78 | or isinstance(value, str) 79 | and len(value) > 1 80 | ): 81 | error_message = "character_map()'s returned object should denote only single-character substitutions" 82 | raise character_map_argument_error(error_message) 83 | 84 | 85 | def add_arguments(parser): 86 | host, port, db = get_celery_broker_info() 87 | 88 | parser.add_argument( 89 | "--log_filename", 90 | action="store", 91 | type=str, 92 | default=None, 93 | help="Specify name of log file.", 94 | ) 95 | parser.add_argument( 96 | "--log_when", 97 | action="store", 98 | type=str, 99 | default=None, 100 | help="Specify the type of log_interval (see TimedRotatingFileHandler).", 101 | ) 102 | parser.add_argument( 103 | "--log_interval", 104 | action="store", 105 | type=int, 106 | default=None, 107 | help="Specify the interval with which to rollover the ingest log file.", 108 | ) 109 | parser.add_argument( 110 | "--log_level", 111 | action="store", 112 | type=str, 113 | default=None, 114 | help="Specify minimum level of message to log (DEBUG, INFO, WARNING, ERROR).", 115 | ) 116 | parser.add_argument( 117 | "--profile_filename", 118 | action="store", 119 | type=str, 120 | default=None, 121 | help="Specify name of profile filename.", 122 | ) 123 | parser.add_argument( 124 | "--profile_when", 125 | action="store", 126 | type=str, 127 | default=None, 128 | help="Specify the type of profile_interval (see TimedRotatingFileHandler).", 129 | ) 130 | parser.add_argument( 131 | "--profile_interval", 132 | action="store", 133 | type=int, 134 | default=None, 135 | help="Specify the interval with which to rollover the ingest profile log file.", 136 | ) 137 | parser.add_argument( 138 | "--profile_level", 139 | action="store", 140 | type=str, 141 | default=None, 142 | help="Specify minimum level of message to log for profiling (DEBUG, INFO, WARNING, ERROR).", 143 | ) 144 | parser.add_argument( 145 | "--redis_host", 146 | action="store", 147 | type=str, 148 | default=host, 149 | help="Domain or IP address of Redis host.", 150 | ) 151 | parser.add_argument( 152 | "--redis_port", 153 | action="store", 154 | type=int, 155 | default=port, 156 | help="Port number for Redis.", 157 | ) 158 | parser.add_argument( 159 | "--redis_db", 160 | action="store", 161 | type=int, 162 | default=db, 163 | help="Redis DB number to use for ingest.", 164 | ) 165 | 166 | 167 | def handle_start(args): 168 | ex_file_arg = args.exclude_file_type 169 | if ex_file_arg != None: 170 | ex_arg_list = [x.strip() for x in ex_file_arg[0].split(",")] 171 | 172 | check_event_handler(args.event_handler) 173 | 174 | data = {} 175 | data["restart_queue"] = args.restart_queue 176 | data["path_queue"] = args.path_queue 177 | data["file_queue"] = args.file_queue 178 | data["target"] = args.target 179 | data["src_path"] = args.src_path 180 | 
data["interval"] = args.interval 181 | data["job_name"] = args.job_name if args.job_name else str(uuid1()) 182 | data["ignore_cache"] = args.ignore_cache 183 | data["initial_ingest"] = args.initial_ingest 184 | data["event_handler"] = args.event_handler 185 | data["config"] = get_config(args) 186 | data["synchronous"] = args.synchronous 187 | data["progress"] = args.progress 188 | data["profile"] = args.profile 189 | data["files_per_task"] = args.files_per_task 190 | data["s3_endpoint_domain"] = args.s3_endpoint_domain 191 | data["s3_region_name"] = args.s3_region_name 192 | data["s3_keypair"] = args.s3_keypair 193 | data["s3_proxy_url"] = args.s3_proxy_url 194 | data["s3_secure_connection"] = not args.s3_insecure_connection 195 | data["s3_multipart_chunksize_in_mib"] = args.s3_multipart_chunksize_in_mib 196 | data["exclude_file_type"] = ex_arg_list 197 | data["exclude_file_name"] = ["".join(r) for r in args.exclude_file_name] 198 | data["exclude_directory_name"] = ["".join(r) for r in args.exclude_directory_name] 199 | data["idle_disconnect_seconds"] = args.irods_idle_disconnect_seconds 200 | 201 | return sync_actions.start_job(data) 202 | 203 | 204 | def handle_stop(args): 205 | sync_actions.stop_job(args.job_name, get_config(args)) 206 | return 0 207 | 208 | 209 | def handle_watch(args): 210 | return sync_actions.monitor_job(args.job_name, True, get_config(args)) 211 | 212 | 213 | def handle_list(args): 214 | jobs = sync_actions.list_jobs(get_config(args)) 215 | print(json.dumps(jobs)) 216 | return 0 217 | 218 | 219 | def main(): 220 | parser = argparse.ArgumentParser(description="continuous synchronization utility") 221 | subparsers = parser.add_subparsers(help="subcommand help") 222 | 223 | parser_start = subparsers.add_parser( 224 | "start", 225 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 226 | help="start help", 227 | ) 228 | parser_start.add_argument( 229 | "src_path", 230 | metavar="SOURCE_DIRECTORY", 231 | type=str, 232 | help="Source directory or S3 folder to scan.", 233 | ) 234 | parser_start.add_argument( 235 | "target", 236 | metavar="TARGET_COLLECTION", 237 | type=str, 238 | help="Target iRODS collection for data objects (created if non-existent).", 239 | ) 240 | parser_start.add_argument( 241 | "-i", 242 | "--interval", 243 | action="store", 244 | type=int, 245 | default=None, 246 | help="Restart interval (in seconds). 
If absent, will only sync once.", 247 | ) 248 | parser_start.add_argument( 249 | "--file_queue", 250 | action="store", 251 | type=str, 252 | default="file", 253 | help="Name for the file queue.", 254 | ) 255 | parser_start.add_argument( 256 | "--path_queue", 257 | action="store", 258 | type=str, 259 | default="path", 260 | help="Name for the path queue.", 261 | ) 262 | parser_start.add_argument( 263 | "--restart_queue", 264 | action="store", 265 | type=str, 266 | default="restart", 267 | help="Name for the restart queue.", 268 | ) 269 | parser_start.add_argument( 270 | "--event_handler", 271 | action="store", 272 | type=str, 273 | default=None, 274 | help="Path to event handler file", 275 | ) 276 | parser_start.add_argument( 277 | "--job_name", 278 | action="store", 279 | type=str, 280 | default=None, 281 | help="Reference name for ingest job (defaults to generated uuid)", 282 | ) 283 | parser_start.add_argument( 284 | "--ignore_cache", 285 | action="store_true", 286 | default=False, 287 | help="Ignore last sync time in cache - like starting a new sync", 288 | ) 289 | parser_start.add_argument( 290 | "--initial_ingest", 291 | action="store_true", 292 | default=False, 293 | help="Use this flag on initial ingest to avoid check for data object paths already in iRODS.", 294 | ) 295 | parser_start.add_argument( 296 | "--synchronous", 297 | action="store_true", 298 | default=False, 299 | help="Block until sync job is completed.", 300 | ) 301 | parser_start.add_argument( 302 | "--progress", 303 | action="store_true", 304 | default=False, 305 | help="Show progress bar and task counts (must have --synchronous flag).", 306 | ) 307 | parser_start.add_argument( 308 | "--profile", 309 | action="store_true", 310 | default=False, 311 | help="Generate JSON file of system activity profile during ingest.", 312 | ) 313 | parser_start.add_argument( 314 | "--files_per_task", 315 | action="store", 316 | type=int, 317 | default="50", 318 | help="Number of paths to process in a given task on the queue.", 319 | ) 320 | parser_start.add_argument( 321 | "--s3_endpoint_domain", 322 | action="store", 323 | type=str, 324 | default="s3.amazonaws.com", 325 | help="S3 endpoint domain", 326 | ) 327 | parser_start.add_argument( 328 | "--s3_region_name", 329 | action="store", 330 | type=str, 331 | default="us-east-1", 332 | help="S3 region name", 333 | ) 334 | parser_start.add_argument( 335 | "--s3_keypair", 336 | action="store", 337 | type=str, 338 | default=None, 339 | help="Path to S3 keypair file", 340 | ) 341 | parser_start.add_argument( 342 | "--s3_proxy_url", 343 | action="store", 344 | type=str, 345 | default=None, 346 | help="URL to proxy for S3 access", 347 | ) 348 | parser_start.add_argument( 349 | "--s3_insecure_connection", 350 | action="store_true", 351 | default=False, 352 | help="Do not use SSL when connecting to S3 endpoint", 353 | ) 354 | parser_start.add_argument( 355 | "--s3_multipart_chunksize_in_mib", 356 | action="store", 357 | type=int, 358 | default=8, 359 | choices=range(5, 5001), 360 | metavar="[5-5000]", 361 | help="Chunk size in mebibytes for multipart S3 uploads. 
Minimum part size is 5 MiB and the maximum part size is 5000 MiB.", 362 | ) 363 | parser_start.add_argument( 364 | "--exclude_file_type", 365 | nargs=1, 366 | action="store", 367 | default="none", 368 | help="types of files to exclude: regular, directory, character, block, socket, pipe, link", 369 | ) 370 | parser_start.add_argument( 371 | "--exclude_file_name", 372 | type=list, 373 | nargs="+", 374 | action="store", 375 | default="none", 376 | help='a list of space-separated python regular expressions defining the file names to exclude such as "(\S+)exclude" "(\S+)\.hidden"', 377 | ) 378 | parser_start.add_argument( 379 | "--exclude_directory_name", 380 | type=list, 381 | nargs="+", 382 | action="store", 383 | default="none", 384 | help='a list of space-separated python regular expressions defining the directory names to exclude such as "(\S+)exclude" "(\S+)\.hidden"', 385 | ) 386 | parser_start.add_argument( 387 | "--irods_idle_disconnect_seconds", 388 | action="store", 389 | type=int, 390 | default=60, 391 | help="irods disconnect time in seconds", 392 | ) 393 | add_arguments(parser_start) 394 | 395 | parser_start.set_defaults(func=handle_start) 396 | 397 | parser_stop = subparsers.add_parser( 398 | "stop", formatter_class=argparse.ArgumentDefaultsHelpFormatter, help="stop help" 399 | ) 400 | parser_stop.add_argument("job_name", action="store", type=str, help="job name") 401 | add_arguments(parser_stop) 402 | parser_stop.set_defaults(func=handle_stop) 403 | 404 | parser_watch = subparsers.add_parser( 405 | "watch", 406 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 407 | help="watch help", 408 | ) 409 | parser_watch.add_argument("job_name", action="store", type=str, help="job name") 410 | add_arguments(parser_watch) 411 | parser_watch.set_defaults(func=handle_watch) 412 | 413 | parser_list = subparsers.add_parser( 414 | "list", formatter_class=argparse.ArgumentDefaultsHelpFormatter, help="list help" 415 | ) 416 | add_arguments(parser_list) 417 | parser_list.set_defaults(func=handle_list) 418 | 419 | args = parser.parse_args() 420 | sys.exit(args.func(args)) 421 | 422 | 423 | if __name__ == "__main__": 424 | main() 425 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/redis_key.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | import traceback 4 | 5 | MAX_RETRIES = 10 6 | 7 | 8 | # TODO: Consider compression/hashing of key_category and identifier 9 | class redis_key_handle(object): 10 | # def __init__(self, logger, redis_handle, key_category, identifier, delimiter=':/'): 11 | def __init__(self, redis_handle, key_category, identifier, delimiter=":/"): 12 | # self.logger = logger 13 | self.redis_handle = redis_handle 14 | self.category = key_category 15 | self.identifier = identifier 16 | self.delimiter = delimiter 17 | # TODO: Hard-coded value from .utils 18 | 19 | def retry(self, func, *args, max_retries=MAX_RETRIES): 20 | retries = 0 21 | while retries <= max_retries: 22 | try: 23 | res = func(*args) 24 | return res 25 | except Exception as err: 26 | retries += 1 27 | 28 | # logger.info('Retrying. 
retries=' + str(retries), max_retries=max_retries, func=func, args=args, err=err, stacktrace=traceback.extract_tb(err.__traceback__)) 29 | time.sleep(1) 30 | raise RuntimeError("max retries") 31 | 32 | def get_key(self): 33 | return str(self.category + self.delimiter + self.identifier) 34 | 35 | def get_value(self): 36 | if self.get_key() is None: 37 | return None 38 | return self.retry(self.redis_handle.get, self.get_key()) 39 | 40 | def set_value(self, value): 41 | self.retry(self.redis_handle.set, self.get_key(), value) 42 | 43 | def reset(self): 44 | self.retry(self.redis_handle.delete, self.get_key()) 45 | 46 | 47 | class incremental_redis_key_handle(redis_key_handle): 48 | def __init__(self, redis_handle, key_category, identifier, delimiter=":/"): 49 | super().__init__(redis_handle, key_category, identifier, delimiter) 50 | 51 | def get_value(self): 52 | val = super().get_value() 53 | if val is None: 54 | return val 55 | return int(val) 56 | 57 | def incrby(self, amount=1): 58 | self.retry(self.redis_handle.incrby, self.get_key(), amount) 59 | 60 | def incr(self): 61 | self.retry(self.redis_handle.incr, self.get_key()) 62 | 63 | def decrby(self, amount=1): 64 | self.retry(self.redis_handle.decrby, self.get_key(), amount) 65 | 66 | def decr(self): 67 | return self.retry(self.redis_handle.decr, self.get_key()) 68 | 69 | 70 | class json_redis_key_handle(redis_key_handle): 71 | def __init__(self, redis_handle, key_category, identifier, delimiter=":/"): 72 | super().__init__(redis_handle, key_category, identifier, delimiter) 73 | 74 | # def get_value(self): 75 | # return json.loads(self.retry(self.redis_handle.get, self.get_key().decode("utf-8"))) 76 | 77 | 78 | class list_redis_key_handle(redis_key_handle): 79 | def __init__(self, redis_handle, key_category, identifier, delimiter=":/"): 80 | super().__init__(redis_handle, key_category, identifier, delimiter) 81 | 82 | def get_value(self): 83 | val = super().get_value() 84 | if val is None: 85 | return val 86 | return list(val) 87 | 88 | def rpush(self, value): 89 | self.retry(self.redis_handle.rpush, self.get_key(), value) 90 | 91 | def lrange(self, start, end): 92 | return self.retry(self.redis_handle.lrange, self.get_key(), start, end) 93 | 94 | def llen(self): 95 | return self.retry(self.redis_handle.llen, self.get_key()) 96 | 97 | 98 | class float_redis_key_handle(redis_key_handle): 99 | def __init__(self, redis_handle, key_category, identifier, delimiter=":/"): 100 | super().__init__(redis_handle, key_category, identifier, delimiter) 101 | 102 | def get_value(self): 103 | val = super().get_value() 104 | if val is None: 105 | return val 106 | return float(val) 107 | 108 | 109 | # TODO(#292): python metaclasses - see PRC 110 | class sync_time_key_handle(float_redis_key_handle): 111 | """Float indicating the last time path was synced.""" 112 | 113 | def __init__(self, redis_handle, path): 114 | super().__init__(redis_handle, "sync_time", path) 115 | 116 | 117 | class cleanup_key_handle(json_redis_key_handle): 118 | """JSON object with list of event_handlers that need to be cleaned up.""" 119 | 120 | def __init__(self, redis_handle, job_name): 121 | super().__init__(redis_handle, "cleanup", job_name) 122 | 123 | 124 | class stop_key_handle(redis_key_handle): 125 | """Empty string indicating that the job job_name_to_stop is being stopped.""" 126 | 127 | def __init__(self, redis_handle, job_name_to_stop): 128 | super().__init__(redis_handle, "stop", job_name_to_stop) 129 | 130 | def get_value(self): 131 | val = super().get_value() 132 | 
if val is None: 133 | return val 134 | return str(val) 135 | 136 | 137 | class tasks_key_handle(incremental_redis_key_handle): 138 | """Integer indicating the task count for job_name.""" 139 | 140 | def __init__(self, redis_handle, job_name): 141 | super().__init__(redis_handle, "tasks", job_name) 142 | 143 | 144 | class count_key_handle(list_redis_key_handle): 145 | """List of task IDs associated with job_name.""" 146 | 147 | def __init__(self, redis_handle, job_name): 148 | super().__init__(redis_handle, "count", job_name) 149 | 150 | 151 | # TODO: What is the difference between this list and the set of stop_keys? 152 | class dequeue_key_handle(list_redis_key_handle): 153 | """List of tasks for a particular job_name.""" 154 | 155 | def __init__(self, redis_handle, job_name): 156 | super().__init__(redis_handle, "dequeue", job_name) 157 | 158 | 159 | class failures_key_handle(incremental_redis_key_handle): 160 | """Integer indicating the count of failed tasks for job_name.""" 161 | 162 | def __init__(self, redis_handle, job_name): 163 | super().__init__(redis_handle, "failures", job_name) 164 | 165 | 166 | class retries_key_handle(incremental_redis_key_handle): 167 | """Integer indicating the count of tasks which were retried for job_name.""" 168 | 169 | def __init__(self, redis_handle, job_name): 170 | super().__init__(redis_handle, "retries", job_name) 171 | 172 | 173 | class stopped_jobs_key_handle(json_redis_key_handle): 174 | """JSON object with list of sync_job dicts.""" 175 | 176 | def __init__(self, redis_handle): 177 | super().__init__(redis_handle, "irods_ingest_stopped_jobs", "") 178 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/redis_utils.py: -------------------------------------------------------------------------------- 1 | from redis import StrictRedis, ConnectionPool 2 | 3 | redis_connection_pool_map = {} 4 | 5 | 6 | def get_redis(config): 7 | redis_config = config["redis"] 8 | host = redis_config["host"] 9 | port = redis_config["port"] 10 | db = redis_config["db"] 11 | url = "redis://" + host + ":" + str(port) + "/" + str(db) 12 | pool = redis_connection_pool_map.get(url) 13 | if pool is None: 14 | pool = ConnectionPool(host=host, port=port, db=db) 15 | redis_connection_pool_map[url] = pool 16 | 17 | return StrictRedis(connection_pool=pool) 18 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/sync_actions.py: -------------------------------------------------------------------------------- 1 | from . 
import sync_logging 2 | from .irods import irods_utils 3 | from .redis_key import redis_key_handle 4 | from .redis_utils import get_redis 5 | from .sync_job import get_stopped_jobs_list, sync_job 6 | from .tasks import filesystem_tasks, s3_bucket_tasks 7 | 8 | from os.path import realpath 9 | from uuid import uuid1 10 | import json 11 | import progressbar 12 | import redis_lock 13 | import textwrap 14 | import time 15 | import uuid 16 | 17 | uuid_ = uuid.uuid4().hex 18 | 19 | 20 | def stop_job(job_name, config): 21 | logger = sync_logging.get_sync_logger(config["log"]) 22 | r = get_redis(config) 23 | with redis_lock.Lock(r, "lock:periodic"): 24 | job = sync_job(job_name, r) 25 | if job.cleanup_handle().get_value() is None: 26 | logger.error("job [{0}] does not exist".format(job_name)) 27 | raise Exception("job [{0}] does not exist".format(job_name)) 28 | job.stop() 29 | 30 | 31 | def list_jobs(config): 32 | r = get_redis(config) 33 | with redis_lock.Lock(r, "lock:periodic"): 34 | periodic_jobs = list( 35 | map(lambda job_id: job_id.decode("utf-8"), r.lrange("periodic", 0, -1)) 36 | ) 37 | singlepass_jobs = list( 38 | map(lambda job_id: job_id.decode("utf-8"), r.lrange("singlepass", 0, -1)) 39 | ) 40 | jobs_map = { 41 | "periodic": [sync_job(job_name, r).asdict() for job_name in periodic_jobs], 42 | "singlepass": [ 43 | sync_job(job_name, r).asdict() for job_name in singlepass_jobs 44 | ], 45 | "stopped": get_stopped_jobs_list(r), 46 | } 47 | return jobs_map 48 | 49 | 50 | def monitor_job(job_name, progress, config): 51 | logger = sync_logging.get_sync_logger(config["log"]) 52 | job = sync_job(job_name, get_redis(config)) 53 | if job.cleanup_handle().get_value() is None: 54 | logger.error("job [{0}] does not exist".format(job.name())) 55 | raise Exception("job [{0}] does not exist".format(job.name())) 56 | try: 57 | if not progress: 58 | while not job.done() or job.periodic(): 59 | time.sleep(1) 60 | if job.stopped(): 61 | logger.warning( 62 | f"Job [{job.name()}] was stopped and may not have finished." 63 | ) 64 | failures = job.failures_handle().get_value() 65 | if failures is not None and failures != 0: 66 | return -1 67 | return 0 68 | start_time = job.start_time_handle().get_value() 69 | if start_time is None: 70 | logger.error( 71 | f"Job [{job.name()}] has no start time. Cannot display progress." 
72 | ) 73 | return -1 74 | widgets = [ 75 | " [", 76 | progressbar.Variable("timer"), 77 | "] ", 78 | progressbar.Bar(), 79 | " (", 80 | progressbar.ETA(), 81 | ") ", 82 | progressbar.Variable("total"), 83 | " ", 84 | progressbar.Variable("remaining"), 85 | " ", 86 | progressbar.Variable("failed"), 87 | " ", 88 | progressbar.Variable("retried"), 89 | ] 90 | with progressbar.ProgressBar( 91 | max_value=1, widgets=widgets, redirect_stdout=True, redirect_stderr=True 92 | ) as bar: 93 | 94 | def update_pbar(): 95 | job_info = job.asdict() 96 | total_tasks = job_info["total_tasks"] 97 | remaining_tasks = job_info["remaining_tasks"] 98 | if total_tasks == 0: 99 | percentage = 0 100 | else: 101 | percentage = max( 102 | 0, min(1, (total_tasks - remaining_tasks) / total_tasks) 103 | ) 104 | bar.update( 105 | percentage, 106 | timer=job_info["elapsed_time"], 107 | total=total_tasks, 108 | remaining=remaining_tasks, 109 | failed=job_info["failed_tasks"], 110 | retried=job_info["retried_tasks"], 111 | ) 112 | 113 | while not job.done() or job.periodic(): 114 | update_pbar() 115 | time.sleep(1) 116 | if job.stopped(): 117 | logger.warning( 118 | f"Job [{job.name()}] was stopped and may not have finished." 119 | ) 120 | else: 121 | update_pbar() 122 | failures = job.failures_handle().get_value() 123 | if failures is not None and failures != 0: 124 | return -1 125 | else: 126 | return 0 127 | except KeyboardInterrupt: 128 | logger.info(f"KeyboardInterrupt stopped monitoring of job [{job.name()}].") 129 | return 0 130 | 131 | 132 | def start_job(data): 133 | config = data["config"] 134 | logging_config = config["log"] 135 | src_path = data["src_path"] 136 | job_name = data["job_name"] 137 | interval = data["interval"] 138 | restart_queue = data["restart_queue"] 139 | sychronous = data["synchronous"] 140 | progress = data["progress"] 141 | s3_region_name = data["s3_region_name"] 142 | s3_endpoint_domain = data["s3_endpoint_domain"] 143 | s3_keypair = data["s3_keypair"] 144 | s3_multipart_chunksize = data["s3_multipart_chunksize_in_mib"] 145 | logger = sync_logging.get_sync_logger(logging_config) 146 | data_copy = data.copy() 147 | 148 | if s3_keypair is not None: 149 | with open(s3_keypair) as f: 150 | data_copy["s3_access_key"] = f.readline().rstrip() 151 | data_copy["s3_secret_key"] = f.readline().rstrip() 152 | # set source 153 | src_abs = src_path 154 | main_task = s3_bucket_tasks.s3_bucket_main_task 155 | else: 156 | src_abs = realpath(src_path) 157 | main_task = filesystem_tasks.filesystem_main_task 158 | 159 | data_copy["root"] = src_abs 160 | data_copy["path"] = src_abs 161 | 162 | irods_utils.validate_target_collection(data_copy, logger) 163 | 164 | def store_event_handler(data, job): 165 | event_handler = data.get("event_handler") 166 | event_handler_data = data.get("event_handler_data") 167 | event_handler_path = data.get("event_handler_path") 168 | 169 | # investigate -- kubernetes 170 | if ( 171 | event_handler is None 172 | and event_handler_path is not None 173 | and event_handler_data is not None 174 | ): 175 | event_handler = "event_handler" + uuid1().hex 176 | hdlr2 = event_handler_path + "/" + event_handler + ".py" 177 | with open(hdlr2, "w") as f: 178 | f.write(event_handler_data) 179 | cleanup_list = [hdlr2.encode("utf-8")] 180 | data["event_handler"] = event_handler 181 | # if no argument is given, use default event_handler 182 | elif event_handler is None: 183 | # constructing redis_key and putting default event_handler into redis 184 | uuid_ = uuid.uuid4().hex 185 | 
event_handler_key = redis_key_handle( 186 | r, "custom_event_handler", job.name() + "::" + uuid_ 187 | ) 188 | content_string = textwrap.dedent( 189 | """ 190 | from irods_capability_automated_ingest.core import Core 191 | from irods_capability_automated_ingest.utils import Operation, DeleteMode 192 | class event_handler(Core): 193 | @staticmethod 194 | def operation(session, meta, *args, **options): 195 | return Operation.REGISTER_SYNC 196 | 197 | @staticmethod 198 | def delete_mode(meta): 199 | return DeleteMode.DO_NOT_DELETE""" 200 | ) 201 | event_handler_key.set_value(content_string) 202 | 203 | # putting redis_key into meta map 204 | data_copy["event_handler_key"] = event_handler_key.get_key() 205 | 206 | cleanup_list = [] 207 | else: 208 | # constructing redis_key and putting custom_event_handler into redis 209 | with open(event_handler, "r") as f: 210 | content_string = f.read() 211 | 212 | uuid_ = uuid.uuid4().hex 213 | event_handler_key = redis_key_handle( 214 | r, "custom_event_handler", job.name() + "::" + uuid_ 215 | ) 216 | event_handler_key.set_value(content_string) 217 | 218 | # putting redis_key into meta map 219 | data_copy["event_handler_key"] = event_handler_key.get_key() 220 | 221 | cleanup_list = [] 222 | job.cleanup_handle().set_value(json.dumps(cleanup_list)) 223 | 224 | r = get_redis(config) 225 | job = sync_job.from_meta(data_copy) 226 | with redis_lock.Lock(r, "lock:periodic"): 227 | if job.cleanup_handle().get_value() is not None: 228 | logger.error("job {0} already exists".format(job_name)) 229 | raise Exception("job {0} already exists".format(job_name)) 230 | 231 | store_event_handler(data_copy, job) 232 | 233 | if interval is not None: 234 | r.rpush("periodic", job_name.encode("utf-8")) 235 | 236 | main_task.s(data_copy).apply_async(queue=restart_queue, task_id=job_name) 237 | else: 238 | r.rpush("singlepass", job_name.encode("utf-8")) 239 | if not sychronous: 240 | main_task.s(data_copy).apply_async(queue=restart_queue) 241 | else: 242 | res = main_task.s(data_copy).apply() 243 | if res.failed(): 244 | print(res.traceback) 245 | job.cleanup() 246 | return -1 247 | else: 248 | return monitor_job(job_name, progress, config) 249 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/sync_job.py: -------------------------------------------------------------------------------- 1 | from . import redis_key 2 | from .celery import app 3 | from .redis_utils import get_redis 4 | 5 | import datetime 6 | import json 7 | import os 8 | import progressbar 9 | import time 10 | 11 | 12 | def add_stopped_job(redis_handle, stopped_job_dict): 13 | """Add the sync_job dict to the JSON array of stopped jobs tracked in the Redis database.""" 14 | stopped_jobs_handle = redis_key.stopped_jobs_key_handle(redis_handle) 15 | stopped_jobs_value = stopped_jobs_handle.get_value() 16 | if stopped_jobs_value is None: 17 | stopped_jobs_list = [] 18 | else: 19 | stopped_jobs_list = json.loads(stopped_jobs_value.decode("utf-8")) 20 | stopped_jobs_list.append(stopped_job_dict) 21 | # TODO(#297): Is it really the caller's responsibility to dump to a string? 22 | # stopped_jobs_handle is a json_redis_key_handle, so it ought to handle this for us... 
23 | stopped_jobs_handle.set_value(json.dumps(stopped_jobs_list)) 24 | 25 | 26 | def get_stopped_jobs_list(redis_handle): 27 | """Get the JSON array of stopped jobs tracked in the Redis database.""" 28 | stopped_jobs_value = redis_key.stopped_jobs_key_handle(redis_handle).get_value() 29 | if stopped_jobs_value is None: 30 | return [] 31 | return json.loads(stopped_jobs_value.decode("utf-8")) 32 | 33 | 34 | class sync_job(object): 35 | def __init__(self, job_name, redis_handle): 36 | self.job_name = job_name 37 | self.r = redis_handle 38 | 39 | @classmethod 40 | def from_meta(cls, meta): 41 | return cls(meta["job_name"], get_redis(meta["config"])) 42 | 43 | def name(self): 44 | return self.job_name 45 | 46 | def count_handle(self): 47 | return redis_key.count_key_handle(self.r, self.job_name) 48 | 49 | def dequeue_handle(self): 50 | return redis_key.dequeue_key_handle(self.r, self.job_name) 51 | 52 | def tasks_handle(self): 53 | return redis_key.tasks_key_handle(self.r, self.job_name) 54 | 55 | def failures_handle(self): 56 | return redis_key.failures_key_handle(self.r, self.job_name) 57 | 58 | def retries_handle(self): 59 | return redis_key.retries_key_handle(self.r, self.job_name) 60 | 61 | def stop_handle(self): 62 | return redis_key.stop_key_handle(self.r, self.job_name) 63 | 64 | def cleanup_handle(self): 65 | return redis_key.cleanup_key_handle(self.r, self.job_name) 66 | 67 | def done(self): 68 | task_count = self.tasks_handle().get_value() 69 | return task_count is None or task_count == 0 70 | 71 | def periodic(self): 72 | periodic_list = self.r.lrange("periodic", 0, -1) 73 | return self.job_name.encode("utf-8") in periodic_list 74 | 75 | def cleanup(self): 76 | # hdlr = get_with_key(r, cleanup_key, job_name, lambda bs: json.loads(bs.decode("utf-8"))) 77 | cleanup_list = self.cleanup_handle().get_value() 78 | if cleanup_list is not None: 79 | file_list = json.loads(cleanup_list.decode("utf-8")) 80 | for f in file_list: 81 | os.remove(f) 82 | 83 | if self.periodic(): 84 | self.r.lrem("periodic", 1, self.job_name) 85 | else: 86 | self.r.lrem("singlepass", 1, self.job_name) 87 | 88 | self.cleanup_handle().reset() 89 | 90 | def reset(self): 91 | self.count_handle().reset() 92 | self.dequeue_handle().reset() 93 | self.tasks_handle().reset() 94 | self.failures_handle().reset() 95 | self.retries_handle().reset() 96 | self.start_time_handle().reset() 97 | 98 | def interrupt(self, cli=True, terminate=True): 99 | self.stop_handle().set_value("") 100 | queued_tasks = list( 101 | map(lambda x: x.decode("utf-8"), self.count_handle().lrange(0, -1)) 102 | ) 103 | dequeued_tasks = set( 104 | map(lambda x: x.decode("utf-8"), self.dequeue_handle().lrange(0, -1)) 105 | ) 106 | 107 | tasks = [item for item in queued_tasks if item not in dequeued_tasks] 108 | if cli: 109 | tasks = progressbar.progressbar(tasks, max_value=len(tasks)) 110 | 111 | # stop active tasks for this job 112 | for task in tasks: 113 | app.control.revoke(task, terminate=terminate) 114 | 115 | # stop restart job 116 | app.control.revoke(self.job_name) 117 | self.stop_handle().reset() 118 | 119 | def start_time_handle(self): 120 | return redis_key.float_redis_key_handle( 121 | self.r, "irods_ingest_job_start_time", self.job_name 122 | ) 123 | 124 | def stop(self): 125 | add_stopped_job(self.r, self.asdict()) 126 | self.interrupt() 127 | self.cleanup() 128 | self.reset() 129 | 130 | def stopped(self): 131 | stopped_jobs_list = get_stopped_jobs_list(self.r) 132 | for job in stopped_jobs_list: 133 | if self.job_name == job["job_name"]: 
134 | return True 135 | return False 136 | 137 | def asdict(self): 138 | start_time = self.start_time_handle().get_value() or 0 139 | formatted_start_time = datetime.datetime.fromtimestamp( 140 | start_time, tz=datetime.timezone.utc 141 | ).isoformat(timespec="milliseconds") 142 | elapsed_time = time.time() - start_time if start_time else 0 143 | elapsed_time_str = str(datetime.timedelta(milliseconds=elapsed_time * 1000)) 144 | tasks = int(self.tasks_handle().get_value() or 0) 145 | total = self.count_handle().llen() 146 | failures = int(self.failures_handle().get_value() or 0) 147 | retries = int(self.retries_handle().get_value() or 0) 148 | return { 149 | "job_name": self.job_name, 150 | "total_tasks": total, 151 | "remaining_tasks": tasks, 152 | "failed_tasks": failures, 153 | "retried_tasks": retries, 154 | "elapsed_time": elapsed_time_str, 155 | "start_time": formatted_start_time, 156 | } 157 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/sync_logging.py: -------------------------------------------------------------------------------- 1 | import structlog 2 | import logging 3 | import logging.handlers 4 | from structlog import wrap_logger 5 | import datetime 6 | import time 7 | import sys 8 | 9 | irods_sync_logger = "irods_sync" 10 | 11 | 12 | def timestamper(logger, log_method, event_dict): 13 | utc_offset_sec = time.altzone if time.localtime().tm_isdst else time.timezone 14 | utc_offset = datetime.timedelta(seconds=-utc_offset_sec) 15 | event_dict["@timestamp"] = ( 16 | datetime.datetime.now() 17 | .replace(tzinfo=datetime.timezone(offset=utc_offset)) 18 | .isoformat(timespec="milliseconds") 19 | ) 20 | return event_dict 21 | 22 | 23 | logger_map = {} 24 | 25 | 26 | def create_sync_logger(logging_config): 27 | log_file = logging_config["filename"] 28 | when = logging_config["when"] 29 | interval = logging_config["interval"] 30 | level = logging_config["level"] 31 | 32 | logger = logging.getLogger( 33 | irods_sync_logger + "/" + get_sync_logger_key(logging_config) 34 | ) 35 | logger.propagate = False 36 | 37 | # logger = get_task_logger(irods_sync_logger) 38 | 39 | if level is not None: 40 | logger.setLevel(logging.getLevelName(level)) 41 | 42 | if log_file is not None: 43 | if when is not None: 44 | handler = logging.handlers.TimedRotatingFileHandler( 45 | log_file, when=when, interval=interval 46 | ) 47 | else: 48 | handler = logging.FileHandler(log_file) 49 | else: 50 | handler = logging.StreamHandler(sys.stdout) 51 | logger.addHandler(handler) 52 | 53 | return wrap_logger( 54 | logger, 55 | processors=[ 56 | structlog.stdlib.filter_by_level, 57 | structlog.stdlib.add_logger_name, 58 | structlog.stdlib.add_log_level, 59 | timestamper, 60 | structlog.processors.JSONRenderer(), 61 | ], 62 | ) 63 | 64 | 65 | def get_sync_logger(logging_config): 66 | key = get_sync_logger_key(logging_config) 67 | logger = logger_map.get(key) 68 | if logger is None: 69 | logger = create_sync_logger(logging_config) 70 | logger_map[key] = logger 71 | 72 | return logger 73 | 74 | 75 | def get_sync_logger_key(logging_config): 76 | filename = logging_config["filename"] 77 | if filename is None: 78 | filename = "" 79 | level = logging_config["level"] 80 | if level is None: 81 | level = "" 82 | return filename + "/" + level 83 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/task_queue.py: -------------------------------------------------------------------------------- 1 
| from . import sync_logging 2 | from .sync_job import sync_job 3 | from .custom_event_handler import custom_event_handler 4 | from uuid import uuid1 5 | 6 | 7 | class task_queue(object): 8 | def __init__(self, name): 9 | self.name = name 10 | 11 | def name(self): 12 | return self.name 13 | 14 | def add(self, task, meta): 15 | logger = sync_logging.get_sync_logger(meta["config"]["log"]) 16 | job = sync_job.from_meta(meta) 17 | if job.stop_handle().get_value() is None: 18 | logger.info( 19 | "incr_job_name", 20 | task=meta["task"], 21 | path=meta["path"], 22 | job_name=job.name(), 23 | ) 24 | job.tasks_handle().incr() 25 | task_id = str(uuid1()) 26 | timeout = custom_event_handler(meta).timeout() 27 | job.count_handle().rpush(task_id) 28 | task.s(meta).apply_async( 29 | queue=self.name(), task_id=task_id, soft_time_limit=timeout 30 | ) 31 | else: 32 | # A job by this name is currently being stopped 33 | logger.info( 34 | "async_job_name_stopping", 35 | task=meta["task"], 36 | path=meta["path"], 37 | job_name=job.name(), 38 | ) 39 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irods/irods_capability_automated_ingest/da87d89acab7136ac5610195b0cc5e5abd983795/irods_capability_automated_ingest/tasks/__init__.py -------------------------------------------------------------------------------- /irods_capability_automated_ingest/tasks/delete_tasks.py: -------------------------------------------------------------------------------- 1 | from .. import sync_logging 2 | from ..celery import app, RestartTask 3 | from ..custom_event_handler import custom_event_handler 4 | from ..irods import irods_utils 5 | from ..utils import enqueue_task 6 | from .irods_task import IrodsTask 7 | 8 | from irods.exception import ( 9 | CollectionDoesNotExist, 10 | DataObjectDoesNotExist, 11 | PycommandsException, 12 | ) 13 | 14 | 15 | def schedule_collections_for_removal(meta, list_of_collections_to_delete): 16 | if 0 == len(list_of_collections_to_delete): 17 | # This could be considered an error, but let's just treat it as a no-op. 18 | return 19 | meta_for_task = meta.copy() 20 | meta_for_task["queue_name"] = meta["path_queue"] 21 | meta_for_task["task"] = "delete_collection" 22 | for collection in list_of_collections_to_delete: 23 | meta_for_task["path"] = collection.path 24 | meta_for_task["target_collection"] = collection.path 25 | enqueue_task(delete_collection, meta_for_task) 26 | 27 | 28 | def schedule_data_objects_for_removal(meta, list_of_objects_to_delete): 29 | if 0 == len(list_of_objects_to_delete): 30 | # This could be considered an error, but let's just treat it as a no-op. 
31 | return 32 | meta_for_task = meta.copy() 33 | meta_for_task["queue_name"] = meta["file_queue"] 34 | meta_for_task["task"] = "delete_data_objects" 35 | removal_chunk = [] 36 | chunk_size = meta_for_task.get("files_per_task", 50) 37 | for obj in list_of_objects_to_delete: 38 | removal_chunk.append(obj.path) 39 | if len(removal_chunk) == chunk_size: 40 | meta_for_task["data_objects_to_delete"] = removal_chunk 41 | enqueue_task(delete_data_objects, meta_for_task) 42 | removal_chunk = [] 43 | if len(removal_chunk) > 0: 44 | meta_for_task["data_objects_to_delete"] = removal_chunk 45 | enqueue_task(delete_data_objects, meta_for_task) 46 | removal_chunk = [] 47 | 48 | 49 | @app.task(base=RestartTask) 50 | def delete_collection_task(meta): 51 | logical_path = meta["target_collection"] 52 | meta_for_task = meta.copy() 53 | meta_for_task["queue_name"] = meta["path_queue"] 54 | meta_for_task["task"] = "delete_collection" 55 | meta_for_task["path"] = logical_path 56 | meta_for_task["target_collection"] = logical_path 57 | enqueue_task(delete_collection, meta_for_task) 58 | 59 | 60 | @app.task(bind=True, base=IrodsTask) 61 | def delete_collection(self, meta): 62 | config = meta["config"] 63 | logging_config = config["log"] 64 | logger = sync_logging.get_sync_logger(logging_config) 65 | event_handler = custom_event_handler(meta) 66 | logical_path = meta["target_collection"] 67 | session = irods_utils.irods_session(event_handler.get_module(), meta, logger) 68 | meta_for_task = meta.copy() 69 | meta_for_task["task"] = "delete_collection" 70 | try: 71 | target_collection = session.collections.get(logical_path) 72 | except CollectionDoesNotExist: 73 | # Print an error message here because the exception doesn't tell you what doesn't exist. 74 | logger.error(f"Collection [{logical_path}] does not exist.") 75 | raise 76 | if 0 == len(target_collection.data_objects) and 0 == len( 77 | target_collection.subcollections 78 | ): 79 | logger.debug(f"Removing empty collection [{target_collection.path}].") 80 | meta_for_task["target"] = target_collection.path 81 | irods_utils.delete_collection(event_handler.get_module(), meta_for_task) 82 | return 83 | if meta.get("only_delete_collection"): 84 | logger.info( 85 | f"Collection [{logical_path}] could not be removed because it is not empty." 86 | ) 87 | return 88 | meta_for_task["delete_empty_parent_collection"] = target_collection.path 89 | # The subcollections should be scheduled for removal before the data objects because there could be deep 90 | # subcollections with many data objects. 91 | schedule_collections_for_removal(meta_for_task, target_collection.subcollections) 92 | # This instructs each task which deletes data objects to attempt to remove the parent collection. If this is not 93 | # done, the parent collection could remain after everything else has been removed in the parent collection. 94 | schedule_data_objects_for_removal(meta_for_task, target_collection.data_objects) 95 | # This collection does not schedule itself for removal, nor does it attempt to synchronously remove itself here. 96 | # This is because removing the subcollections and data objects are in asynchronous tasks which might take a very 97 | # long time to complete. As such, removal of the parent collection has been delegated to those tasks. The last task 98 | # to complete should remove the parent collection, whether it's a data object removal or a subcollection removal. 
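# Illustrative sketch of that fan-out (hypothetical logical paths, not taken from any
# test in this repository): removing an out-of-date collection /tempZone/home/rods/stale
# that still holds one subcollection and a handful of data objects proceeds roughly as
#
#     delete_collection("/tempZone/home/rods/stale")
#         schedule_collections_for_removal(...)   # one delete_collection task per subcollection
#         schedule_data_objects_for_removal(...)  # delete_data_objects tasks, up to files_per_task paths each
#
# and, as described above, whichever child task finishes last ends up removing the
# (by then empty) parent collection.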
99 | 100 | 101 | @app.task(bind=True, base=IrodsTask) 102 | def delete_data_objects(self, meta): 103 | config = meta["config"] 104 | logging_config = config["log"] 105 | logger = sync_logging.get_sync_logger(logging_config) 106 | meta_for_task = meta.copy() 107 | meta_for_task["task"] = "delete_data_object" 108 | logical_paths = meta_for_task["data_objects_to_delete"] 109 | if 0 == len(logical_paths): 110 | logger.warning("No data objects specified for removal - nothing to do.") 111 | return 112 | event_handler = custom_event_handler(meta) 113 | for logical_path in logical_paths: 114 | try: 115 | meta_for_task["target"] = logical_path 116 | irods_utils.delete_data_object(event_handler.get_module(), meta_for_task) 117 | except DataObjectDoesNotExist: 118 | logger.error( 119 | f"Data object [{logical_path}] does not exist, so it cannot be deleted." 120 | ) 121 | continue 122 | except PycommandsException as e: 123 | logger.error( 124 | f"Exception occurred while removing data object [{logical_path}]: {e}" 125 | ) 126 | continue 127 | # Synchronously attempt to delete the parent collection. Another task may have already done this depending on the 128 | # order of completion, or the collection may not be empty yet because there are more things being deleted. The 129 | # parent collection will be deleted either by a data object removal task or a subcollection removal task. 130 | parent_collection_path = meta.get("delete_empty_parent_collection") 131 | if parent_collection_path: 132 | logger.debug( 133 | f"Attempting to delete parent collection [{parent_collection_path}]." 134 | ) 135 | meta_for_delete = meta.copy() 136 | meta_for_delete["target"] = parent_collection_path 137 | try: 138 | irods_utils.delete_collection(event_handler.get_module(), meta_for_delete) 139 | except CollectionDoesNotExist: 140 | logger.warning( 141 | f"Failed to delete parent collection [{parent_collection_path}]: it no longer exists." 142 | ) 143 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/tasks/irods_task.py: -------------------------------------------------------------------------------- 1 | from .. 
import custom_event_handler, sync_logging 2 | from ..celery import app 3 | from ..sync_job import sync_job 4 | 5 | import traceback 6 | 7 | 8 | class IrodsTask(app.Task): 9 | def on_failure(self, exc, task_id, args, kwargs, einfo): 10 | meta = args[0] 11 | config = meta["config"] 12 | job = sync_job.from_meta(meta) 13 | logger = sync_logging.get_sync_logger(config["log"]) 14 | logger.error( 15 | "failed_task", 16 | task=meta["task"], 17 | path=meta["path"], 18 | job_name=job.name(), 19 | task_id=task_id, 20 | exc=exc, 21 | einfo=einfo, 22 | traceback=traceback.extract_tb(exc.__traceback__), 23 | ) 24 | job.failures_handle().incr() 25 | 26 | def on_retry(self, exc, task_id, args, kwargs, einfo): 27 | meta = args[0] 28 | config = meta["config"] 29 | job = sync_job.from_meta(meta) 30 | logger = sync_logging.get_sync_logger(config["log"]) 31 | logger.warning( 32 | "retry_task", 33 | task=meta["task"], 34 | path=meta["path"], 35 | job_name=job.name(), 36 | task_id=task_id, 37 | exc=exc, 38 | einfo=einfo, 39 | traceback=traceback.extract_tb(exc.__traceback__), 40 | ) 41 | job.retries_handle().incr() 42 | 43 | def on_success(self, retval, task_id, args, kwargs): 44 | meta = args[0] 45 | config = meta["config"] 46 | logger = sync_logging.get_sync_logger(config["log"]) 47 | job_name = meta["job_name"] 48 | logger.info( 49 | "succeeded_task", 50 | task=meta["task"], 51 | path=meta["path"], 52 | job_name=job_name, 53 | task_id=task_id, 54 | retval=retval, 55 | ) 56 | 57 | def after_return(self, status, retval, task_id, args, kwargs, einfo): 58 | meta = args[0] 59 | config = meta["config"] 60 | job = sync_job.from_meta(meta) 61 | logger = sync_logging.get_sync_logger(config["log"]) 62 | logger.info( 63 | "decr_job_name", 64 | task=meta["task"], 65 | path=meta["path"], 66 | job_name=job.name(), 67 | task_id=task_id, 68 | retval=retval, 69 | ) 70 | 71 | done = job.tasks_handle().decr() == 0 and not job.periodic() 72 | if done: 73 | job.cleanup() 74 | 75 | job.dequeue_handle().rpush(task_id) 76 | 77 | if done: 78 | event_handler = custom_event_handler.custom_event_handler(meta) 79 | if event_handler.hasattr("post_job"): 80 | module = event_handler.get_module() 81 | module.post_job(module, logger, meta) 82 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/tasks/s3_bucket_tasks.py: -------------------------------------------------------------------------------- 1 | from .. 
import sync_logging, utils 2 | from ..celery import app, RestartTask 3 | from ..char_map_util import translate_path 4 | from ..custom_event_handler import custom_event_handler 5 | from ..irods import s3_bucket 6 | from ..redis_key import sync_time_key_handle 7 | from ..redis_utils import get_redis 8 | from ..sync_job import sync_job 9 | from ..utils import enqueue_task, is_unicode_encode_error_path 10 | from .irods_task import IrodsTask 11 | 12 | from billiard import current_process 13 | from minio import Minio 14 | 15 | import base64 16 | import datetime 17 | import os 18 | import re 19 | import redis_lock 20 | import stat 21 | import time 22 | import traceback 23 | 24 | 25 | @app.task(base=RestartTask) 26 | def s3_bucket_main_task(meta): 27 | # Start periodic job on restart_queue 28 | job_name = meta["job_name"] 29 | restart_queue = meta["restart_queue"] 30 | interval = meta["interval"] 31 | if interval is not None: 32 | s3_bucket_main_task.s(meta).apply_async( 33 | task_id=job_name, queue=restart_queue, countdown=interval 34 | ) 35 | 36 | # Continue with singlepass job 37 | config = meta["config"] 38 | logging_config = config["log"] 39 | logger = sync_logging.get_sync_logger(logging_config) 40 | try: 41 | event_handler = custom_event_handler(meta) 42 | if event_handler.hasattr("pre_job"): 43 | module = event_handler.get_module() 44 | module.pre_job(module, logger, meta) 45 | 46 | logger.info("***************** restart *****************") 47 | job = sync_job.from_meta(meta) 48 | if not job.periodic() or job.done(): 49 | logger.info( 50 | "no tasks for this job and worker handling this task is not busy" 51 | ) 52 | 53 | job.reset() 54 | job.start_time_handle().set_value(time.time()) 55 | meta = meta.copy() 56 | meta["task"] = "s3_bucket_sync_path" 57 | meta["queue_name"] = meta["path_queue"] 58 | enqueue_task(s3_bucket_sync_path, meta) 59 | else: 60 | logger.info("tasks exist for this job or worker handling this task is busy") 61 | 62 | except OSError as err: 63 | logger.warning( 64 | "Warning: " + str(err), traceback=traceback.extract_tb(err.__traceback__) 65 | ) 66 | 67 | except Exception as err: 68 | logger.error( 69 | "Unexpected error: " + str(err), 70 | traceback=traceback.extract_tb(err.__traceback__), 71 | ) 72 | raise 73 | 74 | 75 | @app.task(bind=True, base=IrodsTask) 76 | def s3_bucket_sync_path(self, meta): 77 | path = meta["path"] 78 | config = meta["config"] 79 | logging_config = config["log"] 80 | 81 | logger = sync_logging.get_sync_logger(logging_config) 82 | 83 | event_handler = custom_event_handler(meta) 84 | 85 | proxy_url = meta.get("s3_proxy_url") 86 | if proxy_url is None: 87 | httpClient = None 88 | else: 89 | import urllib3 90 | 91 | httpClient = urllib3.ProxyManager( 92 | proxy_url, 93 | timeout=urllib3.Timeout.DEFAULT_TIMEOUT, 94 | cert_reqs="CERT_REQUIRED", 95 | retries=urllib3.Retry( 96 | total=5, backoff_factor=0.2, status_forcelist=[500, 502, 503, 504] 97 | ), 98 | ) 99 | endpoint_domain = meta.get("s3_endpoint_domain") 100 | s3_access_key = meta.get("s3_access_key") 101 | s3_secret_key = meta.get("s3_secret_key") 102 | s3_secure_connection = meta.get("s3_secure_connection", True) 103 | client = Minio( 104 | endpoint_domain, 105 | access_key=s3_access_key, 106 | secret_key=s3_secret_key, 107 | secure=s3_secure_connection, 108 | http_client=httpClient, 109 | ) 110 | 111 | try: 112 | logger.info("walk dir", path=path) 113 | # TODO: Remove shadowing here - use a different name 114 | meta = meta.copy() 115 | meta["task"] = "s3_bucket_sync_dir" 116 | chunk = {} 
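# (Illustrative shape only, with a hypothetical object name: each entry added to
# "chunk" further below maps a full S3 object path to a small stat-like dict, e.g.
#     chunk["prefix/data/file1.txt"] = {"is_link": False, "is_socket": False,
#                                       "mtime": 1700000000.0, "ctime": 1700000000.0,
#                                       "size": 1024}
# and a chunk is handed off to s3_bucket_sync_files once it reaches files_per_task entries.)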
117 | 118 | # Check to see whether the provided operation and delete_mode are compatible. 119 | delete_mode = event_handler.delete_mode() 120 | logger.debug(f"delete_mode: {delete_mode}") 121 | # TODO(#282): S3 bucket syncs do not support DeleteMode (yet) 122 | if utils.DeleteMode.DO_NOT_DELETE != delete_mode: 123 | raise RuntimeError( 124 | f"S3 bucket syncs do not support DeleteMode [{delete_mode}]. Only DeleteMode.DO_NOT_DELETE is supported." 125 | ) 126 | 127 | path_list = meta["path"].lstrip("/").split("/", 1) 128 | bucket_name = path_list[0] 129 | if len(path_list) == 1: 130 | prefix = "" 131 | else: 132 | prefix = path_list[1] 133 | meta["root"] = bucket_name 134 | meta["s3_prefix"] = prefix 135 | itr = client.list_objects(bucket_name, prefix=prefix, recursive=True) 136 | 137 | if meta["profile"]: 138 | profile_log = config.get("profile") 139 | profile_logger = sync_logging.get_sync_logger(profile_log) 140 | task_id = self.request.id 141 | 142 | profile_logger.info( 143 | "list_dir_prerun", 144 | event_id=task_id + ":list_dir", 145 | event_name="list_dir", 146 | hostname=self.request.hostname, 147 | index=current_process().index, 148 | ) 149 | itr = list(itr) 150 | if meta["profile"]: 151 | profile_logger.info( 152 | "list_dir_postrun", 153 | event_id=task_id + ":list_dir", 154 | event_name="list_dir", 155 | hostname=self.request.hostname, 156 | index=current_process().index, 157 | ) 158 | 159 | exclude_file_name = meta["exclude_file_name"] 160 | exclude_directory_name = meta["exclude_directory_name"] 161 | file_regex = [re.compile(r) for r in exclude_file_name] 162 | dir_regex = [re.compile(r) for r in exclude_directory_name] 163 | 164 | for obj in itr: 165 | obj_stats = {} 166 | 167 | full_path = obj.object_name 168 | full_path = obj.object_name 169 | 170 | if obj.object_name.endswith("/"): 171 | # TODO: Not sure what this means -- skip it? 
172 | # chunk[full_path] = {} 173 | continue 174 | 175 | # add object stat dict to the chunk dict 176 | obj_stats = { 177 | "is_link": False, 178 | "is_socket": False, 179 | "mtime": obj.last_modified.timestamp(), 180 | "ctime": obj.last_modified.timestamp(), 181 | "size": obj.size, 182 | } 183 | chunk[full_path] = obj_stats 184 | 185 | # Launch async job when enough objects are ready to be sync'd 186 | files_per_task = meta.get("files_per_task") 187 | if len(chunk) >= files_per_task: 188 | sync_files_meta = meta.copy() 189 | sync_files_meta["chunk"] = chunk 190 | sync_files_meta["queue_name"] = meta["file_queue"] 191 | enqueue_task(s3_bucket_sync_files, sync_files_meta) 192 | chunk.clear() 193 | 194 | if len(chunk) > 0: 195 | sync_files_meta = meta.copy() 196 | sync_files_meta["chunk"] = chunk 197 | sync_files_meta["queue_name"] = meta["file_queue"] 198 | enqueue_task(s3_bucket_sync_files, sync_files_meta) 199 | chunk.clear() 200 | 201 | except Exception as err: 202 | event_handler = custom_event_handler(meta) 203 | retry_countdown = event_handler.delay(self.request.retries + 1) 204 | max_retries = event_handler.max_retries() 205 | raise self.retry(max_retries=max_retries, exc=err, countdown=retry_countdown) 206 | 207 | 208 | @app.task(bind=True, base=IrodsTask) 209 | def s3_bucket_sync_dir(self, meta_input): 210 | meta = meta_input.copy() 211 | meta["entry_type"] = "dir" 212 | s3_bucket_sync_entry( 213 | self, meta, s3_bucket.sync_data_from_dir, s3_bucket.sync_metadata_from_dir 214 | ) 215 | 216 | 217 | @app.task(bind=True, base=IrodsTask) 218 | def s3_bucket_sync_files(self, meta_input): 219 | meta = meta_input.copy() 220 | meta["entry_type"] = "file" 221 | meta["task"] = "sync_file" 222 | for path, obj_stats in meta["chunk"].items(): 223 | meta["path"] = path 224 | meta["is_empty_dir"] = obj_stats.get("is_empty_dir") 225 | meta["is_link"] = obj_stats.get("is_link") 226 | meta["is_socket"] = obj_stats.get("is_socket") 227 | meta["mtime"] = obj_stats.get("mtime") 228 | meta["ctime"] = obj_stats.get("ctime") 229 | meta["size"] = obj_stats.get("size") 230 | s3_bucket_sync_entry( 231 | self, meta, s3_bucket.sync_data_from_file, s3_bucket.sync_metadata_from_file 232 | ) 233 | 234 | 235 | def s3_bucket_sync_entry(self, meta_input, datafunc, metafunc): 236 | meta = meta_input.copy() 237 | 238 | path = meta["path"] 239 | target = meta["target"] 240 | config = meta["config"] 241 | logging_config = config["log"] 242 | logger = sync_logging.get_sync_logger(logging_config) 243 | 244 | entry_type = meta["entry_type"] 245 | 246 | event_handler = custom_event_handler(meta) 247 | max_retries = event_handler.max_retries() 248 | 249 | lock = None 250 | 251 | logger.info("synchronizing " + entry_type + ". 
path = " + path) 252 | 253 | character_map = getattr(event_handler.get_module(), "character_map", None) 254 | path_requires_UnicodeEncodeError_handling = is_unicode_encode_error_path(path) 255 | 256 | # TODO: Pull out this logic into some functions 257 | if path_requires_UnicodeEncodeError_handling or character_map is not None: 258 | abspath = os.path.abspath(path) 259 | utf8_escaped_abspath = abspath.encode("utf8", "surrogateescape") 260 | b64_path_str = base64.b64encode(utf8_escaped_abspath) 261 | 262 | if path_requires_UnicodeEncodeError_handling: 263 | path = os.path.dirname(abspath) 264 | unicode_error_filename = "irods_UnicodeEncodeError_" + str( 265 | b64_path_str.decode("utf8") 266 | ) 267 | logger.warning( 268 | "s3_bucket_sync_entry raised UnicodeEncodeError while syncing path:" 269 | + str(utf8_escaped_abspath) 270 | ) 271 | meta["path"] = path 272 | meta["b64_path_str"] = b64_path_str 273 | meta["b64_reason"] = "UnicodeEncodeError" 274 | meta["unicode_error_filename"] = unicode_error_filename 275 | sync_key = str(b64_path_str.decode("utf8")) + ":" + target 276 | else: 277 | sync_key = path + ":" + target 278 | 279 | try: 280 | r = get_redis(config) 281 | lock = redis_lock.Lock(r, "sync_" + entry_type + ":" + sync_key) 282 | lock.acquire() 283 | 284 | sync_time_handle = sync_time_key_handle(r, sync_key) 285 | ignore_redis_cache = meta.get("ignore_cache", False) 286 | sync_time = None if ignore_redis_cache else sync_time_handle.get_value() 287 | 288 | mtime = meta["mtime"] 289 | ctime = meta["ctime"] 290 | 291 | if sync_time is not None and mtime < sync_time and ctime < sync_time: 292 | logger.info( 293 | "succeeded_" + entry_type + "_has_not_changed", 294 | task=meta["task"], 295 | path=path, 296 | ) 297 | return 298 | 299 | t = datetime.datetime.now().timestamp() 300 | logger.info( 301 | "synchronizing " + entry_type, path=path, t0=sync_time, t=t, ctime=ctime 302 | ) 303 | meta2 = meta.copy() 304 | if path == meta["root"]: 305 | if path_requires_UnicodeEncodeError_handling: 306 | # TODO(#250): This may not work on Windows... 307 | target2 = os.path.join(target, meta["unicode_error_filename"]) 308 | else: 309 | target2 = target 310 | else: 311 | # Strip prefix from S3 path 312 | prefix = meta["s3_prefix"] 313 | reg_path = path[path.index(prefix) + len(prefix) :].strip("/") 314 | # Construct S3 "logical path" 315 | target2 = "/".join([meta["target"], reg_path]) 316 | # Construct S3 "physical path" as: /bucket/objectname 317 | meta2["path"] = f"/{meta['root']}/{path}" 318 | 319 | # If the event handler has a character_map function, it should have returned a 320 | # structure (either a dict or a list/tuple of key-value tuples) to be used for 321 | # instantiating a collections.OrderedDict object. This object will dictate how 322 | # the logical path's characters are remapped. The re-mapping is performed 323 | # independently for each path element of the collection hierarchy. 
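# Illustrative example (hypothetical mapping and paths, not taken from this repository):
# with character_map() returning {" ": "_"}, a computed target such as
#     /tempZone/home/rods/my docs/report 1.txt
# is translated element by element to
#     /tempZone/home/rods/my_docs/report_1.txt
# and, because the logical name changed, b64_reason / b64_path_str_charmap are recorded
# below so the original name can be attached to the object as metadata.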
324 | 325 | if not path_requires_UnicodeEncodeError_handling and character_map is not None: 326 | translated_path = translate_path(target2, character_map()) 327 | # arrange for AVU to be attached only when logical name changes 328 | if translated_path != target2: 329 | target2 = translated_path 330 | meta2["b64_reason"] = "character_map" 331 | meta2["b64_path_str_charmap"] = b64_path_str 332 | 333 | meta2["target"] = target2 334 | 335 | if sync_time is None or mtime >= sync_time: 336 | datafunc(event_handler.get_module(), meta2, logger, True) 337 | logger.info("succeeded", task=meta["task"], path=path) 338 | else: 339 | metafunc(event_handler.get_module(), meta2, logger) 340 | logger.info("succeeded_metadata_only", task=meta["task"], path=path) 341 | sync_time_handle.set_value(str(t)) 342 | except Exception as err: 343 | event_handler = custom_event_handler(meta) 344 | retry_countdown = event_handler.delay(self.request.retries + 1) 345 | raise self.retry(max_retries=max_retries, exc=err, countdown=retry_countdown) 346 | finally: 347 | if lock is not None: 348 | lock.release() 349 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irods/irods_capability_automated_ingest/da87d89acab7136ac5610195b0cc5e5abd983795/irods_capability_automated_ingest/test/__init__.py -------------------------------------------------------------------------------- /irods_capability_automated_ingest/test/test_lib.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from irods.session import iRODSSession 4 | 5 | from irods_capability_automated_ingest.redis_utils import get_redis 6 | 7 | # This is a global in order to take advantage of "caching" the Redis configuration. 8 | # Modify get_redis_config if changes are needed. 9 | redis_config = {} 10 | 11 | 12 | # TODO(#286): Derive from the environment? 13 | def get_redis_config(host="redis", port=6379, db=0): 14 | global redis_config 15 | if redis_config: 16 | return redis_config 17 | redis_config = {"redis": {"host": host, "port": port, "db": db}} 18 | return redis_config 19 | 20 | 21 | def clear_redis(): 22 | get_redis(get_redis_config()).flushdb() 23 | 24 | 25 | def get_test_irods_client_environment_dict(): 26 | # TODO(#286): Derive from the environment? 
27 | return { 28 | "host": os.environ.get("IRODS_HOST"), 29 | "port": os.environ.get("IRODS_PORT"), 30 | "user": os.environ.get("IRODS_USER_NAME"), 31 | "zone": os.environ.get("IRODS_ZONE_NAME"), 32 | "password": os.environ.get("IRODS_PASSWORD"), 33 | } 34 | 35 | 36 | def irmtrash(): 37 | # TODO(irods/python-irodsclient#182): Needs irmtrash endpoint 38 | with iRODSSession(**get_test_irods_client_environment_dict()) as session: 39 | rods_trash_path = "/".join( 40 | ["", session.zone, "trash", "home", session.username] 41 | ) 42 | rods_trash_coll = session.collections.get(rods_trash_path) 43 | for coll in rods_trash_coll.subcollections: 44 | delete_collection_if_exists(coll.path, recurse=True, force=True) 45 | 46 | 47 | def delete_collection_if_exists(coll, recurse=True, force=False): 48 | with iRODSSession(**get_test_irods_client_environment_dict()) as session: 49 | if session.collections.exists(coll): 50 | session.collections.remove(coll, recurse=recurse, force=force) 51 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/test/test_s3_bucket_scan.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import io 4 | import os 5 | import signal 6 | import shutil 7 | import subprocess 8 | import tempfile 9 | import textwrap 10 | import time 11 | 12 | from irods.data_object import irods_dirname, irods_basename 13 | from irods.exception import CollectionDoesNotExist 14 | from irods.meta import iRODSMeta 15 | from irods.models import Collection, DataObject 16 | from irods.session import iRODSSession 17 | 18 | from irods_capability_automated_ingest.celery import app 19 | from irods_capability_automated_ingest.redis_utils import get_redis 20 | from irods_capability_automated_ingest.sync_job import sync_job 21 | from irods_capability_automated_ingest.utils import DeleteMode, Operation 22 | import irods_capability_automated_ingest.examples 23 | 24 | from minio import Minio 25 | 26 | from . import test_lib 27 | 28 | # TODO(#286): Derive from the environment? 29 | # This must be set as an environment variable in order for the Celery workers to communicate with the broker. 30 | # Update this value if the hostname, port, or database for the Redis service needs to change. 31 | os.environ["CELERY_BROKER_URL"] = "redis://redis:6379/0" 32 | 33 | 34 | def start_workers(n=2, args=[]): 35 | if not args: 36 | args = ["-l", "info", "-Q", "restart,path,file"] 37 | workers = subprocess.Popen( 38 | [ 39 | "celery", 40 | "-A", 41 | "irods_capability_automated_ingest", 42 | "worker", 43 | "-c", 44 | str(n), 45 | # This option is needed because the worker coordination takes too long for running the tests between 46 | # standing up the workers and their being ready to execute tasks. 47 | "--without-mingle", 48 | ] 49 | + args 50 | ) 51 | return workers 52 | 53 | 54 | def wait_for_job_to_finish(workers, job_name, timeout=60): 55 | r = get_redis(test_lib.get_redis_config()) 56 | t0 = time.time() 57 | while timeout is None or time.time() - t0 < timeout: 58 | restart = r.llen("restart") 59 | i = app.control.inspect() 60 | act = i.active() 61 | if act is None: 62 | active = 0 63 | else: 64 | active = sum(map(len, act.values())) 65 | job_done = sync_job(job_name, r).done() 66 | if restart != 0 or active != 0 or not job_done: 67 | time.sleep(1) 68 | else: 69 | return 70 | # If we escape the loop, that means the job timed out. 
71 | raise TimeoutError( 72 | f"Timed out after [{timeout}] seconds waiting for job [{job_name}] to complete." 73 | ) 74 | 75 | 76 | class test_s3_sync_operations(unittest.TestCase): 77 | @classmethod 78 | def setUpClass(cls): 79 | cls.restart_queue_name = "s3_sync_restart" 80 | cls.path_queue_name = "s3_sync_path" 81 | cls.file_queue_name = "s3_sync_file" 82 | test_lib.clear_redis() 83 | test_lib.irmtrash() 84 | cls.workers = start_workers( 85 | args=[ 86 | "-l", 87 | "info", 88 | "-Q", 89 | f"{cls.restart_queue_name},{cls.path_queue_name},{cls.file_queue_name}", 90 | ] 91 | ) 92 | cls.irods_session = iRODSSession( 93 | **test_lib.get_test_irods_client_environment_dict() 94 | ) 95 | cls.job_name = "test_s3_sync_job" 96 | # TODO(#286): Derive this from the environment... 97 | cls.s3_endpoint_domain = "minio:19000" 98 | # TODO(#286): Derive these from the environment... 99 | cls.s3_access_key = "irods" 100 | cls.s3_secret_key = "irodsadmin" 101 | f = tempfile.NamedTemporaryFile("w+t", delete=False) 102 | # TODO(#264): This will not work on Windows... 103 | f.write(f"{cls.s3_access_key}\n{cls.s3_secret_key}") 104 | f.close() 105 | cls.s3_keypair_path = f.name 106 | # Establish a connection with Minio that persists for every test 107 | cls.minio_client = Minio( 108 | cls.s3_endpoint_domain, 109 | access_key=cls.s3_access_key, 110 | secret_key=cls.s3_secret_key, 111 | secure=False, 112 | ) 113 | cls.bucket_name = "test-s3-put-sync-operation-bucket" 114 | cls.source_path = f"/{cls.bucket_name}" 115 | cls.minio_client.make_bucket(cls.bucket_name) 116 | cls.objects_list = { 117 | "/".join(["shallow_subfolder", "shallow_object.txt"]), 118 | "/".join(["deep_subfolder", "a", "b", "c", "object_c.txt"]), 119 | "/".join(["deep_subfolder", "x", "y", "z", "object_z.txt"]), 120 | "/".join(["top_level_object.txt"]), 121 | } 122 | 123 | @classmethod 124 | def tearDownClass(cls): 125 | test_lib.clear_redis() 126 | test_lib.irmtrash() 127 | cls.irods_session.cleanup() 128 | cls.workers.send_signal(signal.SIGINT) 129 | cls.workers.wait() 130 | cls.minio_client.remove_bucket(cls.bucket_name) 131 | 132 | def create_objects(self, objects_list): 133 | for obj in objects_list: 134 | # The prefix is everything between the bucket name and the "basename" of the object "path". 
135 | self.minio_client.put_object( 136 | self.bucket_name, obj, data=io.BytesIO(obj.encode()), length=len(obj) 137 | ) 138 | 139 | def setUp(self): 140 | self.create_objects(self.objects_list) 141 | self.destination_collection = "/".join( 142 | [ 143 | "", 144 | self.irods_session.zone, 145 | "home", 146 | self.irods_session.username, 147 | "s3_sync_collection", 148 | ] 149 | ) 150 | 151 | def tearDown(self): 152 | objects = list(self.minio_client.list_objects(self.bucket_name, recursive=True)) 153 | for obj in objects: 154 | self.minio_client.remove_object(self.bucket_name, obj.object_name) 155 | test_lib.delete_collection_if_exists( 156 | self.destination_collection, recurse=True, force=True 157 | ) 158 | 159 | @staticmethod 160 | def get_event_handler(operation): 161 | operation_strings = { 162 | Operation.NO_OP: "NO_OP", 163 | Operation.REGISTER_SYNC: "REGISTER_SYNC", 164 | Operation.REGISTER_AS_REPLICA_SYNC: "REGISTER_AS_REPLICA_SYNC", 165 | Operation.PUT: "PUT", 166 | Operation.PUT_SYNC: "PUT_SYNC", 167 | Operation.PUT_APPEND: "PUT_APPEND", 168 | } 169 | return textwrap.dedent( 170 | f""" 171 | from irods_capability_automated_ingest.core import Core 172 | from irods_capability_automated_ingest.utils import DeleteMode, Operation 173 | class event_handler(Core): 174 | @staticmethod 175 | def operation(session, meta, **options): 176 | return Operation.{operation_strings[operation]} 177 | """ 178 | ) 179 | 180 | def run_sync( 181 | self, 182 | source_path, 183 | destination_collection, 184 | event_handler_path, 185 | job_name=None, 186 | ignore_cache=False, 187 | files_per_task=1, 188 | log_level=None, 189 | queue_names=tuple(), 190 | expected_failure_count=None, 191 | ): 192 | sync_script = "irods_capability_automated_ingest.irods_sync" 193 | # Construct an invocation of the sync script with various options. 194 | command = [ 195 | "python", 196 | "-m", 197 | sync_script, 198 | "start", 199 | source_path, 200 | destination_collection, 201 | "--event_handler", 202 | event_handler_path, 203 | "--files_per_task", 204 | str(files_per_task), 205 | "--s3_keypair", 206 | self.s3_keypair_path, 207 | "--s3_endpoint_domain", 208 | self.s3_endpoint_domain, 209 | "--s3_insecure_connection", 210 | ] 211 | if ignore_cache: 212 | command.append("--ignore_cache") 213 | if log_level: 214 | command.extend(["--log_level", log_level]) 215 | # The test workers watch non-default queue names so that no other Celery workers which happen to be watching 216 | # the same Redis database will pick up the work. 217 | if not queue_names: 218 | queue_names = tuple( 219 | [self.restart_queue_name, self.path_queue_name, self.file_queue_name] 220 | ) 221 | command.extend(["--restart_queue", queue_names[0]]) 222 | command.extend(["--path_queue", queue_names[1]]) 223 | command.extend(["--file_queue", queue_names[2]]) 224 | # job_name is required so that we can track the sync job and its failed tasks even after it has completed. 225 | if not job_name: 226 | job_name = self.job_name 227 | command.extend(["--job_name", job_name]) 228 | # Now, schedule the job... 229 | proc = subprocess.Popen(command) 230 | proc.wait() 231 | # ...and then wait for the workers to complete the tasks. 232 | try: 233 | wait_for_job_to_finish(self.workers, job_name) 234 | except TimeoutError as e: 235 | self.fail(e) 236 | # Assert that the expected number of failed tasks for this job are found. A value of None means no tasks 237 | # failed for this job. 
238 |         self.assertEqual(
239 |             sync_job(job_name, get_redis(test_lib.get_redis_config()))
240 |             .failures_handle()
241 |             .get_value(),
242 |             expected_failure_count,
243 |         )
244 | 
245 |     def assert_ingested_contents_exist_in_irods(self):
246 |         for obj in self.objects_list:
247 |             self.assertTrue(
248 |                 self.irods_session.data_objects.exists(
249 |                     "/".join([self.destination_collection, obj])
250 |                 )
251 |             )
252 | 
253 |     def test_s3_with_put(self):
254 |         operation = Operation.PUT
255 |         new_object_name = "test_s3_with_put"
256 |         event_handler_contents = test_s3_sync_operations.get_event_handler(operation)
257 |         with tempfile.NamedTemporaryFile() as tf:
258 |             event_handler_path = tf.name
259 |             with open(event_handler_path, "w") as f:
260 |                 f.write(event_handler_contents)
261 |             # Run the first sync and confirm that everything was ingested properly.
262 |             self.run_sync(
263 |                 self.source_path, self.destination_collection, event_handler_path
264 |             )
265 |             self.assert_ingested_contents_exist_in_irods()
266 |             try:
267 |                 self.minio_client.put_object(
268 |                     self.bucket_name,
269 |                     new_object_name,
270 |                     data=io.BytesIO(new_object_name.encode()),
271 |                     length=len(new_object_name),
272 |                 )
273 |                 self.run_sync(
274 |                     self.source_path, self.destination_collection, event_handler_path
275 |                 )
276 |                 self.assert_ingested_contents_exist_in_irods()
277 |                 self.assertTrue(
278 |                     self.irods_session.data_objects.exists(
279 |                         "/".join([self.destination_collection, new_object_name])
280 |                     )
281 |                 )
282 |             finally:
283 |                 self.minio_client.remove_object(self.bucket_name, new_object_name)
284 | 
285 |     def test_s3_with_put_sync(self):
286 |         operation = Operation.PUT_SYNC
287 |         new_object_name = "test_s3_with_put_sync"
288 |         event_handler_contents = test_s3_sync_operations.get_event_handler(operation)
289 |         with tempfile.NamedTemporaryFile() as tf:
290 |             event_handler_path = tf.name
291 |             with open(event_handler_path, "w") as f:
292 |                 f.write(event_handler_contents)
293 |             # Run the first sync and confirm that everything was ingested properly.
294 |             self.run_sync(
295 |                 self.source_path, self.destination_collection, event_handler_path
296 |             )
297 |             self.assert_ingested_contents_exist_in_irods()
298 |             try:
299 |                 self.minio_client.put_object(
300 |                     self.bucket_name,
301 |                     new_object_name,
302 |                     data=io.BytesIO(new_object_name.encode()),
303 |                     length=len(new_object_name),
304 |                 )
305 |                 self.run_sync(
306 |                     self.source_path, self.destination_collection, event_handler_path
307 |                 )
308 |                 self.assert_ingested_contents_exist_in_irods()
309 |                 self.assertTrue(
310 |                     self.irods_session.data_objects.exists(
311 |                         "/".join([self.destination_collection, new_object_name])
312 |                     )
313 |                 )
314 |             finally:
315 |                 self.minio_client.remove_object(self.bucket_name, new_object_name)
316 | 
317 |     def test_s3_with_put_append(self):
318 |         operation = Operation.PUT_APPEND
319 |         new_object_name = "test_s3_with_put_append"
320 |         event_handler_contents = test_s3_sync_operations.get_event_handler(operation)
321 |         with tempfile.NamedTemporaryFile() as tf:
322 |             event_handler_path = tf.name
323 |             with open(event_handler_path, "w") as f:
324 |                 f.write(event_handler_contents)
325 |             # Run the first sync and confirm that everything was ingested properly.
326 | self.run_sync( 327 | self.source_path, self.destination_collection, event_handler_path 328 | ) 329 | self.assert_ingested_contents_exist_in_irods() 330 | try: 331 | self.minio_client.put_object( 332 | self.bucket_name, 333 | new_object_name, 334 | data=io.BytesIO(new_object_name.encode()), 335 | length=len(new_object_name), 336 | ) 337 | self.run_sync( 338 | self.source_path, self.destination_collection, event_handler_path 339 | ) 340 | self.assert_ingested_contents_exist_in_irods() 341 | self.assertTrue( 342 | self.irods_session.data_objects.exists( 343 | "/".join([self.destination_collection, new_object_name]) 344 | ) 345 | ) 346 | finally: 347 | self.minio_client.remove_object(self.bucket_name, new_object_name) 348 | 349 | def test_s3_with_register_sync(self): 350 | operation = Operation.REGISTER_SYNC 351 | new_object_name = "test_s3_with_register_sync" 352 | event_handler_contents = test_s3_sync_operations.get_event_handler(operation) 353 | with tempfile.NamedTemporaryFile() as tf: 354 | event_handler_path = tf.name 355 | with open(event_handler_path, "w") as f: 356 | f.write(event_handler_contents) 357 | # Run the first sync and confirm that everything was ingested properly. 358 | self.run_sync( 359 | self.source_path, self.destination_collection, event_handler_path 360 | ) 361 | self.assert_ingested_contents_exist_in_irods() 362 | try: 363 | self.minio_client.put_object( 364 | self.bucket_name, 365 | new_object_name, 366 | data=io.BytesIO(new_object_name.encode()), 367 | length=len(new_object_name), 368 | ) 369 | self.run_sync( 370 | self.source_path, self.destination_collection, event_handler_path 371 | ) 372 | self.assert_ingested_contents_exist_in_irods() 373 | self.assertTrue( 374 | self.irods_session.data_objects.exists( 375 | "/".join([self.destination_collection, new_object_name]) 376 | ) 377 | ) 378 | finally: 379 | self.minio_client.remove_object(self.bucket_name, new_object_name) 380 | 381 | def test_s3_with_register_as_replica_sync(self): 382 | operation = Operation.REGISTER_AS_REPLICA_SYNC 383 | new_object_name = "test_s3_with_register_as_replica_sync" 384 | event_handler_contents = test_s3_sync_operations.get_event_handler(operation) 385 | with tempfile.NamedTemporaryFile() as tf: 386 | event_handler_path = tf.name 387 | with open(event_handler_path, "w") as f: 388 | f.write(event_handler_contents) 389 | # Run the first sync and confirm that everything was ingested properly. 
390 | self.run_sync( 391 | self.source_path, self.destination_collection, event_handler_path 392 | ) 393 | self.assert_ingested_contents_exist_in_irods() 394 | try: 395 | self.minio_client.put_object( 396 | self.bucket_name, 397 | new_object_name, 398 | data=io.BytesIO(new_object_name.encode()), 399 | length=len(new_object_name), 400 | ) 401 | self.run_sync( 402 | self.source_path, self.destination_collection, event_handler_path 403 | ) 404 | self.assert_ingested_contents_exist_in_irods() 405 | self.assertTrue( 406 | self.irods_session.data_objects.exists( 407 | "/".join([self.destination_collection, new_object_name]) 408 | ) 409 | ) 410 | finally: 411 | self.minio_client.remove_object(self.bucket_name, new_object_name) 412 | 413 | def test_register_to_deep_nonexistent_subcollection_does_not_hang_forever__issue_124( 414 | self, 415 | ): 416 | operation = Operation.REGISTER_SYNC 417 | event_handler_contents = test_s3_sync_operations.get_event_handler(operation) 418 | # The destination collection needs to have enough path elements to exceed the number of path elements in 419 | # the "path" to the S3 object. 420 | nested_destination_collection = "/".join( 421 | [self.destination_collection, "a", "b", "c", "d", "e"] 422 | ) 423 | with tempfile.NamedTemporaryFile() as tf: 424 | event_handler_path = tf.name 425 | with open(event_handler_path, "w") as f: 426 | f.write(event_handler_contents) 427 | # Run the first sync and confirm that everything was ingested properly. 428 | self.run_sync( 429 | self.source_path, nested_destination_collection, event_handler_path 430 | ) 431 | for obj in self.objects_list: 432 | self.assertTrue( 433 | self.irods_session.data_objects.exists( 434 | "/".join([nested_destination_collection, obj]) 435 | ) 436 | ) 437 | 438 | 439 | def main(): 440 | unittest.main() 441 | 442 | 443 | if __name__ == "__main__": 444 | main() 445 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/utils.py: -------------------------------------------------------------------------------- 1 | from . 
import sync_logging 2 | from .sync_job import sync_job 3 | from .custom_event_handler import custom_event_handler 4 | from uuid import uuid1 5 | 6 | from enum import Enum 7 | import os 8 | import stat 9 | 10 | 11 | class Operation(Enum): 12 | REGISTER_SYNC = 0 13 | REGISTER_AS_REPLICA_SYNC = 1 14 | PUT = 2 15 | PUT_SYNC = 3 16 | PUT_APPEND = 4 17 | NO_OP = 5 18 | 19 | 20 | class DeleteMode(Enum): 21 | DO_NOT_DELETE = 0 22 | UNREGISTER = 1 23 | TRASH = 2 24 | NO_TRASH = 3 25 | 26 | 27 | def delete_mode_is_compatible_with_operation(delete_mode, operation): 28 | operation_to_acceptable_delete_modes = { 29 | Operation.NO_OP: [ 30 | DeleteMode.DO_NOT_DELETE, 31 | ], 32 | Operation.REGISTER_SYNC: [ 33 | DeleteMode.DO_NOT_DELETE, 34 | DeleteMode.UNREGISTER, 35 | ], 36 | Operation.REGISTER_AS_REPLICA_SYNC: [ 37 | DeleteMode.DO_NOT_DELETE, 38 | DeleteMode.UNREGISTER, 39 | ], 40 | Operation.PUT: [ 41 | DeleteMode.DO_NOT_DELETE, 42 | ], 43 | Operation.PUT_SYNC: [ 44 | DeleteMode.DO_NOT_DELETE, 45 | DeleteMode.TRASH, 46 | DeleteMode.NO_TRASH, 47 | ], 48 | Operation.PUT_APPEND: [ 49 | DeleteMode.DO_NOT_DELETE, 50 | DeleteMode.TRASH, 51 | DeleteMode.NO_TRASH, 52 | ], 53 | } 54 | return delete_mode in operation_to_acceptable_delete_modes.get(operation, []) 55 | 56 | 57 | def enqueue_task(task, meta): 58 | logger = sync_logging.get_sync_logger(meta["config"]["log"]) 59 | job = sync_job.from_meta(meta) 60 | if job.stop_handle().get_value() is None: 61 | logger.info( 62 | "incr_job_name", task=meta["task"], path=meta["path"], job_name=job.name() 63 | ) 64 | job.tasks_handle().incr() 65 | task_id = str(uuid1()) 66 | timeout = custom_event_handler(meta).timeout() 67 | job.count_handle().rpush(task_id) 68 | task.s(meta).apply_async( 69 | queue=meta["queue_name"], task_id=task_id, soft_time_limit=timeout 70 | ) 71 | else: 72 | # A job by this name is currently being stopped 73 | logger.info( 74 | "async_job_name_stopping", 75 | task=meta["task"], 76 | path=meta["path"], 77 | job_name=job.name(), 78 | ) 79 | 80 | 81 | # Attempt to encode full physical path on local filesystem 82 | # Special handling required for non-encodable strings which raise UnicodeEncodeError 83 | def is_unicode_encode_error_path(path): 84 | try: 85 | _ = path.encode("utf8") 86 | except UnicodeEncodeError: 87 | return True 88 | return False 89 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/version.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.6.0" 2 | -------------------------------------------------------------------------------- /profile/README.md: -------------------------------------------------------------------------------- 1 | === log profile === 2 | 3 | `--profile` `--profile_filename ` `--profile_level INFO` 4 | 5 | === elasticsearch === 6 | 7 | `config/elasticsearch.yml` 8 | 9 | add 10 | 11 | ``` 12 | http.cors.enabled: true 13 | http.cors.allow-origin: "*" 14 | ``` 15 | 16 | === ingest === 17 | 18 | ``` 19 | pip install elasticsearch 20 | ``` 21 | 22 | ``` 23 | python profile.py [ --elasticsearch_host ] [ --additional_key ] 24 | ``` 25 | 26 | === visualize === 27 | 28 | firefox profile.html 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /profile/profile.css: -------------------------------------------------------------------------------- 1 | .vis-item.sync_file { 2 | background-color: #ff8888; 3 | border-color: red; 4 | } 5 | 6 | 7 | 
.vis-item.sync_dir { 8 | background-color: #88ff88; 9 | border-color: green; 10 | } 11 | 12 | .vis-item.sync_path { 13 | background-color: #8888ff; 14 | border-color: blue; 15 | } 16 | 17 | .vis-item.restart { 18 | background-color: grey; 19 | border-color: yellow; 20 | } 21 | 22 | .vis-item.list_dir { 23 | background-color: #ff8800; 24 | border-color: orange; 25 | } -------------------------------------------------------------------------------- /profile/profile.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | index: 13 |
14 | min: 15 | 16 | max: 17 | 18 |
19 | start: 20 | duration (ms): 21 | #events: 22 | 23 |
24 | 25 | 26 | -------------------------------------------------------------------------------- /profile/profile.js: -------------------------------------------------------------------------------- 1 | function drawChart(){ 2 | let startDate = new Date(document.getElementById('start').value) 3 | if (isNaN(startDate)) 4 | startDate = undefined 5 | let duration = parseInt(document.getElementById('duration').value) 6 | let finishDate = startDate == undefined || duration == undefined ? undefined : new Date(startDate.valueOf() + duration) 7 | let index = document.getElementById('index').value; 8 | if (isNaN(finishDate)) 9 | finishDate = undefined 10 | drawChart2(index, startDate, finishDate); 11 | } 12 | 13 | function getMinAndMaxDate() { 14 | var index = document.getElementById('index').value 15 | let json = { 16 | aggs: { 17 | minDate : { 18 | min : { 19 | field : "start" 20 | } 21 | }, 22 | maxDate : { 23 | max : { 24 | field : "finish" 25 | } 26 | } 27 | } 28 | } 29 | $.ajax({ 30 | type: "POST", 31 | contentType: "application/json", 32 | dataType: "json", 33 | url: "http://localhost:9200/" + index + "/_search?size=0", 34 | data: JSON.stringify(json) 35 | }).done(results => { 36 | let minDate = document.getElementById("minDate") 37 | let maxDate = document.getElementById("maxDate") 38 | minDate.innerHTML = results["aggregations"]["minDate"]["value_as_string"] 39 | maxDate.innerHTML = results["aggregations"]["maxDate"]["value_as_string"] 40 | }) 41 | 42 | } 43 | 44 | function getMin() { 45 | let minDate = document.getElementById("minDate") 46 | return minDate.innerHTML 47 | } 48 | 49 | function getMax() { 50 | let minDate = document.getElementById("maxDate") 51 | return minDate.innerHTML 52 | } 53 | 54 | function setStart(value) { 55 | let startDate = document.getElementById('start') 56 | startDate.value = value 57 | } 58 | 59 | function setFinish(value) { 60 | let startDate = document.getElementById('finish') 61 | startDate.value = value 62 | } 63 | 64 | function groupName(obj) { 65 | let index = obj["index"] 66 | let indexString = "" 67 | if (index < 10) { 68 | indexString = "0" + index 69 | } else { 70 | indexString = "" + index 71 | } 72 | return obj["hostname"]+"/" + indexString 73 | } 74 | 75 | function drawChart2(index, startDate, finishDate) { 76 | const batchsize = 10000 77 | const hits = [] 78 | const json = { 79 | size: batchsize, 80 | query: { 81 | bool: { 82 | should: [ 83 | { 84 | range:{ 85 | start: { 86 | gte: startDate, 87 | lte: finishDate 88 | } 89 | } 90 | }, { 91 | range: { 92 | finish: { 93 | gte: startDate, 94 | lte: finishDate 95 | } 96 | } 97 | } 98 | ], 99 | minimum_should_match: 1 100 | } 101 | } 102 | } 103 | 104 | const handleResults = (sid, remaining, data) => { 105 | data.forEach(h => { 106 | hits.push(h["_source"]) 107 | remaining-- 108 | }) 109 | if(remaining !== 0) { 110 | scroll(sid, remaining) 111 | } else { 112 | showTable(startDate, finishDate, hits) 113 | } 114 | } 115 | 116 | const scroll = (sid, remaining) => { 117 | let json = { 118 | scroll: "1m", 119 | scroll_id: sid 120 | } 121 | $.ajax({ 122 | type: "POST", 123 | contentType: "application/json", 124 | dataType: "json", 125 | url: "http://localhost:9200/_search/scroll", 126 | data: JSON.stringify(json) 127 | }).done(results => { 128 | handleResults(results["_scroll_id"], remaining, results["hits"]["hits"]) 129 | }).fail((a,b,c) => { 130 | console.log(b) 131 | console.log(c) 132 | }) 133 | } 134 | 135 | $.ajax({ 136 | type: "POST", 137 | contentType: "application/json", 138 | dataType: "json", 139 | 
url: "http://localhost:9200/" + index + "/_search?scroll=1m", 140 | data: JSON.stringify(json) 141 | }).done(results => { 142 | const data = results["hits"] 143 | const total = data["total"] 144 | handleResults(results["_scroll_id"], total, data["hits"]) 145 | }).fail((a,b,c) => { 146 | console.log(b) 147 | console.log(c) 148 | }) 149 | } 150 | 151 | function showTable(startDate, finishDate, hits){ 152 | const container = document.getElementById("visualization") 153 | const groupNames0 = new Set() 154 | 155 | hits.forEach(obj => { 156 | groupNames0.add(groupName(obj)) 157 | }) 158 | 159 | const groupNames = Array.from(groupNames0).sort() 160 | const groups = new vis.DataSet() 161 | const groupMap = {} 162 | for(let g = 0; g < groupNames.length; g++) { 163 | groups.add({id: g, content: groupNames[g]}) 164 | groupMap[groupNames[g]] = g 165 | } 166 | 167 | const colorMap = {} 168 | colorMap["irods_capability_automated_ingest.sync_task.sync_file"] = 'sync_file'; 169 | colorMap["irods_capability_automated_ingest.sync_task.sync_dir"] = 'sync_dir'; 170 | colorMap["irods_capability_automated_ingest.sync_task.sync_path"] = 'sync_path'; 171 | colorMap["irods_capability_automated_ingest.sync_task.restart"] = 'restart'; 172 | colorMap["list_dir"] = 'list_dir'; 173 | 174 | let count = hits.length 175 | document.getElementById("numEvents").innerHTML = count 176 | 177 | const items = new vis.DataSet() 178 | hits.forEach((obj, index) => { 179 | let task_id = obj["event_id"] 180 | let task_name = obj["event_name"] 181 | let start=obj["start"] 182 | let finish=obj["finish"] 183 | let path=obj["path"] 184 | let target=obj["target"] 185 | let taskStartDate = new Date(start) 186 | let taskEndDate = new Date(finish) 187 | items.add({ 188 | id: index, 189 | group: groupMap[groupName(obj)], 190 | content: task_id, 191 | title: `${task_id}
path: ${path}
target: ${target}
start: ${taskStartDate}
finish: ${taskEndDate}`, 192 | start: taskStartDate, 193 | end: taskEndDate, 194 | className: colorMap[task_name] 195 | }) 196 | }) 197 | 198 | let options = { 199 | tooltip: { 200 | overflowMethod: "cap" 201 | }, 202 | moveable: true, 203 | zoomable: true, 204 | selectable: false, 205 | showCurrentTime: false, 206 | stack: false, 207 | groupOrder: "content" 208 | } 209 | if(startDate !== undefined) { 210 | options["min"] = startDate 211 | options["start"] = startDate 212 | options["end"] = new Date(startDate.valueOf() + 1000) 213 | } 214 | if(finishDate !== undefined) { 215 | options["max"] = finishDate 216 | } 217 | 218 | container.innerHTML = "" 219 | let timeline = new vis.Timeline(container) 220 | timeline.setOptions(options) 221 | timeline.setGroups(groups) 222 | timeline.setItems(items) 223 | } 224 | -------------------------------------------------------------------------------- /profile/profile.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | from elasticsearch import Elasticsearch 4 | from elasticsearch.helpers import bulk 5 | import argparse 6 | 7 | parser = argparse.ArgumentParser(description='Ingest profile data into Elasticsearch') 8 | parser.add_argument('input_file', metavar='INPUT FILE', type=str, 9 | help='input file') 10 | parser.add_argument('--elasticsearch_host', metavar='ELASTICSEARCH HOST', type=str, default="localhost", 11 | help='elasticsearch host') 12 | parser.add_argument('elasticsearch_index', metavar='ELASTICSEARCH INDEX', type=str, 13 | help='elasticsearch index') 14 | parser.add_argument('--additional_key', dest='keys', action='store', nargs="*", default=[], 15 | help='additional key') 16 | 17 | args = parser.parse_args() 18 | 19 | input_file = args.input_file 20 | keys = args.keys 21 | output = args.elasticsearch_host 22 | index = args.elasticsearch_index 23 | 24 | es = Elasticsearch(output) 25 | 26 | try: 27 | es.indices.create(index, body={ 28 | "mappings": { 29 | "document": { 30 | "properties": { 31 | "hostname": { 32 | "type": "keyword" 33 | } 34 | } 35 | } 36 | } 37 | }) 38 | except Exception as e: 39 | print(e) 40 | 41 | def task_action(): 42 | 43 | task_buf = {} 44 | task_counter = {} 45 | 46 | i = 0 47 | with open(input_file, "r") as f: 48 | 49 | line = f.readline().rstrip("\n") 50 | while line != "": 51 | obj = json.loads(line) 52 | 53 | event_id = obj["event_id"] 54 | # print(obj) 55 | buf = task_buf.get(event_id) 56 | if buf is None: 57 | task_buf[event_id] = obj 58 | else: 59 | del task_buf[event_id] 60 | if obj["event"] == "task_prerun": 61 | start = obj["@timestamp"] 62 | finish = buf["@timestamp"] 63 | else: 64 | start = buf["@timestamp"] 65 | finish = obj["@timestamp"] 66 | 67 | event_name = obj["event_name"] 68 | di = { 69 | "start": start, 70 | "finish": finish, 71 | "hostname": obj["hostname"], 72 | "index": obj["index"], 73 | "event_name": event_name, 74 | "event_id": obj["event_id"], 75 | "path": obj.get("path"), 76 | "target": obj.get("target") 77 | } 78 | 79 | for key in keys: 80 | di[key] = obj[key] 81 | 82 | d = { 83 | "_index": index, 84 | "_type": "document", 85 | "_source": di 86 | } 87 | i += 1 88 | print(i) 89 | if event_name in task_counter: 90 | task_counter[event_name] += 1 91 | else: 92 | task_counter[event_name] = 1 93 | yield d 94 | line = f.readline().rstrip("\n") 95 | if len(task_buf) != 0: 96 | print(task_buf) 97 | 98 | print(task_counter) 99 | 100 | 101 | bulk(es, task_action()) 102 | 
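
profile.py above pairs the `task_prerun`/`task_postrun` records written when the ingest tool runs with the `--profile` options described in profile/README.md, matching them on `event_id` and indexing one Elasticsearch document per task. Per its argparse setup it is invoked as `python profile.py <profile log> <elasticsearch index> [--elasticsearch_host <host>] [--additional_key <key> ...]`. The sketch below illustrates that pairing; every field value is invented for illustration and is not taken from a real profile log.

```python
# Illustrative sketch only: the record values below are made up. Real records
# come from the ingest tool's --profile log; profile.py matches them on event_id.
import json

prerun = {
    "event": "task_prerun",
    "event_id": "0f2c9a7e-example",
    "event_name": "irods_capability_automated_ingest.sync_task.sync_file",
    "@timestamp": "2024-10-14T12:00:00.000Z",
    "hostname": "worker-1",
    "index": 3,
    "path": "/data/landing_zone/file1.dat",
    "target": "/tempZone/home/rods/landing_zone/file1.dat",
}
postrun = dict(prerun)
postrun["event"] = "task_postrun"
postrun["@timestamp"] = "2024-10-14T12:00:01.250Z"

# task_action() buffers the first record it sees for an event_id; when the
# partner record arrives, it emits one document whose start/finish come from
# the two @timestamp values, plus the fields profile.js uses for grouping.
document = {
    "start": prerun["@timestamp"],
    "finish": postrun["@timestamp"],
    "hostname": postrun["hostname"],
    "index": postrun["index"],
    "event_name": postrun["event_name"],
    "event_id": postrun["event_id"],
    "path": postrun.get("path"),
    "target": postrun.get("target"),
}
print(json.dumps(document, indent=2))
```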
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | amqp==5.2.0 2 | argon2-cffi==23.1.0 3 | argon2-cffi-bindings==21.2.0 4 | billiard==4.2.1 5 | celery==5.4.0 6 | certifi==2024.8.30 7 | cffi==1.17.1 8 | click==8.1.7 9 | click-didyoumean==0.3.1 10 | click-plugins==1.1.1 11 | click-repl==0.3.0 12 | defusedxml==0.7.1 13 | irods-capability-automated-ingest==0.6.0 14 | kombu==5.4.2 15 | minio==7.2.10 16 | prettytable==3.11.0 17 | progressbar2==4.5.0 18 | prompt_toolkit==3.0.48 19 | pycparser==2.22 20 | pycryptodome==3.21.0 21 | python-dateutil==2.9.0.post0 22 | python-irodsclient==2.2.0 23 | python-redis-lock==4.0.0 24 | python-utils==3.9.0 25 | redis==4.6.0 26 | six==1.16.0 27 | structlog==24.4.0 28 | typing_extensions==4.12.2 29 | tzdata==2024.2 30 | urllib3==2.2.3 31 | vine==5.1.0 32 | wcwidth==0.2.13 33 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import codecs 3 | from os import path 4 | 5 | # Get package version 6 | version = {} 7 | here = path.abspath(path.dirname(__file__)) 8 | with open(path.join(here, "irods_capability_automated_ingest/version.py")) as f: 9 | exec(f.read(), version) 10 | 11 | # Get the long description from the README file 12 | with codecs.open(path.join(here, "README.md"), "r", "utf-8") as f: 13 | long_description = f.read() 14 | 15 | setup( 16 | name="irods-capability-automated-ingest", 17 | version=version["__version__"], 18 | description="Implement filesystem scanners and landing zones", 19 | long_description=long_description, 20 | long_description_content_type="text/markdown", 21 | url="https://github.com/irods/irods_capability_automated_ingest", 22 | author="iRODS Consortium", 23 | author_email="support@irods.org", 24 | license="BSD", 25 | python_requires=">=3.8,", 26 | classifiers=[ 27 | "Development Status :: 4 - Beta", 28 | "License :: OSI Approved :: BSD License", 29 | "Natural Language :: English", 30 | "Operating System :: POSIX :: Linux", 31 | "Programming Language :: Python", 32 | "Programming Language :: Python :: 3 :: Only", 33 | "Programming Language :: Python :: 3", 34 | "Programming Language :: Python :: 3.8", 35 | "Programming Language :: Python :: 3.9", 36 | "Programming Language :: Python :: 3.10", 37 | "Programming Language :: Python :: 3.11", 38 | "Programming Language :: Python :: 3.12", 39 | ], 40 | keywords="irods automated ingest landingzone filesystem", 41 | packages=find_packages(), 42 | install_requires=[ 43 | "minio", 44 | "python-irodsclient<3.0.0", 45 | "python-redis-lock>=3.2.0", 46 | "redis>=3.4.1, <5.0.0", 47 | "celery[redis]<6.0.0", 48 | "structlog>=18.1.0", 49 | "progressbar2", 50 | ], 51 | setup_requires=["setuptools>=38.6.0"], 52 | entry_points={ 53 | "console_scripts": [ 54 | "irods_capability_automated_ingest=irods_capability_automated_ingest.irods_sync:main" 55 | ], 56 | }, 57 | project_urls={ 58 | "Bug Reports": "https://github.com/irods/irods_capability_automated_ingest/issues", 59 | "Source": "https://github.com/irods/irods_capability_automated_ingest", 60 | }, 61 | ) 62 | 
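
Tying the pieces above together: utils.py defines the `Operation` and `DeleteMode` enums and `delete_mode_is_compatible_with_operation`, and an ingest job is customized by an event handler module like those under irods_capability_automated_ingest/examples/. Below is a minimal sketch of such a module. The `operation` signature follows the handler generated in test_s3_bucket_scan.py; `to_resource` is modeled on the `*_with_resc_name.py` example filenames, whose contents are not reproduced in this listing, so treat its exact signature as an assumption, and the resource name `demoResc` is a placeholder.

```python
# Minimal event handler sketch. "demoResc" is a placeholder resource name, and
# to_resource() is an assumed hook modeled on examples/*_with_resc_name.py
# (those files are not shown above).
from irods_capability_automated_ingest.core import Core
from irods_capability_automated_ingest.utils import DeleteMode, Operation


class event_handler(Core):
    @staticmethod
    def operation(session, meta, **options):
        # PUT_SYNC uploads new files and re-uploads changed ones; utils.py
        # marks it as compatible with the TRASH and NO_TRASH delete modes.
        return Operation.PUT_SYNC

    @staticmethod
    def to_resource(session, meta, **options):
        return "demoResc"
```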
--------------------------------------------------------------------------------
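
Finally, a sketch of scheduling a sync job against a local directory, mirroring how `run_sync()` in test_s3_bucket_scan.py drives the tool (minus the S3-specific flags). The source directory, destination collection, event handler path, and job name are placeholders, and, as in the tests, `CELERY_BROKER_URL` must point the scheduler and the Celery workers at the same Redis broker.

```python
# Illustrative sketch: all paths and names are placeholders. The flags mirror
# those assembled by run_sync() in test_s3_bucket_scan.py above.
import subprocess

command = [
    "python",
    "-m",
    "irods_capability_automated_ingest.irods_sync",
    "start",
    "/data/landing_zone",                # source directory to scan
    "/tempZone/home/rods/landing_zone",  # destination iRODS collection
    "--event_handler",
    "/path/to/event_handler.py",
    "--job_name",
    "landing_zone_sync",
    "--files_per_task",
    "50",
    "--log_level",
    "INFO",
]
subprocess.run(command, check=True)
```

setup.py also installs a console script named `irods_capability_automated_ingest` pointing at the same `irods_sync:main` entry point, so once the package is installed the `python -m ...` prefix can be replaced by that command.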