├── .gitignore ├── AUTHORS ├── CHANGELOG.md ├── LICENSE.txt ├── MANIFEST.in ├── README.md ├── capability_automated_ingest_filesystem_scanner.jpg ├── capability_automated_ingest_landing_zone.jpg ├── docker ├── demo │ ├── README.md │ ├── compose.yaml │ ├── ingest_celery_workers │ │ └── Dockerfile │ ├── irods_catalog │ │ ├── Dockerfile │ │ └── init-user-db.sh │ └── irods_catalog_provider │ │ ├── Dockerfile │ │ ├── entrypoint.sh │ │ └── setup.input └── ingest-test │ ├── README.md │ ├── docker-compose.yml │ ├── icat │ ├── Dockerfile │ └── postgres_init.sh │ ├── icommands.env │ ├── provider │ ├── Dockerfile │ ├── db_commands.txt │ ├── irods_4.2_provider.input │ ├── irods_4.3_provider.input │ └── start_provider.sh │ ├── run_tests.sh │ └── test │ ├── Dockerfile │ ├── Dockerfile.pure │ ├── irods_environment.json │ └── run_tests.sh ├── irods_capability_automated_ingest ├── __init__.py ├── celery.py ├── char_map_util.py ├── core.py ├── custom_event_handler.py ├── examples │ ├── __init__.py │ ├── append.py │ ├── append_non_leaf_non_root_with_resc_name.py │ ├── append_root_with_resc_name.py │ ├── append_with_resc_name.py │ ├── coll_create_pre_and_post.py │ ├── coll_modify_pre_and_post.py │ ├── data_obj_create_pre_and_post.py │ ├── data_obj_modify_pre_and_post.py │ ├── metadata.py │ ├── no_op.py │ ├── no_retry.py │ ├── post_job.py │ ├── pre_job.py │ ├── put.py │ ├── put_non_leaf_non_root_with_resc_name.py │ ├── put_root_with_resc_name.py │ ├── put_using_char_map.py │ ├── put_with_resc_name.py │ ├── register.py │ ├── register_non_leaf_non_root_with_resc_name.py │ ├── register_root_with_resc_name.py │ ├── register_using_char_map.py │ ├── register_with_peps.py │ ├── register_with_resc_name.py │ ├── replica_root_with_resc_name.py │ ├── replica_with_non_leaf_non_root_resc_name.py │ ├── replica_with_resc_name.py │ ├── retry.py │ ├── statistics.py │ ├── sync.py │ ├── sync_non_leaf_non_root_with_resc_name.py │ ├── sync_retry.py │ ├── sync_root_with_resc_name.py │ ├── sync_with_resc_name.py │ └── timeout.py ├── irods │ ├── __init__.py │ ├── filesystem.py │ ├── irods_utils.py │ └── s3_bucket.py ├── irods_sync.py ├── redis_key.py ├── redis_utils.py ├── sync_actions.py ├── sync_job.py ├── sync_logging.py ├── task_queue.py ├── tasks │ ├── __init__.py │ ├── delete_tasks.py │ ├── filesystem_tasks.py │ ├── irods_task.py │ └── s3_bucket_tasks.py ├── test │ ├── __init__.py │ ├── test_delete_modes.py │ ├── test_irods_sync.py │ ├── test_lib.py │ └── test_s3_bucket_scan.py ├── utils.py └── version.py ├── profile ├── README.md ├── profile.css ├── profile.html ├── profile.js └── profile.py ├── requirements.txt ├── setup.cfg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *egg-info/ 3 | cscope.* 4 | tags 5 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Terrell Russell maintains this project for the iRODS Consortium. 2 | 3 | Hao Xu wrote the first implementation. 4 | Antoine de Torcy wrote the prototype. 5 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 
4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), 6 | and this project **only** adheres to the following _(as defined at [Semantic Versioning](https://semver.org/spec/v2.0.0.html))_: 7 | 8 | > Given a version number MAJOR.MINOR.PATCH, increment the: 9 | > 10 | > - MAJOR version when you make incompatible API changes 11 | > - MINOR version when you add functionality in a backward compatible manner 12 | > - PATCH version when you make backward compatible bug fixes 13 | 14 | ## [0.6.0] - 2024-10-14 15 | 16 | This release adds the ability to delete data objects and collections from iRODS which DO NOT exist in the source being ingested. The feature exposes two new events and an event handler method for controlling the delete mode. 17 | 18 | ### Changed 19 | 20 | - Improve documentation (#23, #137, #140, #150, #183, #214, #257, #289). 21 | - Display more information about jobs in `list` subcommand output (#91). 22 | - Improve testing, Docker demo, and code formatting (#100, #132, #180). 23 | - Replace Celery application and tasks (#211). 24 | - Remove /tmp mount directory from Docker test harness (#235). 25 | - Bump PRC dependency version to <3.0.0 (#263). 26 | - Refactor and clean up code (#180, #272, #274). 27 | - Bump Celery dependency version to <6.0.0 (#266). 28 | - Improve handling of irods_session (#269). 29 | - Disable mingling for Celery works in tests (#280). 30 | - Replace use of "sync" with "tasks" in Celery tasks names (#281). 31 | 32 | ### Removed 33 | 34 | - Remove --append_json option (#60). 35 | - Remove unnecessary directories and files (#245, #246, #247, #248, #262). 36 | 37 | ### Fixed 38 | 39 | - Handle KeyboardInterrupt for `watch` subcommand (#93). 40 | - Use logical path for redis_lock key in create_dirs (#124). 41 | - Do not allow data transfers to redirect by default (#276). 42 | - Do not add unreadable files to sync chunks (#277). 43 | - Restore syncs from S3 bucket to iRODS (#285). 44 | - Fix periodic task name (#293). 45 | 46 | ### Added 47 | 48 | - Add Delete mode (#48, #261, #288). 49 | - Track start time of sync jobs (#92). 50 | - Track jobs stopped by "irods_sync stop" (#210). 51 | - Add Docker Compose project for testing (#244). 52 | 53 | ## [v0.5.0] - 2024-07-17 54 | 55 | This release adds more functionality when scanning an S3 bucket 56 | and updates the testing harness and a number of dependencies. 57 | 58 | Note: The signatures for all pre/post event handler methods 59 | (e.g. `pre_data_obj_create`) have been changed to include an 60 | `*args` parameter. 
Any existing event handler files will need 61 | to be updated by inserting the new parameter just before the 62 | `**options` keyword argument: 63 | 64 | ```diff 65 | @staticmethod 66 | - def pre_data_obj_create(hdlr_mod, logger, session, meta, **options): 67 | + def pre_data_obj_create(hdlr_mod, logger, session, meta, *args, **options): 68 | ``` 69 | 70 | - [#180] Add tags to gitignore 71 | - [#219] Add tests for pre/post event handler methods 72 | - [#219] Add *args to all example pre/post event handler methods 73 | - [#180] Rename 'syncer' to 'scanner_instance' 74 | - Revert "[#219] Add 'op' and 'scanner' to meta" 75 | - Revert "[#180] Rename 'syncer' to 'scanner_instance'" 76 | - Bump certifi from 2023.7.22 to 2024.7.4 77 | - [#180] Rename 'syncer' to 'scanner_instance' 78 | - [#219] Add 'op' and 'scanner' to meta 79 | - [#222] Use %-formatting in log statement 80 | - Bump urllib3 from 1.26.18 to 1.26.19 81 | - [#232] Update deployment instructions in README 82 | - [#216] Remove non-Compose test instructions 83 | - [#174] Update Redis instructions in README 84 | - Bump werkzeug from 2.3.8 to 3.0.3 85 | - Bump jinja2 from 3.1.3 to 3.1.4 86 | - Bump certifi from 2022.12.7 to 2023.7.22 87 | - Bump urllib3 from 1.26.5 to 1.26.18 88 | - Bump jinja2 from 2.11.3 to 3.1.3 89 | - Bump flask from 1.0.2 to 2.2.5 90 | - Bump werkzeug from 2.2.3 to 2.3.8 91 | - Bump redis from 2.10.6 to 4.4.4 92 | - [#215] Fix test failures 93 | - [#220] Update test environment 94 | - [#180] Update supported Python versions 95 | - [#180] black formatter - no functional changes 96 | - [#212] changed REGISTER to REGISTER_SYNC 97 | - [#207] multi read and write from S3 to iRODS for put, putsync 98 | - [#129] Added functionality for PUT, PUT_SYNC with S3 via Minio 99 | - [#129] put_sync functionality for data in S3 100 | 101 | ## [v0.4.2] - 2023-06-26 102 | 103 | This release fixes the exclude and post_job behavior 104 | and updates two dependencies. 105 | 106 | - [#200] Add --exclude_file_type test 107 | - [#201] Amend test for post_job 108 | - [#195] apply CELERY_BROKER_URL env var globally to tests 109 | - [#198] update to Python 3.11 in docker test suite 110 | - [#201] Fix job done condition 111 | - [#200] Fix exclude_file_name/exclude_file_type 112 | - [#200] Add test for --exclude_file_name 113 | - Bump certifi from 2018.11.29 to 2022.12.7 114 | - Bump werkzeug from 0.15.3 to 2.2.3 115 | 116 | ## [v0.4.1] - 2023-03-26 117 | 118 | This release fixes an exit code bug and adds a 119 | character_map event handler method. 120 | 121 | - [#188] eliminate exit call in check_event_handler 122 | - [#40][#166] tests work for unicodeEncodeError and char_map put/register 123 | - [#166] implement object path character remapping (with AVU hints) 124 | - [#180] add .gitignore 125 | - [#177] Fix wrong exit code with --synchronous option 126 | 127 | ## [v0.4.0] - 2022-02-24 128 | 129 | This release abstracts the scanners, eases deployment 130 | by putting the event handler in redis, provides better 131 | SSL support, and now requires Python 3.7+. 
132 | 133 | - [#171] Un-skip tests with resolved issues 134 | - [#167] Bump versions in setup.py and test image 135 | - [#170] Fix tests to use event_handler files 136 | - Bump celery from 4.2.1 to 5.2.2 137 | - Bump urllib3 from 1.24.2 to 1.26.5 138 | - Bump jinja2 from 2.10 to 2.11.3 139 | - [#102] event_handler goes into redis 140 | - [#159] add performance benchmark test harness 141 | - [#147][#157] Allow running workers with env only 142 | - [#156] modified test to use resc_hier string 143 | - [#155] added helper for unicode errors and renamed variables 144 | - [#110] Add several interfaces for refactor 145 | - [irods/python-irodsclient#237] load certificate into ssl context 146 | - fixed the parsing of the S3 region parameter 147 | - Bump werkzeug from 0.14.1 to 0.15.3 148 | - [#125] Add non-SSL connection option for S3 149 | - [#86][#117] Test suite cleanup + docker image 150 | - Correct README.md for docker instructions 151 | - [#109] Update docker steps for Celery 152 | - [#114] Remove zone hint check 153 | - [#90] Honor CELERY_BROKER_URL when present 154 | 155 | ## [v0.3.8] - 2019-11-12 156 | 157 | This release fixes handling of stopped periodic jobs 158 | 159 | - [#103] revoke scheduled celery restart jobs on stop 160 | 161 | ## [v0.3.7] - 2019-08-27 162 | 163 | This release fixes a prefix handling bug when scanning S3. 164 | 165 | - [#98] Preserve trailing slash for S3 prefix 166 | 167 | ## [v0.3.6] - 2019-08-14 168 | 169 | This release fixes a path registration bug when scanning 170 | S3 and updates a dependency. 171 | 172 | - Bump urllib3 from 1.24.1 to 1.24.2 173 | - [#95] Replaced lstrip with index and offset 174 | 175 | ## [v0.3.5] - 2019-04-10 176 | 177 | This release adds support for non utf-8 filenames 178 | and tests for code coverage. 179 | 180 | - [#88] Limit Celery version 181 | - [#63] make easier to test against a non-default zone 182 | - [#63] Add more UnicodeEncodeError tests 183 | - [#51] Add tests for event handler PEPs 184 | - [#31] Handle invalid zone name in target coll 185 | - [#31] Add test for invalid zone name 186 | - [#76] Add max redis version and requirements.txt 187 | - [#40] Handle UnicodeEncodeError filenames for PUT 188 | - [#40] Add tests for non-encodeable filename 189 | - [#78] Add documentation around VM overcommitting 190 | 191 | ## [v0.3.4] - 2018-11-15 192 | 193 | - [#76] Pin redis version to 2.10.6 194 | 195 | ## [v0.3.3] - 2018-10-27 196 | 197 | - [#75] Honor SSL parameters in irods_environment.json 198 | 199 | ## [v0.3.2] - 2018-09-25 200 | 201 | - [#69] Don't follow symlinks to dirs 202 | 203 | ## [v0.3.1] - 2018-09-20 204 | 205 | - [#49] Fix S3 syncing dir and registering folder 206 | 207 | ## [v0.3.0] - 2018-09-19 208 | 209 | This release adds support for scanning S3 in addition to 210 | locally mounted filesystems. To improve performance, a 211 | default Celery worker will now work on 50 files, rather than 1. 
212 | 213 | - [#49] Add support for scanning S3 214 | - [#51] Fix policy points for syncing directories 215 | - [#52] Remove list_dir option 216 | 217 | ## [v0.2.2] - 2018-09-10 218 | 219 | - [#50] fixed invocation used for collection events 220 | 221 | ## [v0.2.1] - 2018-09-06 222 | 223 | - [#45] check permission before enqueueing a file/dir 224 | - [#46] add missing scandir dependency 225 | - [#47] only call cancel if timer is instantiated 226 | 227 | ## [v0.2.0] - 2018-09-03 228 | 229 | - Swap queueing technology to Celery from RedisQueue 230 | - Handles non-utf8-encodeable filenames 231 | - Allows filetype/filename/directory exclusions 232 | - Adds performance profiler 233 | - Adds a NO_OP operation 234 | 235 | ## [v0.1.0] - 2018-05-11 236 | 237 | - Initial release 238 | - Python3 required 239 | - Includes five operations 240 | - Includes logging 241 | - Nascent support for Docker, Kubernetes, and Helm 242 | 243 | [Unreleased]: https://github.com/irods/irods_capability_automated_ingest/compare/v0.3.8...HEAD 244 | [v0.3.8]: https://github.com/irods/irods_capability_automated_ingest/compare/v0.3.7...v0.3.8 245 | [v0.3.7]: https://github.com/irods/irods_capability_automated_ingest/compare/v0.3.6...v0.3.7 246 | [v0.3.6]: https://github.com/irods/irods_capability_automated_ingest/compare/v0.3.5...v0.3.6 247 | [v0.3.5]: https://github.com/irods/irods_capability_automated_ingest/compare/v0.3.4...v0.3.5 248 | [v0.3.4]: https://github.com/irods/irods_capability_automated_ingest/compare/v0.3.3...v0.3.4 249 | [v0.3.3]: https://github.com/irods/irods_capability_automated_ingest/compare/v0.3.2...v0.3.3 250 | [v0.3.2]: https://github.com/irods/irods_capability_automated_ingest/compare/v0.3.1...v0.3.2 251 | [v0.3.1]: https://github.com/irods/irods_capability_automated_ingest/compare/v0.3.0...v0.3.1 252 | [v0.3.0]: https://github.com/irods/irods_capability_automated_ingest/compare/v0.2.2...v0.3.0 253 | [v0.2.2]: https://github.com/irods/irods_capability_automated_ingest/compare/v0.2.1...v0.2.2 254 | [v0.2.1]: https://github.com/irods/irods_capability_automated_ingest/compare/v0.2.0...v0.2.1 255 | [v0.2.0]: https://github.com/irods/irods_capability_automated_ingest/compare/v0.1.0...v0.2.0 256 | [v0.1.0]: https://github.com/irods/irods_capability_automated_ingest/compare/11f9825df721a19dd25dad70aa94e5aa73d1d941...v0.1.0 257 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018, The University of North Carolina at Chapel Hill 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 5 | 6 | - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 7 | 8 | - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 9 | 10 | - Neither the name of the University of North Carolina at Chapel Hill nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
11 | 12 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 13 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include AUTHORS CHANGELOG.md LICENSE.txt README.md -------------------------------------------------------------------------------- /capability_automated_ingest_filesystem_scanner.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irods/irods_capability_automated_ingest/da87d89acab7136ac5610195b0cc5e5abd983795/capability_automated_ingest_filesystem_scanner.jpg -------------------------------------------------------------------------------- /capability_automated_ingest_landing_zone.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irods/irods_capability_automated_ingest/da87d89acab7136ac5610195b0cc5e5abd983795/capability_automated_ingest_landing_zone.jpg -------------------------------------------------------------------------------- /docker/demo/README.md: -------------------------------------------------------------------------------- 1 | # Ingest Demo Compose Project 2 | 3 | **DO NOT USE THIS IN PRODUCTION!!** 4 | 5 | Use this Compose project to test out the ingest tool. There is a Docker volume shared between the iRODS service and the ingest workers that can be used for testing scans. There is another shared volume used to host the Minio storage. 6 | 7 | It's easiest to try out scanning things from the `ingest-celery-workers` service instance. 8 | 9 | ## Build 10 | 11 | ``` 12 | docker compose build 13 | ``` 14 | 15 | The `ingest-celery-workers` service has a build argument that allows for controlling the version of the ingest package. Here's how to use it: 16 | 17 | ``` 18 | docker compose build --build-arg IRODS_AUTOMATED_INGEST_PIP_PACKAGE=git+https://github.com/irods/irods_capability_automated_ingest@main 19 | ``` 20 | 21 | This will clone the specified git repository and checkout the commit-ish specified. You could also specify a released version: 22 | 23 | ``` 24 | docker compose build --build-arg IRODS_AUTOMATED_INGEST_PIP_PACKAGE=irods-capability-automated-ingest==0.4.2 25 | ``` 26 | 27 | If no `--build-arg` is specified, the default build will install the latest released version of the package from PyPI. 
The following is equivalent to not specifying a `--build-arg` when building the project: 28 | ``` 29 | docker compose build --build-arg IRODS_AUTOMATED_INGEST_PIP_PACKAGE=irods-capability-automated-ingest 30 | ``` 31 | 32 | ## Running the project 33 | 34 | This demo simply starts the services and leaves them running with the expectation that commands will be issued to them either through `docker exec` or via client requests to the various endpoints. 35 | 36 | It is a simple project, so starting and stopping it are straightforward. 37 | 38 | To bring the project up: 39 | 40 | ``` 41 | docker compose up 42 | ``` 43 | 44 | To bring the project down: 45 | 46 | ``` 47 | docker compose down 48 | ``` 49 | 50 | The other `docker compose` commands (`start`, `stop`, `restart`, etc.) should work as expected, as well. 51 | 52 | If you wish to adjust the Celery concurrency, modify the Compose YAML file to adjust the `command` run by the `ingest-celery-workers` service: 53 | ```yaml 54 | command: ["-c", "2"] # Adjust the "2" value to whatever concurrency you want 55 | ``` 56 | The `command` can only be adjusted before the container is created, so if you wish to adjust the concurrency after the project is already up, you will need to recreate the `ingest-celery-workers` service instance containers. 57 | 58 | ## Scanning an S3 bucket 59 | 60 | Change the port exposed by the `minio` service, if needed, so that the MinIO Console can be accessed. The MinIO server is being run with access key `irods` and secret key `irodsadmin`. The place from which the job is launched should have a keypair file with these credentials: 61 | ``` 62 | irods 63 | irodsadmin 64 | ``` 65 | 66 | To perform a basic scan of an S3 bucket called, for example, `ingest-test-bucket`, run something like the following: 67 | 68 | ``` 69 | python3 -m irods_capability_automated_ingest.irods_sync start \ 70 | /ingest-test-bucket \ 71 | /tempZone/home/rods/ingest-test-bucket \ 72 | --s3_keypair /path/to/s3keypair.txt \ 73 | --s3_endpoint_domain minio:19000 \ 74 | --s3_insecure_connection \ 75 | --synchronous \ 76 | --progress 77 | ``` 78 | 79 | It's easiest to try out scanning things from the `ingest-celery-workers` service instance. 80 | 81 | ## Performance testing 82 | 83 | While using Docker is not going to get you the best possible performance numbers, it can be useful for benchmarking certain tasks in a reproducible environment. 84 | 85 | This section will describe some interesting things you can do to test out various configurations for performance. 86 | 87 | ### Celery configuration 88 | 89 | As mentioned in other sections, the `concurrency` configuration can be changed before container creation for the `ingest-celery-workers` service by overriding the `command` in the Docker Compose YAML file. This affects the number of Celery workers in a given service instance. 90 | 91 | Celery has a number of other configurations for the workers which can help with performance: [https://docs.celeryq.dev/en/stable/userguide/configuration.html#worker](https://docs.celeryq.dev/en/stable/userguide/configuration.html#worker) 92 | 93 | ### Docker Compose service scaling 94 | 95 | The `ingest-celery-workers` service can be "scaled up" using the `--scale` option of `docker compose up`. 
The default scale is 1 service instance, but the scale can be adjusted like this: 96 | ```bash 97 | docker compose up --scale ingest-celery-workers=4 # replace 4 with desired number of instances 98 | ``` 99 | The above line will spawn 4 instances (containers) of the `ingest-celery-workers` service with each instance having a `concurrency` of whatever has been configured. With the default configuration, this would be 2, for a total of 8 workers across the 4 containers. This can even be done when the project is already up to scale the number of instances up without affecting the existing containers. This can of course be used to scale *down* the number of instances as well. 100 | 101 | ### Network manipulation with Traffic Control (`tc`) 102 | 103 | `tc` can be used to simulate network delays and other networking conditions that may not ordinarily be present. See the `tc` documentation for more information: [https://linux.die.net/man/8/tc](https://linux.die.net/man/8/tc) 104 | 105 | Network traffic manipulation requires enabling the additional capability `NET_ADMIN` in the target containers. Remember that "additional capabilities" can only be added at container creation. This can be done a number of different ways, but the simplest way for this project is to add the following `cap_add` stanza to the `ingest-celery-workers` service in the Docker Compose YAML file: 106 | ```yaml 107 | cap_add: 108 | - NET_ADMIN 109 | ``` 110 | 111 | Here are some useful commands to try executing inside the `ingest-celery-workers` service instance containers for manipulating network traffic: 112 | ```bash 113 | tc qdisc add dev eth0 root netem delay 100ms # to add rule 114 | tc qdisc show dev eth0 # to show rules 115 | tc qdisc del dev eth0 root netem # to delete rule 116 | ``` 117 | Note: In order to run `tc`, the proper package must be installed in the container(s) in which the command will be running. For most Linux distributions, this is `iproute2`. 118 | -------------------------------------------------------------------------------- /docker/demo/compose.yaml: -------------------------------------------------------------------------------- 1 | name: irods-ingest-demo 2 | 3 | services: 4 | redis: 5 | image: redis:7 6 | 7 | irods-catalog: 8 | build: 9 | context: irods_catalog 10 | environment: 11 | - POSTGRES_PASSWORD=testpassword 12 | 13 | irods-catalog-provider: 14 | build: 15 | context: irods_catalog_provider 16 | healthcheck: 17 | test: ["CMD", "su", "-", "irods", "-c", "./irodsctl status | grep Process"] 18 | interval: 10s 19 | timeout: 10s 20 | retries: 3 21 | start_period: 20s 22 | start_interval: 10s 23 | volumes: 24 | - shared_volume:/data/ufs 25 | depends_on: 26 | irods-catalog: 27 | condition: service_started 28 | 29 | ingest-celery-workers: 30 | build: 31 | context: ingest_celery_workers 32 | environment: 33 | - CELERY_BROKER_URL=redis://redis:6379/0 34 | - IRODS_PORT=1247 35 | - IRODS_HOST=irods-catalog-provider 36 | - IRODS_USER_NAME=rods 37 | - IRODS_ZONE_NAME=tempZone 38 | - IRODS_PASSWORD=rods 39 | volumes: 40 | - shared_volume:/data/ufs 41 | depends_on: 42 | redis: 43 | condition: service_started 44 | irods-catalog-provider: 45 | condition: service_healthy 46 | command: ["-c", "2", "--loglevel", "INFO", "-n", "ingest-demo"] # Configure Celery options here. Note: Only takes effect at container creation. 47 | 48 | minio: 49 | image: minio/minio:RELEASE.2024-09-13T20-26-02Z 50 | ports: 51 | - "19000:19000" # This is the port to use for issuing S3 requests. 
52 | - "19001:19001" # Change this port, if needed, to access the MinIO console webpage. 53 | command: minio server /data/minio-s3 54 | environment: 55 | MINIO_ROOT_USER: irods 56 | MINIO_ROOT_PASSWORD: irodsadmin 57 | MINIO_ADDRESS: ":19000" 58 | MINIO_CONSOLE_ADDRESS: ":19001" 59 | 60 | volumes: 61 | shared_volume: 62 | -------------------------------------------------------------------------------- /docker/demo/ingest_celery_workers/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11 2 | 3 | ARG IRODS_AUTOMATED_INGEST_PIP_PACKAGE="irods-capability-automated-ingest" 4 | 5 | RUN pip install ${IRODS_AUTOMATED_INGEST_PIP_PACKAGE} 6 | 7 | ENTRYPOINT ["celery", "-A", "irods_capability_automated_ingest", "worker", "-Q", "restart,path,file"] 8 | 9 | # Override the command at runtime to adjust Celery concurrency and other options. 10 | CMD ["-c", "2"] 11 | -------------------------------------------------------------------------------- /docker/demo/irods_catalog/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM postgres:14 2 | 3 | COPY init-user-db.sh /docker-entrypoint-initdb.d/init-user-db.sh 4 | -------------------------------------------------------------------------------- /docker/demo/irods_catalog/init-user-db.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Adapted from "Initialization script" in documentation for official Postgres dockerhub: 4 | # https://hub.docker.com/_/postgres/ 5 | set -e 6 | 7 | psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" --dbname "$POSTGRES_DB" <<-EOSQL 8 | CREATE DATABASE "ICAT"; 9 | CREATE USER irods WITH PASSWORD 'testpassword'; 10 | GRANT ALL PRIVILEGES ON DATABASE "ICAT" to irods; 11 | EOSQL 12 | -------------------------------------------------------------------------------- /docker/demo/irods_catalog_provider/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:22.04 2 | 3 | ENV DEBIAN_FRONTEND=noninteractive 4 | 5 | RUN apt-get update && \ 6 | apt-get install -y \ 7 | apt-transport-https \ 8 | gnupg \ 9 | wget \ 10 | && \ 11 | apt-get clean && \ 12 | rm -rf /var/lib/apt/lists/* /tmp/* 13 | 14 | RUN wget -qO - https://packages.irods.org/irods-signing-key.asc | apt-key add - && \ 15 | echo "deb [arch=amd64] https://packages.irods.org/apt/ jammy main" | tee /etc/apt/sources.list.d/renci-irods.list 16 | 17 | RUN apt-get update && \ 18 | apt-get install -y \ 19 | libcurl4-gnutls-dev \ 20 | python3 \ 21 | python3-distro \ 22 | python3-jsonschema \ 23 | python3-pip \ 24 | python3-psutil \ 25 | python3-requests \ 26 | rsyslog \ 27 | unixodbc \ 28 | && \ 29 | apt-get clean && \ 30 | rm -rf /var/lib/apt/lists/* /tmp/* 31 | 32 | RUN apt-get update && \ 33 | apt-get install -y \ 34 | irods-database-plugin-postgres \ 35 | irods-runtime \ 36 | irods-server \ 37 | && \ 38 | apt-get clean && \ 39 | rm -rf /var/lib/apt/lists/* /tmp/* 40 | 41 | COPY setup.input / 42 | RUN mv /setup.input /irods_setup.input 43 | 44 | COPY entrypoint.sh / 45 | RUN chmod u+x /entrypoint.sh 46 | ENTRYPOINT ["/entrypoint.sh"] 47 | -------------------------------------------------------------------------------- /docker/demo/irods_catalog_provider/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash -e 2 | 3 | catalog_db_hostname=irods-catalog 4 | 5 | echo "Waiting for iRODS catalog database to be ready" 6 | 7 | until pg_isready -h ${catalog_db_hostname} -d ICAT -U irods -q 8 | do 9 | sleep 1 10 | done 11 | 12 | echo "iRODS catalog database is ready" 13 | 14 | setup_input_file=/irods_setup.input 15 | 16 | if [ -e "${setup_input_file}" ]; then 17 | echo "Running iRODS setup" 18 | python3 /var/lib/irods/scripts/setup_irods.py < "${setup_input_file}" 19 | rm /irods_setup.input 20 | fi 21 | 22 | echo "Starting server" 23 | 24 | cd /usr/sbin 25 | su irods -c 'bash -c "./irodsServer -u"' 26 | -------------------------------------------------------------------------------- /docker/demo/irods_catalog_provider/setup.input: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | irods-catalog 6 | 5432 7 | ICAT 8 | irods 9 | y 10 | testpassword 11 | 12 | y 13 | demoResc 14 | 15 | tempZone 16 | 1247 17 | 20000 18 | 20199 19 | 1248 20 | 21 | rods 22 | y 23 | TEMPORARY_ZONE_KEY 24 | 32_byte_server_negotiation_key__ 25 | 32_byte_server_control_plane_key 26 | rods 27 | 28 | 29 | -------------------------------------------------------------------------------- /docker/ingest-test/README.md: -------------------------------------------------------------------------------- 1 | # How to run the test suite using docker-compose 2 | 3 | ## Step 1: Build the images 4 | 5 | Run the following to build the required images: 6 | ``` 7 | docker compose build 8 | ``` 9 | When testing against an alternative version of iRODS, there are three variables in the `docker-compose.yml` file which must be changed prior to the build step. For example, if testing against iRODS 4.3.3: 10 | ``` 11 | irods-catalog-provider: 12 | build: 13 | args: 14 | irods_version: 4.3.3-0~jammy 15 | irods_version_major_minor: 4.3 16 | py_version: 3 17 | ``` 18 | Note that, depending on whether the iRODS major/minor version is 4.2 or 4.3, the `py_version` takes on the possible values `""` or `"3"`, respectively. 19 | 20 | ## Step 2: Run the project 21 | 22 | Bring up the docker-compose project and the test suite will run on its own: 23 | ``` 24 | docker compose --env-file icommands.env up 25 | ``` 26 | The test suite is one of the services of the docker-compose project, so it will run on its own. The container is tied to the tests running, so it will exit once completed. 27 | The `--env-file` option is required in order to correctly configure the environment for the tests. 
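To run only a subset of the tests, override `TEST_CASE`, which is passed through to the test container (see `icommands.env` and the `ingest-test` service in `docker-compose.yml`). As a sketch, assuming that values set in the shell take precedence over those in `icommands.env`:
```
TEST_CASE=irods_capability_automated_ingest.test.test_irods_sync docker compose --env-file icommands.env up
```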
28 | 29 | ## Step 3: Bring down the project 30 | 31 | The project is not made to come down by itself (yet), so it has to be brought down after each run: 32 | ``` 33 | docker compose down 34 | ``` 35 | -------------------------------------------------------------------------------- /docker/ingest-test/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | some-redis: 5 | image: redis 6 | hostname: redis 7 | networks: 8 | default: 9 | aliases: 10 | - redis 11 | 12 | icat: 13 | build: 14 | context: icat 15 | args: 16 | postgres_password: testpassword 17 | hostname: catalog.example.org 18 | networks: 19 | default: 20 | aliases: 21 | - catalog.example.org 22 | 23 | irods-catalog-provider: 24 | build: 25 | context: provider 26 | args: 27 | irods_version: 4.3.3-0~jammy 28 | irods_version_major_minor: 4.3 29 | py_version: 3 30 | hostname: icat.example.org 31 | networks: 32 | default: 33 | aliases: 34 | - icat.example.org 35 | volumes: 36 | - shared_volume:/data/ufs 37 | depends_on: 38 | - icat 39 | 40 | ingest-test: 41 | build: 42 | context: test 43 | environment: 44 | - "PIP_PACKAGE" 45 | - "TEST_CASE" 46 | - "IRODS_PORT" 47 | - "IRODS_HOST" 48 | - "IRODS_USER_NAME" 49 | - "IRODS_ZONE_NAME" 50 | - "IRODS_ENVIRONMENT_FILE" 51 | - "IRODS_PASSWORD" 52 | volumes: 53 | - shared_volume:/data/ufs 54 | depends_on: 55 | - some-redis 56 | - irods-catalog-provider 57 | 58 | minio: 59 | image: minio/minio:RELEASE.2024-09-13T20-26-02Z 60 | ports: 61 | - "19000:19000" # This is the port to use for issuing S3 requests. 62 | - "19001:19001" # Change this port, if needed, to access the MinIO console webpage. 63 | command: minio server /data/minio-s3 64 | environment: 65 | MINIO_ROOT_USER: irods 66 | MINIO_ROOT_PASSWORD: irodsadmin 67 | MINIO_ADDRESS: ":19000" 68 | MINIO_CONSOLE_ADDRESS: ":19001" 69 | 70 | volumes: 71 | shared_volume: 72 | -------------------------------------------------------------------------------- /docker/ingest-test/icat/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM postgres:11 2 | 3 | ARG postgres_password 4 | ENV POSTGRES_PASSWORD ${postgres_password} 5 | 6 | COPY postgres_init.sh /docker-entrypoint-initdb.d/ 7 | -------------------------------------------------------------------------------- /docker/ingest-test/icat/postgres_init.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" --dbname "$POSTGRES_DB" <<-EOSQL 5 | CREATE USER irods with password 'testpassword'; 6 | CREATE DATABASE "ICAT"; 7 | GRANT ALL PRIVILEGES ON DATABASE "ICAT" TO irods; 8 | EOSQL 9 | -------------------------------------------------------------------------------- /docker/ingest-test/icommands.env: -------------------------------------------------------------------------------- 1 | IRODS_PORT=1247 2 | IRODS_HOST=icat.example.org 3 | IRODS_USER_NAME=rods 4 | IRODS_ZONE_NAME=tempZone 5 | IRODS_ENVIRONMENT_FILE=/irods_environment.json 6 | IRODS_PASSWORD=rods 7 | PIP_PACKAGE=git+https://github.com/irods/irods_capability_automated_ingest@main 8 | TEST_CASE="irods_capability_automated_ingest.test.test_s3_bucket_scan irods_capability_automated_ingest.test.test_delete_modes irods_capability_automated_ingest.test.test_irods_sync" 9 | -------------------------------------------------------------------------------- /docker/ingest-test/provider/Dockerfile: 
-------------------------------------------------------------------------------- 1 | # 2 | # iRODS Provider Image. 3 | # 4 | FROM ubuntu:22.04 5 | ARG irods_version 6 | ARG irods_version_major_minor 7 | ARG py_version 8 | ENV PY_VERSION="${py_version}" 9 | 10 | ENV DEBIAN_FRONTEND=noninteractive 11 | 12 | RUN apt-get update && \ 13 | apt-get install -y \ 14 | apt-transport-https \ 15 | gnupg \ 16 | wget \ 17 | && \ 18 | apt-get clean && \ 19 | rm -rf /var/lib/apt/lists/* /tmp/* 20 | 21 | # Install pre-requisites 22 | RUN wget -qO - https://packages.irods.org/irods-signing-key.asc | apt-key add - && \ 23 | echo "deb [arch=amd64] https://packages.irods.org/apt/ jammy main" | tee /etc/apt/sources.list.d/renci-irods.list 24 | 25 | RUN apt-get update && \ 26 | apt-get install -y \ 27 | libcurl4-gnutls-dev \ 28 | python3 \ 29 | python3-distro \ 30 | python3-jsonschema \ 31 | python3-pip \ 32 | python3-psutil \ 33 | python3-requests \ 34 | rsyslog \ 35 | unixodbc \ 36 | && \ 37 | apt-get clean && \ 38 | rm -rf /var/lib/apt/lists/* /tmp/* 39 | 40 | RUN wget -qO - https://packages.irods.org/irods-signing-key.asc | apt-key add -; \ 41 | echo "deb [arch=amd64] https://packages.irods.org/apt/ $(lsb_release -sc) main" | tee /etc/apt/sources.list.d/renci-irods.list; \ 42 | apt-get update && \ 43 | apt-get install -y \ 44 | irods-runtime=${irods_version} \ 45 | irods-icommands=${irods_version} \ 46 | irods-server=${irods_version} \ 47 | irods-database-plugin-postgres=${irods_version} 48 | 49 | # Set command to execute when launching the container. 50 | COPY --chmod=755 start_provider.sh / 51 | COPY irods_${irods_version_major_minor}_provider.input /irods_provider.input 52 | ENTRYPOINT ["./start_provider.sh"] 53 | -------------------------------------------------------------------------------- /docker/ingest-test/provider/db_commands.txt: -------------------------------------------------------------------------------- 1 | CREATE DATABASE "ICAT"; 2 | CREATE USER irods WITH PASSWORD 'testpassword'; 3 | GRANT ALL PRIVILEGES ON DATABASE "ICAT" to irods; 4 | \q 5 | -------------------------------------------------------------------------------- /docker/ingest-test/provider/irods_4.2_provider.input: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | catalog.example.org 6 | 5432 7 | ICAT 8 | irods 9 | y 10 | testpassword 11 | 12 | tempZone 13 | 1247 14 | 20000 15 | 20199 16 | 1248 17 | 18 | rods 19 | y 20 | TEMPORARY_ZONE_KEY 21 | 32_byte_server_negotiation_key__ 22 | 32_byte_server_control_plane_key 23 | rods 24 | 25 | 26 | -------------------------------------------------------------------------------- /docker/ingest-test/provider/irods_4.3_provider.input: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | catalog.example.org 6 | 5432 7 | ICAT 8 | irods 9 | y 10 | testpassword 11 | 12 | y 13 | demoResc 14 | /var/lib/irods/Vault 15 | tempZone 16 | 1247 17 | 20000 18 | 20199 19 | 1248 20 | 21 | rods 22 | y 23 | TEMPORARY_ZONE_KEY 24 | 32_byte_server_negotiation_key__ 25 | 32_byte_server_control_plane_key 26 | rods 27 | 28 | 29 | -------------------------------------------------------------------------------- /docker/ingest-test/provider/start_provider.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Start the Postgres database. 
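# (The database itself runs in the separate catalog container; this loop only waits for it to accept connections.)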
4 | counter=0 5 | until pg_isready -h catalog.example.org -d ICAT -U irods -q 6 | do 7 | sleep 1 8 | ((counter += 1)) 9 | done 10 | echo Postgres took approximately $counter seconds to fully start ... 11 | 12 | # Set up iRODS if not already done 13 | if [ ! -e /var/lib/irods/setup_complete ] 14 | then 15 | python${PY_VERSION} /var/lib/irods/scripts/setup_irods.py < /irods_provider.input 16 | fi 17 | 18 | # run the server 19 | su - irods -c "/var/lib/irods/irodsctl restart" 20 | 21 | touch /var/lib/irods/setup_complete 22 | 23 | # Keep container running if the test fails. 24 | tail -f /dev/null 25 | # Is this better? sleep 2147483647d 26 | 27 | -------------------------------------------------------------------------------- /docker/ingest-test/run_tests.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | set -e 4 | 5 | docker-compose --env-file icommands.env up & 6 | 7 | until [ $(docker container inspect -f '{{.State.Status}}' ingest-test_ingest-test_1) ]; do 8 | #echo "waiting for container to exist" 9 | sleep 1 10 | done 11 | 12 | while [ ! $(docker container inspect -f '{{.State.Status}}' ingest-test_ingest-test_1) == "running" ]; do 13 | #echo "waiting for container to run" 14 | sleep 1 15 | done 16 | 17 | #echo "test container is up" 18 | 19 | while [ $(docker container inspect -f '{{.State.Status}}' ingest-test_ingest-test_1) == "running" ]; do 20 | #echo "waiting for tests to finish" 21 | sleep 1 22 | done 23 | 24 | docker-compose down 25 | 26 | -------------------------------------------------------------------------------- /docker/ingest-test/test/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11 2 | 3 | RUN apt update && apt install -y netcat-traditional 4 | 5 | COPY irods_environment.json / 6 | 7 | ENV TEST_CASE=${TEST_CASE} 8 | 9 | COPY run_tests.sh / 10 | RUN chmod u+x /run_tests.sh 11 | ENTRYPOINT ["./run_tests.sh"] 12 | -------------------------------------------------------------------------------- /docker/ingest-test/test/Dockerfile.pure: -------------------------------------------------------------------------------- 1 | FROM python:3.5 2 | 3 | ARG PIP_PACKAGE="irods-capability-automated-ingest" 4 | 5 | RUN pip install ${PIP_PACKAGE} 6 | 7 | COPY irods_environment.json / 8 | 9 | ENV TEST_CASE=${TEST_CASE} 10 | 11 | ENTRYPOINT python -m unittest ${TEST_CASE:-irods_capability_automated_ingest.test.test_irods_sync} 12 | 13 | #FROM ingest:latest 14 | #ENV TEST_CASE=${TEST_CASE} 15 | #ENTRYPOINT python -m unittest ${TEST_CASE:-irods_capability_automated_ingest.test.test_irods_sync} 16 | -------------------------------------------------------------------------------- /docker/ingest-test/test/irods_environment.json: -------------------------------------------------------------------------------- 1 | { 2 | "irods_host": "icat.example.org", 3 | "irods_port": 1247, 4 | "irods_user": "rods", 5 | "irods_zone_name": "tempZone" 6 | } 7 | -------------------------------------------------------------------------------- /docker/ingest-test/test/run_tests.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash -ex 2 | 3 | pip install ${PIP_PACKAGE} 4 | 5 | # Wait until the provider is up and accepting connections. 
6 | until nc -z icat.example.org 1247; do 7 | sleep 1 8 | done 9 | 10 | sleep 10 11 | 12 | python -m unittest -v ${TEST_CASE} 13 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irods/irods_capability_automated_ingest/da87d89acab7136ac5610195b0cc5e5abd983795/irods_capability_automated_ingest/__init__.py -------------------------------------------------------------------------------- /irods_capability_automated_ingest/celery.py: -------------------------------------------------------------------------------- 1 | from . import custom_event_handler, sync_logging 2 | 3 | from celery import Celery 4 | from celery.signals import task_prerun, task_postrun 5 | 6 | import traceback 7 | 8 | app = Celery("irods_capability_automated_ingest") 9 | 10 | app.conf.update( 11 | include=[ 12 | "irods_capability_automated_ingest.tasks.delete_tasks", 13 | "irods_capability_automated_ingest.tasks.filesystem_tasks", 14 | "irods_capability_automated_ingest.tasks.s3_bucket_tasks", 15 | ] 16 | ) 17 | 18 | 19 | @task_prerun.connect() 20 | def task_prerun(task_id=None, task=None, args=None, kwargs=None, **kw): 21 | meta = args[0] 22 | if meta["profile"]: 23 | config = meta["config"] 24 | profile_log = config.get("profile") 25 | logger = sync_logging.get_sync_logger(profile_log) 26 | logger.info( 27 | "task_prerun", 28 | event_id=task_id, 29 | event_name=task.name, 30 | path=meta.get("path"), 31 | target=meta.get("target"), 32 | hostname=task.request.hostname, 33 | index=current_process().index, 34 | ) 35 | 36 | 37 | @task_postrun.connect() 38 | def task_postrun( 39 | task_id=None, task=None, args=None, kwargs=None, retval=None, state=None, **kw 40 | ): 41 | meta = args[0] 42 | if meta["profile"]: 43 | config = meta["config"] 44 | profile_log = config.get("profile") 45 | logger = sync_logging.get_sync_logger(profile_log) 46 | logger.info( 47 | "task_postrun", 48 | event_id=task_id, 49 | event_name=task.name, 50 | path=meta.get("path"), 51 | target=meta.get("target"), 52 | hostname=task.request.hostname, 53 | index=current_process().index, 54 | state=state, 55 | ) 56 | 57 | 58 | class RestartTask(app.Task): 59 | def on_failure(self, exc, task_id, args, kwargs, einfo): 60 | meta = args[0] 61 | config = meta["config"] 62 | job_name = meta["job_name"] 63 | logger = sync_logging.get_sync_logger(config["log"]) 64 | logger.error( 65 | "failed_restart", 66 | path=meta["path"], 67 | job_name=job_name, 68 | task_id=task_id, 69 | exc=exc, 70 | einfo=einfo, 71 | traceback=traceback.extract_tb(exc.__traceback__), 72 | ) 73 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/char_map_util.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import hashlib 3 | import logging 4 | import re 5 | import collections 6 | import string 7 | import functools 8 | 9 | __all__ = ["translate_path"] 10 | _regex = re.compile("") 11 | 12 | # use as a key in the character mapping to ensure a regex matches to exactly one character 13 | 14 | 15 | def _re_wrapper(regex): 16 | return lambda ch: type(ch) is str and len(ch) == 1 and regex.match(ch) 17 | 18 | 19 | _string_replace = lambda _string, _map: _string.translate( 20 | {ord(k): v for k, v in _map.items() if v is not None} 21 | ) 22 | 23 | _logger = logging.getLogger("char_map_util") 24 | 
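# Candidate separator and radix-digit characters for building the suffix that encodes remapped characters; the first call to _update_Allowed() filters each set down to the characters the active character map leaves unchanged.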
25 | SEPARATORS = [c for c in "-~_"] 26 | Allowed = [ 27 | {}, 28 | { 29 | "separators": "".join(SEPARATORS), 30 | "radixchars": string.digits + string.ascii_letters, 31 | "punctuation": "".join( 32 | sorted(set(string.punctuation) - set(["/"] + SEPARATORS)) 33 | ), 34 | }, 35 | ] 36 | 37 | 38 | def _allowed_in_string(s, map_fn): 39 | s_new = translate_string(s, map_fn) 40 | return "".join(a for a, b in zip(s, s_new) if a == b) 41 | 42 | 43 | class InvalidUsage(Exception): 44 | pass 45 | 46 | 47 | def _update_Allowed(map_fn=None): 48 | if len(Allowed) == 2: 49 | if map_fn is None: 50 | raise InvalidUsage( 51 | "The first call to this function needs a dictionary in map_fn" 52 | ) 53 | d = Allowed.pop() 54 | Allowed[0].update((k, _allowed_in_string(v, map_fn)) for k, v in d.items()) 55 | return Allowed[0] 56 | 57 | 58 | def _allowed_of_type(key, map_fn=None): 59 | return _update_Allowed(map_fn)[key] 60 | 61 | 62 | _fb_hash = hashlib.sha224 63 | _fb_obj = _fb_hash().digest() 64 | 65 | 66 | def _fallback(name=None): 67 | if name is None: 68 | return _fb_obj 69 | else: 70 | h = _fb_hash() 71 | h.update(name.encode("utf8")) 72 | return h.digest() 73 | 74 | 75 | _change_encoding_test = lambda c: c 76 | _change_encoding_default = lambda c: ( 77 | chr(c).encode("utf8") if type(c) is int else c.encode("utf8") 78 | ) 79 | 80 | 81 | # must be called after first use of _encoded_differences() 82 | def _diffs_encoded_to_suffix(diff_bytes, rxarray=None): 83 | if not diff_bytes: 84 | return "" 85 | number = functools.reduce((lambda a, b: (a << 8) | b), diff_bytes) 86 | radixrep = _update_Allowed()["separators"][:1] 87 | if rxarray is None: 88 | rxarray = _update_Allowed()["radixchars"] 89 | L = len(rxarray) 90 | while number: 91 | number, mod = divmod(number, L) 92 | radixrep += rxarray[mod] 93 | return radixrep 94 | 95 | 96 | def translate_string(s, mp): 97 | if not isinstance(mp, dict): 98 | mp = collections.OrderedDict(mp) 99 | for key, value in mp.items(): 100 | if isinstance(key, tuple): 101 | s = _string_replace(s, {k: value for k in key}) 102 | elif isinstance(key, _regex.__class__): 103 | s = key.sub(value, s) 104 | elif isinstance(key, str): 105 | s = _string_replace(s, {key: value}) 106 | elif callable(key): 107 | s = "".join(value if key(c) else c for c in s) 108 | return s 109 | 110 | 111 | def _encoded_differences(filename, MapFn=None, xfunc=_change_encoding_default): 112 | rx = _allowed_of_type("radixchars", map_fn=MapFn) 113 | newname = translate_string(filename, MapFn) 114 | gen = ( 115 | (tuple(xfunc(_) for _ in a), b) 116 | for a, b in zip(enumerate(filename), newname) 117 | if a[1] != b 118 | ) 119 | MaxBytes = len(_fallback()) 120 | encoded_change = b"" 121 | if xfunc is _change_encoding_test: 122 | return list(gen) 123 | # Generate suffix from encoded changes or the constant length SHA2 digest, whichever is shorter. 
124 | while True: 125 | try: 126 | g = next(gen) 127 | except StopIteration: 128 | break 129 | encoded_change += b"".join(g[0]) 130 | if len(encoded_change) >= MaxBytes: 131 | _logger.warning("Using SHA2 for {filename=}") 132 | return newname, _fallback(filename) 133 | return newname, encoded_change 134 | 135 | 136 | def translate_path_element(filename, map_fn, use_suffix=True): 137 | newname, enc_diffs = _encoded_differences(filename, map_fn) 138 | if use_suffix: 139 | suffix = _diffs_encoded_to_suffix(enc_diffs) 140 | return newname + suffix 141 | else: 142 | return newname 143 | 144 | 145 | def translate_path(path, mp, translate_function=translate_path_element): 146 | t_elem = [] 147 | for el in path.split("/"): 148 | if el == "": 149 | if not t_elem: 150 | t_elem.append("") 151 | continue 152 | new_el = translate_function(el, mp) 153 | t_elem.append(new_el) 154 | return "/".join(t_elem) 155 | 156 | 157 | if __name__ == "__main__": 158 | # Demonstration 159 | map_fn = ( 160 | [("!", "~", "0", "1"), "_"], # map your choice of things to an underscore 161 | [re.compile("[\u0100-\U00101fff]"), "~"], 162 | ) # map all non-ascii unicode to a tilde 163 | m = _update_Allowed(map_fn) 164 | import pprint 165 | 166 | pprint.pprint(m) 167 | newname, enc_diffs = _encoded_differences( 168 | "#041!2~93#041!2\u00ff9\U00101010Z", map_fn 169 | ) 170 | suffix = _diffs_encoded_to_suffix(enc_diffs) 171 | print(f"newname={newname}\nsuffix={suffix}") 172 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/core.py: -------------------------------------------------------------------------------- 1 | class Core(object): 2 | @classmethod 3 | def on_data_obj_create(cls, func, *args, **options): 4 | if hasattr(cls, "pre_data_obj_create"): 5 | cls.pre_data_obj_create(*args, **options) 6 | 7 | func(*args, **options) 8 | 9 | if hasattr(cls, "post_data_obj_create"): 10 | cls.post_data_obj_create(*args, **options) 11 | 12 | @classmethod 13 | def on_data_obj_modify(cls, func, *args, **options): 14 | if hasattr(cls, "pre_data_obj_modify"): 15 | cls.pre_data_obj_modify(*args, **options) 16 | 17 | func(*args, **options) 18 | 19 | if hasattr(cls, "post_data_obj_modify"): 20 | cls.post_data_obj_modify(*args, **options) 21 | 22 | @classmethod 23 | def on_data_obj_delete(cls, func, *args, **options): 24 | if hasattr(cls, "pre_data_obj_delete"): 25 | cls.pre_data_obj_delete(*args, **options) 26 | 27 | func(*args, **options) 28 | 29 | if hasattr(cls, "post_data_obj_delete"): 30 | cls.post_data_obj_delete(*args, **options) 31 | 32 | @classmethod 33 | def on_coll_create(cls, func, *args, **options): 34 | if hasattr(cls, "pre_coll_create"): 35 | cls.pre_coll_create(*args, **options) 36 | 37 | func(*args, **options) 38 | 39 | if hasattr(cls, "post_coll_create"): 40 | cls.post_coll_create(*args, **options) 41 | 42 | @classmethod 43 | def on_coll_modify(cls, func, *args, **options): 44 | if hasattr(cls, "pre_coll_modify"): 45 | cls.pre_coll_modify(*args, **options) 46 | 47 | func(*args, **options) 48 | 49 | if hasattr(cls, "post_coll_modify"): 50 | cls.post_coll_modify(*args, **options) 51 | 52 | @classmethod 53 | def on_coll_delete(cls, func, *args, **options): 54 | if hasattr(cls, "pre_coll_delete"): 55 | cls.pre_coll_delete(*args, **options) 56 | 57 | func(*args, **options) 58 | 59 | if hasattr(cls, "post_coll_delete"): 60 | cls.post_coll_delete(*args, **options) 61 | -------------------------------------------------------------------------------- 
/irods_capability_automated_ingest/custom_event_handler.py: -------------------------------------------------------------------------------- 1 | from .redis_key import redis_key_handle 2 | from .redis_utils import get_redis 3 | 4 | import importlib 5 | import os.path 6 | import sys 7 | 8 | 9 | class custom_event_handler(object): 10 | def __init__(self, meta): 11 | self.meta = meta.copy() 12 | self.logger = self.meta["config"]["log"] 13 | 14 | def get_module(self, rtn_mod_and_class=False): # get_ev_handler_class or something 15 | r = get_redis(self.meta["config"]) 16 | key = "event_handler" 17 | 18 | job_name = self.meta["job_name"] 19 | 20 | # reconstructing redis key from meta 21 | event_handler_key_str = self.meta["event_handler_key"] 22 | event_handler_split = event_handler_key_str.split(":/") 23 | event_handler_key = redis_key_handle( 24 | r, event_handler_split[0], event_handler_split[1] 25 | ) 26 | 27 | content_string = event_handler_key.get_value() 28 | 29 | # getting uuid for file construction 30 | event_handler_str = event_handler_key.get_key().split("::") 31 | uuid_ = event_handler_str[1] 32 | 33 | eh_file_name = "event_handler" + job_name.replace(".", "__") + uuid_ 34 | eh_path = "/tmp/" + eh_file_name + ".py" 35 | 36 | # if the file does not already exist, create new file 37 | if not (os.path.isfile(eh_path)): 38 | with open(eh_path, "w") as eh: 39 | eh.write(content_string.decode("utf-8")) 40 | 41 | # import event_handler module 42 | if "/tmp" not in sys.path: 43 | sys.path.insert(0, "/tmp") 44 | mod = importlib.import_module(eh_file_name) 45 | if mod is None: 46 | return (None, None) if rtn_mod_and_class else None 47 | 48 | cls = getattr(mod, key, None) 49 | if rtn_mod_and_class: 50 | return (mod, cls) 51 | 52 | return cls 53 | 54 | def hasattr(self, attr): 55 | module = self.get_module() 56 | return module is not None and hasattr(module, attr) 57 | 58 | def call(self, hdlr, logger, func, *args, **options): 59 | (mod, cls) = self.get_module(rtn_mod_and_class=True) 60 | args = (mod,) + tuple(args) 61 | 62 | if self.hasattr(hdlr): 63 | logger.debug( 64 | "calling [" 65 | + hdlr 66 | + "] in event handler: args = " 67 | + str(args) 68 | + ", options = " 69 | + str(options) 70 | ) 71 | getattr(cls, hdlr)(func, *args, **options) 72 | else: 73 | func(*args, **options) 74 | 75 | # attribute getters 76 | def max_retries(self): 77 | if self.hasattr("max_retries"): 78 | module = self.get_module() 79 | return module.max_retries(module, self.logger, self.meta) 80 | return 0 81 | 82 | def timeout(self): 83 | if self.hasattr("timeout"): 84 | module = self.get_module() 85 | return module.timeout(module, self.logger, self.meta) 86 | return 3600 87 | 88 | def delay(self, retries): 89 | if self.hasattr("delay"): 90 | module = self.get_module() 91 | return module.delay(module, self.logger, self.meta, retries) 92 | return 0 93 | 94 | def operation(self, session, **options): 95 | if self.hasattr("operation"): 96 | return self.get_module().operation(session, self.meta, **options) 97 | 98 | from .utils import Operation 99 | 100 | return Operation.REGISTER_SYNC 101 | # return None 102 | 103 | def to_resource(self, session, **options): 104 | if self.hasattr("to_resource"): 105 | return self.get_module().to_resource(session, self.meta, **options) 106 | return None 107 | 108 | def target_path(self, session, **options): 109 | if self.hasattr("target_path"): 110 | return self.get_module().target_path(session, self.meta, **options) 111 | return None 112 | 113 | def delete_mode(self): 114 | if 
self.hasattr("delete_mode"): 115 | return self.get_module().delete_mode(self.meta) 116 | 117 | from .utils import DeleteMode 118 | 119 | return DeleteMode.DO_NOT_DELETE 120 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irods/irods_capability_automated_ingest/da87d89acab7136ac5610195b0cc5e5abd983795/irods_capability_automated_ingest/examples/__init__.py -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/append.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def operation(session, meta, **options): 8 | return Operation.PUT_APPEND 9 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/append_non_leaf_non_root_with_resc_name.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def to_resource(session, meta, **options): 8 | return "regiResc2" 9 | 10 | @staticmethod 11 | def operation(session, meta, **options): 12 | return Operation.PUT_APPEND 13 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/append_root_with_resc_name.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def to_resource(session, meta, **options): 8 | return "regiResc2Root" 9 | 10 | @staticmethod 11 | def operation(session, meta, **options): 12 | return Operation.PUT_APPEND 13 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/append_with_resc_name.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def to_resource(session, meta, **options): 8 | return "regiResc2a" 9 | 10 | @staticmethod 11 | def operation(session, meta, **options): 12 | return Operation.PUT_APPEND 13 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/coll_create_pre_and_post.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from irods_capability_automated_ingest.core import Core 4 | from irods_capability_automated_ingest.utils import Operation 5 | 6 | OPERATION = Operation.REGISTER_SYNC 7 | 8 | 9 | class event_handler(Core): 10 | @staticmethod 11 | def operation(session, meta, **options): 12 | return OPERATION 13 | 14 | @staticmethod 15 | def pre_coll_create(hdlr_mod, logger, session, meta, *args, **options): 16 | created_collection = meta["target"] 17 | 
parent_of_created_collection = "/".join(created_collection.split("/")[:-1]) 18 | 19 | attribute = "pre_coll_create" 20 | value = created_collection 21 | unit = OPERATION.name 22 | 23 | coll = session.collections.get(parent_of_created_collection) 24 | coll.metadata.add(attribute, value, unit) 25 | 26 | @staticmethod 27 | def post_coll_create(hdlr_mod, logger, session, meta, *args, **options): 28 | created_collection = meta["target"] 29 | parent_of_created_collection = "/".join(created_collection.split("/")[:-1]) 30 | 31 | attribute = "post_coll_create" 32 | value = created_collection 33 | unit = OPERATION.name 34 | 35 | coll = session.collections.get(parent_of_created_collection) 36 | coll.metadata.add(attribute, value, unit) 37 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/coll_modify_pre_and_post.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from irods_capability_automated_ingest.core import Core 4 | from irods_capability_automated_ingest.utils import Operation 5 | 6 | OPERATION = Operation.REGISTER_SYNC 7 | 8 | 9 | class event_handler(Core): 10 | @staticmethod 11 | def operation(session, meta, **options): 12 | return OPERATION 13 | 14 | @staticmethod 15 | def pre_coll_modify(hdlr_mod, logger, session, meta, *args, **options): 16 | modified_collection = meta["target"] 17 | 18 | attribute = "pre_coll_modify" 19 | value = meta["job_name"] 20 | unit = OPERATION.name 21 | 22 | coll = session.collections.get(modified_collection) 23 | coll.metadata.add(attribute, value, unit) 24 | 25 | @staticmethod 26 | def post_coll_modify(hdlr_mod, logger, session, meta, *args, **options): 27 | modified_collection = meta["target"] 28 | 29 | attribute = "post_coll_modify" 30 | value = meta["job_name"] 31 | unit = OPERATION.name 32 | 33 | coll = session.collections.get(modified_collection) 34 | coll.metadata.add(attribute, value, unit) 35 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/data_obj_create_pre_and_post.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from irods_capability_automated_ingest.core import Core 4 | from irods_capability_automated_ingest.utils import Operation 5 | 6 | OPERATION = Operation.REGISTER_SYNC 7 | 8 | 9 | class event_handler(Core): 10 | @staticmethod 11 | def operation(session, meta, **options): 12 | return OPERATION 13 | 14 | @staticmethod 15 | def pre_data_obj_create(hdlr_mod, logger, session, meta, *args, **options): 16 | created_data_object_path = meta["target"] 17 | parent_collection_of_created_data_object = "/".join( 18 | created_data_object_path.split("/")[:-1] 19 | ) 20 | 21 | attribute = "pre_data_obj_create" 22 | value = created_data_object_path 23 | unit = OPERATION.name 24 | 25 | coll = session.collections.get(parent_collection_of_created_data_object) 26 | coll.metadata.add(attribute, value, unit) 27 | 28 | @staticmethod 29 | def post_data_obj_create(hdlr_mod, logger, session, meta, *args, **options): 30 | created_data_object_path = meta["target"] 31 | parent_collection_of_created_data_object = "/".join( 32 | created_data_object_path.split("/")[:-1] 33 | ) 34 | 35 | attribute = "post_data_obj_create" 36 | value = created_data_object_path 37 | unit = OPERATION.name 38 | 39 | coll = session.collections.get(parent_collection_of_created_data_object) 40 | coll.metadata.add(attribute, value, unit) 
41 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/data_obj_modify_pre_and_post.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from irods_capability_automated_ingest.core import Core 4 | from irods_capability_automated_ingest.utils import Operation 5 | 6 | OPERATION = Operation.REGISTER_SYNC 7 | 8 | 9 | class event_handler(Core): 10 | @staticmethod 11 | def operation(session, meta, **options): 12 | return OPERATION 13 | 14 | @staticmethod 15 | def pre_data_obj_modify(hdlr_mod, logger, session, meta, *args, **options): 16 | created_data_object_path = meta["target"] 17 | parent_collection_of_created_data_object = "/".join( 18 | created_data_object_path.split("/")[:-1] 19 | ) 20 | 21 | attribute = "pre_data_obj_modify" 22 | value = created_data_object_path 23 | unit = OPERATION.name 24 | 25 | coll = session.collections.get(parent_collection_of_created_data_object) 26 | coll.metadata.add(attribute, value, unit) 27 | 28 | @staticmethod 29 | def post_data_obj_modify(hdlr_mod, logger, session, meta, *args, **options): 30 | created_data_object_path = meta["target"] 31 | parent_collection_of_created_data_object = "/".join( 32 | created_data_object_path.split("/")[:-1] 33 | ) 34 | 35 | attribute = "post_data_obj_modify" 36 | value = created_data_object_path 37 | unit = OPERATION.name 38 | 39 | coll = session.collections.get(parent_collection_of_created_data_object) 40 | coll.metadata.add(attribute, value, unit) 41 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/metadata.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | from irods.meta import iRODSMeta 4 | import os 5 | 6 | filesystem_mode = "filesystem::mode" 7 | 8 | 9 | class event_handler(Core): 10 | @staticmethod 11 | def post_data_obj_create(hdlr_mod, logger, session, meta, **options): 12 | target = meta["target"] 13 | path = meta["path"] 14 | s = os.stat(path) 15 | mode = s.st_mode 16 | 17 | obj = session.data_objects.get(target) 18 | obj.metadata.add(filesystem_mode, str(mode), "") 19 | 20 | @staticmethod 21 | def post_data_obj_modify(hdlr_mod, logger, session, meta, **options): 22 | target = meta["target"] 23 | path = meta["path"] 24 | s = os.stat(path) 25 | mode = s.st_mode 26 | obj = session.data_objects.get(target) 27 | obj.metadata[filesystem_mode] = iRODSMeta(filesystem_mode, str(mode)) 28 | 29 | @staticmethod 30 | def operation(session, meta, **options): 31 | return Operation.REGISTER_SYNC 32 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/no_op.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def operation(session, meta, **options): 8 | return Operation.NO_OP 9 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/no_retry.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from 
irods_capability_automated_ingest.utils import Operation 3 | from irods_capability_automated_ingest.redis_utils import get_redis 4 | 5 | 6 | class event_handler(Core): 7 | @staticmethod 8 | def operation(session, meta, **options): 9 | return Operation.NO_OP 10 | 11 | @staticmethod 12 | def pre_data_obj_create(hdlr_mod, logger, session, meta, *args, **options): 13 | target = meta["target"] 14 | path = meta["path"] 15 | 16 | r = get_redis(meta["config"]) 17 | failures = r.get("failures:" + path) 18 | if failures is None: 19 | failures = 0 20 | 21 | r.incr("failures:" + path) 22 | 23 | if failures == 0: 24 | raise RuntimeError("no failures") 25 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/post_job.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def operation(session, meta, **options): 8 | return Operation.NO_OP 9 | 10 | @staticmethod 11 | def post_job(hdlr_mod, logger, meta): 12 | # Amend here for testing so that we can ensure that post_job executes once per job. 13 | with open("/tmp/a", "a") as f: 14 | f.write("post_job") 15 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/pre_job.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def operation(session, meta, **options): 8 | return Operation.NO_OP 9 | 10 | @staticmethod 11 | def pre_job(hdlr_mod, logger, meta): 12 | with open("/tmp/a", "w") as f: 13 | f.write("pre_job") 14 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/put.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def operation(session, meta, **options): 8 | return Operation.PUT 9 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/put_non_leaf_non_root_with_resc_name.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def to_resource(session, meta, **options): 8 | return "regiResc2" 9 | 10 | @staticmethod 11 | def operation(session, meta, **options): 12 | return Operation.PUT 13 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/put_root_with_resc_name.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def to_resource(session, meta, **options): 8 | return "regiResc2Root" 9 | 10 | @staticmethod 11 | def 
operation(session, meta, **options): 12 | return Operation.PUT 13 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/put_using_char_map.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | import re 4 | 5 | 6 | class event_handler(Core): 7 | re_non_alphanum = re.compile("[^a-zA-Z0-9]") 8 | 9 | @staticmethod 10 | def character_map(): 11 | return { 12 | event_handler.re_non_alphanum: "_" 13 | } # map any non-ascii or non-alphanumeric 14 | # character to '_' 15 | 16 | @staticmethod 17 | def operation(session, meta, **options): 18 | return Operation.PUT 19 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/put_with_resc_name.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def to_resource(session, meta, **options): 8 | return "regiResc2a" 9 | 10 | @staticmethod 11 | def operation(session, meta, **options): 12 | return Operation.PUT 13 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/register.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def operation(session, meta, **options): 8 | return Operation.REGISTER_SYNC 9 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/register_non_leaf_non_root_with_resc_name.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def to_resource(session, meta, **options): 8 | return "regiResc2" 9 | 10 | @staticmethod 11 | def operation(session, meta, **options): 12 | return Operation.REGISTER_SYNC 13 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/register_root_with_resc_name.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def to_resource(session, meta, **options): 8 | return "regiResc2Root" 9 | 10 | @staticmethod 11 | def operation(session, meta, **options): 12 | return Operation.REGISTER_SYNC 13 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/register_using_char_map.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | import re 4 | 5 | 6 | class event_handler(Core): 7 | re_non_alphanum = re.compile("[^a-zA-Z0-9]") 8 | 
9 | @staticmethod 10 | def character_map(): 11 | return { 12 | event_handler.re_non_alphanum: "_" 13 | } # map any non-ascii or non-alphanumeric 14 | # character to '_' 15 | 16 | @staticmethod 17 | def operation(session, meta, **options): 18 | return Operation.REGISTER_SYNC 19 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/register_with_peps.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def operation(session, meta, **options): 8 | return Operation.REGISTER_SYNC 9 | 10 | @staticmethod 11 | def pre_data_obj_create(hdlr_mod, logger, session, meta, *args, **options): 12 | logical_path = meta["target"] 13 | logger.info("pre_data_obj_create:[" + logical_path + "]") 14 | 15 | @staticmethod 16 | def post_data_obj_create(hdlr_mod, logger, session, meta, *args, **options): 17 | logical_path = meta["target"] 18 | logger.info("post_data_obj_create:[" + logical_path + "]") 19 | 20 | @staticmethod 21 | def pre_coll_create(hdlr_mod, logger, session, meta, *args, **options): 22 | logical_path = meta["target"] 23 | logger.info("pre_coll_create:[" + logical_path + "]") 24 | 25 | @staticmethod 26 | def post_coll_create(hdlr_mod, logger, session, meta, *args, **options): 27 | logical_path = meta["target"] 28 | logger.info("post_coll_create:[" + logical_path + "]") 29 | 30 | @staticmethod 31 | def pre_data_obj_modify(hdlr_mod, logger, session, meta, *args, **options): 32 | logical_path = meta["target"] 33 | logger.info("pre_data_obj_modify:[" + logical_path + "]") 34 | 35 | @staticmethod 36 | def post_data_obj_modify(hdlr_mod, logger, session, meta, *args, **options): 37 | logical_path = meta["target"] 38 | logger.info("post_data_obj_modify:[" + logical_path + "]") 39 | 40 | @staticmethod 41 | def pre_coll_modify(hdlr_mod, logger, session, meta, *args, **options): 42 | logical_path = meta["target"] 43 | logger.info("pre_coll_modify:[" + logical_path + "]") 44 | 45 | @staticmethod 46 | def post_coll_modify(hdlr_mod, logger, session, meta, *args, **options): 47 | logical_path = meta["target"] 48 | logger.info("post_coll_modify:[" + logical_path + "]") 49 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/register_with_resc_name.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def to_resource(session, meta, **options): 8 | return "regiResc2a" 9 | 10 | @staticmethod 11 | def operation(session, meta, **options): 12 | return Operation.REGISTER_SYNC 13 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/replica_root_with_resc_name.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def operation(session, meta, **options): 8 | return Operation.REGISTER_AS_REPLICA_SYNC 9 | 10 | @staticmethod 11 | def to_resource(session, meta, 
**options): 12 | return "regiResc2Root" 13 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/replica_with_non_leaf_non_root_resc_name.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def operation(session, meta, **options): 8 | return Operation.REGISTER_AS_REPLICA_SYNC 9 | 10 | @staticmethod 11 | def to_resource(session, meta, **options): 12 | return "regiResc2" 13 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/replica_with_resc_name.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def operation(session, meta, **options): 8 | return Operation.REGISTER_AS_REPLICA_SYNC 9 | 10 | @staticmethod 11 | def to_resource(session, meta, **options): 12 | return "regiResc2a" 13 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/retry.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | from irods_capability_automated_ingest.redis_utils import get_redis 4 | 5 | 6 | class event_handler(Core): 7 | @staticmethod 8 | def max_retries(hdlr_mod, logger, meta): 9 | return 3 10 | 11 | @staticmethod 12 | def delay(hdlr_mod, logger, meta, retries): 13 | return 0 14 | 15 | @staticmethod 16 | def operation(session, meta, **options): 17 | return Operation.NO_OP 18 | 19 | @staticmethod 20 | def pre_data_obj_create(hdlr_mod, logger, session, meta, *args, **options): 21 | path = meta["path"] 22 | target = meta["target"] 23 | 24 | r = get_redis(meta["config"]) 25 | failures = r.get("failures:" + path) 26 | if failures is None: 27 | failures = 0 28 | 29 | r.incr("failures:" + path) 30 | 31 | if failures == 0: 32 | raise RuntimeError("no failures") 33 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/statistics.py: -------------------------------------------------------------------------------- 1 | from .. 
import redis_key 2 | from ..redis_utils import get_redis 3 | from irods_capability_automated_ingest.core import Core 4 | from irods_capability_automated_ingest.utils import Operation 5 | 6 | import time 7 | 8 | 9 | class event_handler(Core): 10 | @staticmethod 11 | def operation(session, meta, **options): 12 | return Operation.NO_OP 13 | 14 | @staticmethod 15 | def pre_job(hdlr_mod, logger, meta): 16 | job_name = meta["job_name"] 17 | config = meta["config"] 18 | r = get_redis(config) 19 | 20 | t0 = time.time() 21 | t0_key_handle = redis_key.redis_key_handle(r, "t0", job_name) 22 | t0_key_handle.set_value(t0) 23 | 24 | @staticmethod 25 | def post_job(hdlr_mod, logger, meta): 26 | job_name = meta["job_name"] 27 | config = meta["config"] 28 | t1 = time.time() 29 | r = get_redis(config) 30 | t0_key_handle = redis_key.redis_key_handle(r, "t0", job_name) 31 | t0 = t0_key_handle.get_value() 32 | t0_key_handle.reset() 33 | failures = redis_key.failures_key_handle(r, job_name) 34 | retries = redis_key.retries_key_handle(r, job_name) 35 | logger.info( 36 | "post_job", 37 | job_name=job_name, 38 | failures=failures.get_value(), 39 | retries=retries.get_value(), 40 | time_elasped=t1 - t0, 41 | ) 42 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/sync.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def operation(session, meta, **options): 8 | return Operation.PUT_SYNC 9 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/sync_non_leaf_non_root_with_resc_name.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def to_resource(session, meta, **options): 8 | return "regiResc2" 9 | 10 | @staticmethod 11 | def operation(session, meta, **options): 12 | return Operation.PUT_SYNC 13 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/sync_retry.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def max_retries(hdlr_mod, logger, meta): 8 | return 3 9 | 10 | @staticmethod 11 | def operation(session, meta, **options): 12 | return Operation.PUT_SYNC 13 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/sync_root_with_resc_name.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def to_resource(session, meta, **options): 8 | return "regiResc2Root" 9 | 10 | @staticmethod 11 | def operation(session, meta, **options): 12 | return Operation.PUT_SYNC 13 | -------------------------------------------------------------------------------- 
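The example handlers in this directory each exercise a single hook. As an illustrative sketch only (not one of the repository's example files), a handler could also opt in to the delete behaviour by pairing `operation` with the `delete_mode(meta)` method that `custom_event_handler.delete_mode()` invokes earlier in this package; the choice of `DeleteMode.UNREGISTER` below is an assumption for demonstration, not a recommended default:

    from irods_capability_automated_ingest.core import Core
    from irods_capability_automated_ingest.utils import DeleteMode, Operation


    class event_handler(Core):
        @staticmethod
        def operation(session, meta, **options):
            return Operation.REGISTER_SYNC

        @staticmethod
        def delete_mode(meta):
            # Illustrative choice: unregister catalog entries whose source no longer exists.
            # DO_NOT_DELETE (the default), TRASH, and NO_TRASH are the other values
            # dispatched by get_delete_function() in irods/irods_utils.py.
            return DeleteMode.UNREGISTER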
/irods_capability_automated_ingest/examples/sync_with_resc_name.py: -------------------------------------------------------------------------------- 1 | from irods_capability_automated_ingest.core import Core 2 | from irods_capability_automated_ingest.utils import Operation 3 | 4 | 5 | class event_handler(Core): 6 | @staticmethod 7 | def to_resource(session, meta, **options): 8 | return "regiResc2a" 9 | 10 | @staticmethod 11 | def operation(session, meta, **options): 12 | return Operation.PUT_SYNC 13 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/examples/timeout.py: -------------------------------------------------------------------------------- 1 | import time 2 | from irods_capability_automated_ingest.core import Core 3 | from irods_capability_automated_ingest.utils import Operation 4 | 5 | 6 | class event_handler(Core): 7 | @staticmethod 8 | def operation(session, meta, **options): 9 | return Operation.NO_OP 10 | 11 | @staticmethod 12 | def timeout(hdlr_mod, logger, meta): 13 | return 1 14 | 15 | @staticmethod 16 | def pre_data_obj_create(hdlr_mod, logger, session, meta, *args, **options): 17 | time.sleep(2) 18 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/irods/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irods/irods_capability_automated_ingest/da87d89acab7136ac5610195b0cc5e5abd983795/irods_capability_automated_ingest/irods/__init__.py -------------------------------------------------------------------------------- /irods_capability_automated_ingest/irods/filesystem.py: -------------------------------------------------------------------------------- 1 | from . import irods_utils 2 | from .. 
import custom_event_handler 3 | from ..utils import Operation 4 | 5 | from irods.models import Resource, DataObject, Collection 6 | import irods.keywords as kw 7 | 8 | import base64 9 | import os 10 | 11 | 12 | def append_to_data_object( 13 | session, logger, source_physical_path, destination_logical_path, **options 14 | ): 15 | BUFFER_SIZE = 1024 16 | logger.info( 17 | f"appending object {source_physical_path} from local filesystem, options = {options}" 18 | ) 19 | tsize = irods_utils.size(session, destination_logical_path) 20 | destination_fd = session.data_objects.open(destination_logical_path, "a", **options) 21 | destination_fd.seek(tsize) 22 | with open(source_physical_path, "rb") as source_fd: 23 | source_fd.seek(tsize) 24 | while True: 25 | buf = source_fd.read(BUFFER_SIZE) 26 | if buf == b"": 27 | break 28 | destination_fd.write(buf) 29 | destination_fd.close() 30 | logger.info("succeeded", task="irods_FSappend_file", path=source_physical_path) 31 | 32 | 33 | def register_file(hdlr_mod, logger, session, meta, **options): 34 | dest_dataobj_logical_fullpath = meta["target"] 35 | source_physical_fullpath = meta["path"] 36 | b64_path_str = meta.get("b64_path_str") 37 | 38 | event_handler = custom_event_handler.custom_event_handler(meta) 39 | if event_handler is None: 40 | phypath_to_register_in_catalog = None 41 | else: 42 | phypath_to_register_in_catalog = event_handler.target_path(session, **options) 43 | if phypath_to_register_in_catalog is None: 44 | if b64_path_str is not None and "unicode_error_filename" in meta: 45 | phypath_to_register_in_catalog = os.path.join( 46 | source_physical_fullpath, meta["unicode_error_filename"] 47 | ) 48 | else: 49 | phypath_to_register_in_catalog = source_physical_fullpath 50 | 51 | resc_name = event_handler.to_resource(session, **options) 52 | if resc_name is not None: 53 | options["destRescName"] = resc_name 54 | 55 | if b64_path_str is not None: 56 | source_physical_fullpath = base64.b64decode(b64_path_str) 57 | 58 | options[kw.DATA_SIZE_KW] = str(meta["size"]) 59 | options[kw.DATA_MODIFY_KW] = str(int(meta["mtime"])) 60 | 61 | logger.info( 62 | "registering object {}, source_physical_fullpath: {}, options = {}".format( 63 | dest_dataobj_logical_fullpath, source_physical_fullpath, options 64 | ) 65 | ) 66 | session.data_objects.register( 67 | phypath_to_register_in_catalog, dest_dataobj_logical_fullpath, **options 68 | ) 69 | 70 | logger.info("succeeded", task="irods_register_file", path=source_physical_fullpath) 71 | 72 | irods_utils.annotate_metadata_for_special_data_objs( 73 | meta, session, source_physical_fullpath, dest_dataobj_logical_fullpath 74 | ) 75 | 76 | 77 | def upload_file(hdlr_mod, logger, session, meta, op, **options): 78 | """ 79 | Function called from sync_irods.sync_file and sync_irods.upload_file for local files 80 | 81 | If called from sync_irods.sync_file, determines if it should be an append operation, or a put. 
82 | If called from sync_irods.upload_file, simply puts the file into iRODS 83 | 84 | op: the type of operation (one of Operation.PUT, Operation.PUT_APPEND, Operation.PUT_SYNC) 85 | """ 86 | # TODO: Check for op here 87 | 88 | dest_dataobj_logical_fullpath = meta["target"] 89 | source_physical_fullpath = meta["path"] 90 | b64_path_str = meta.get("b64_path_str") 91 | event_handler = custom_event_handler.custom_event_handler(meta) 92 | resc_name = event_handler.to_resource(session, **options) 93 | if resc_name is not None: 94 | options["destRescName"] = resc_name 95 | 96 | if b64_path_str is not None: 97 | source_physical_fullpath = base64.b64decode(b64_path_str) 98 | 99 | dest_dataobj_logical_fullpath = meta["target"] 100 | source_physical_fullpath = meta["path"] 101 | b64_path_str = meta.get("b64_path_str") 102 | if b64_path_str is not None: 103 | source_physical_fullpath = base64.b64decode(b64_path_str) 104 | 105 | logger.info( 106 | f"uploading object {source_physical_fullpath} from local filesystem, options = {options}" 107 | ) 108 | session.data_objects.put( 109 | source_physical_fullpath, dest_dataobj_logical_fullpath, **options 110 | ) 111 | logger.info("succeeded", task="irods_FSupload_file", path=source_physical_fullpath) 112 | 113 | irods_utils.annotate_metadata_for_special_data_objs( 114 | meta, session, source_physical_fullpath, dest_dataobj_logical_fullpath 115 | ) 116 | 117 | 118 | def no_op(hdlr_mod, logger, session, meta, **options): 119 | pass 120 | 121 | 122 | def sync_file(hdlr_mod, logger, session, meta, op, **options): 123 | dest_dataobj_logical_fullpath = meta["target"] 124 | source_physical_fullpath = meta["path"] 125 | b64_path_str = meta.get("b64_path_str") 126 | 127 | event_handler = custom_event_handler.custom_event_handler(meta) 128 | resc_name = event_handler.to_resource(session, **options) 129 | if resc_name is not None: 130 | options["destRescName"] = resc_name 131 | 132 | if b64_path_str is not None: 133 | source_physical_fullpath = base64.b64decode(b64_path_str) 134 | 135 | logger.info( 136 | "syncing object %s, options = %s" % (dest_dataobj_logical_fullpath, options) 137 | ) 138 | 139 | # TODO: Issue #208 - This is incorrect -- where is the register function ?? 140 | # Investigate behavior of sync_file when op is None 141 | if op is None: 142 | op = Operation.REGISTER_SYNC 143 | 144 | # PUT_APPEND with existing file. sync_file assumes the file already exists. 
145 | if op == Operation.PUT_APPEND: 146 | append_to_data_object( 147 | session, 148 | logger, 149 | source_physical_fullpath, 150 | dest_dataobj_logical_fullpath, 151 | **options, 152 | ) 153 | 154 | # If data object doesn't exist, just do a standard put 155 | # If data object does exist, we know op=PUT_SYNC, and we re-copy whole file again, so it is fine also 156 | else: 157 | logger.info( 158 | f"uploading object {source_physical_fullpath} from local filesystem, options = {options}" 159 | ) 160 | session.data_objects.put( 161 | source_physical_fullpath, dest_dataobj_logical_fullpath, **options 162 | ) 163 | logger.info( 164 | "succeeded", task="irods_FSupload_file", path=source_physical_fullpath 165 | ) 166 | 167 | 168 | def update_metadata(hdlr_mod, logger, session, meta, **options): 169 | dest_dataobj_logical_fullpath = meta["target"] 170 | source_physical_fullpath = meta["path"] 171 | event_handler = custom_event_handler.custom_event_handler(meta) 172 | phypath_to_register_in_catalog = event_handler.target_path(session, **options) 173 | b64_path_str = meta.get("b64_path_str") 174 | if phypath_to_register_in_catalog is None: 175 | if b64_path_str is not None and "unicode_error_filename" in meta: 176 | # Append generated filename to truncated fullpath because it failed to encode 177 | # TODO(#250): This will not work on Windows. 178 | phypath_to_register_in_catalog = os.path.join( 179 | source_physical_fullpath, meta["unicode_error_filename"] 180 | ) 181 | else: 182 | phypath_to_register_in_catalog = source_physical_fullpath 183 | 184 | if b64_path_str is not None: 185 | source_physical_fullpath = base64.b64decode(b64_path_str) 186 | 187 | size = int(meta["size"]) 188 | mtime = int(meta["mtime"]) 189 | logger.info( 190 | f"updating object: {dest_dataobj_logical_fullpath}, options = {options}" 191 | ) 192 | 193 | data_obj_info = {"objPath": dest_dataobj_logical_fullpath} 194 | 195 | outdated_repl_nums = [] 196 | found = False 197 | 198 | resc_name = event_handler.to_resource(session, **options) 199 | if resc_name is None: 200 | found = True 201 | else: 202 | for row in session.query( 203 | Resource.name, DataObject.path, DataObject.replica_number 204 | ).filter( 205 | # TODO(#250): This will not work on Windows. 
206 | DataObject.name == os.path.basename(dest_dataobj_logical_fullpath), 207 | Collection.name == os.path.dirname(dest_dataobj_logical_fullpath), 208 | ): 209 | if row[DataObject.path] == phypath_to_register_in_catalog: 210 | if irods_utils.child_of(session, row[Resource.name], resc_name): 211 | found = True 212 | repl_num = row[DataObject.replica_number] 213 | data_obj_info["replNum"] = repl_num 214 | continue 215 | 216 | if not found: 217 | if b64_path_str is not None: 218 | logger.error( 219 | "updating object: wrong resource or path, " 220 | "dest_dataobj_logical_fullpath = {}, phypath_to_register_in_catalog = {}, options = {}".format( 221 | dest_dataobj_logical_fullpath, 222 | phypath_to_register_in_catalog, 223 | str(options), 224 | ) 225 | ) 226 | else: 227 | logger.error( 228 | "updating object: wrong resource or path, " 229 | "dest_dataobj_logical_fullpath = {}, source_physical_fullpath = {}, " 230 | "phypath_to_register_in_catalog = {}, options = {}".format( 231 | dest_dataobj_logical_fullpath, 232 | source_physical_fullpath, 233 | phypath_to_register_in_catalog, 234 | str(options), 235 | ) 236 | ) 237 | raise Exception("wrong resource or path") 238 | 239 | session.data_objects.modDataObjMeta( 240 | data_obj_info, 241 | {"dataSize": size, "dataModify": mtime, "allReplStatus": 1}, 242 | **options, 243 | ) 244 | 245 | if b64_path_str is not None: 246 | logger.info( 247 | "succeeded", 248 | task="irods_update_metadata", 249 | path=phypath_to_register_in_catalog, 250 | ) 251 | else: 252 | logger.info( 253 | "succeeded", task="irods_update_metadata", path=source_physical_fullpath 254 | ) 255 | 256 | 257 | def sync_file_meta(hdlr_mod, logger, session, meta, **options): 258 | pass 259 | 260 | 261 | def sync_data_from_file(hdlr_mod, meta, logger, content, **options): 262 | target = meta["target"] 263 | path = meta["path"] 264 | 265 | event_handler = custom_event_handler.custom_event_handler(meta) 266 | session = irods_utils.irods_session( 267 | event_handler.get_module(), meta, logger, **options 268 | ) 269 | 270 | if meta.get("initial_ingest"): 271 | # If the initial_ingest option has been specified, no checking is done for the existence 272 | # of the logical path for performance reasons. If the option is specified and the logical 273 | # path exists, an error will occur; this behavior is expected. 274 | exists = False 275 | else: 276 | exists = session.data_objects.exists(target) 277 | if not exists and session.collections.exists(target): 278 | raise Exception(f"sync: cannot sync file {path} to collection {target}") 279 | 280 | op = event_handler.operation(session, **options) 281 | 282 | if op == Operation.NO_OP: 283 | if not exists: 284 | event_handler.call( 285 | "on_data_obj_create", logger, no_op, logger, session, meta, **options 286 | ) 287 | else: 288 | event_handler.call( 289 | "on_data_obj_modify", logger, no_op, logger, session, meta, **options 290 | ) 291 | else: 292 | # allow_redirect will cause PRC to establish a direct connection between the client and the server hosting the 293 | # resource to which the data is being uploaded. This can cause problems if the hostnames being used in the 294 | # client environment and the hostname used for the "location" of the resource differ despite referring to the 295 | # same host. As such, we set the allow_redirect option to False in order to prevent this redirect. 
296 | options["allow_redirect"] = False 297 | 298 | createRepl = False 299 | if op is None: 300 | op = Operation.REGISTER_SYNC 301 | elif exists and op == Operation.REGISTER_AS_REPLICA_SYNC: 302 | resc_name = event_handler.to_resource(session, **options) 303 | if resc_name is None: 304 | raise Exception("no resource name defined") 305 | 306 | found = False 307 | foundPath = False 308 | for replica in session.data_objects.get(target).replicas: 309 | if irods_utils.child_of(session, replica.resource_name, resc_name): 310 | found = True 311 | if replica.path == path: 312 | foundPath = True 313 | if not found: 314 | createRepl = True 315 | elif not foundPath: 316 | raise Exception( 317 | f"Data object [{target}] has at least one replica under resource [{resc_name}], but none of the replicas have physical paths which match [{path}]." 318 | ) 319 | 320 | put = op in [Operation.PUT, Operation.PUT_SYNC, Operation.PUT_APPEND] 321 | 322 | if not exists: 323 | meta2 = meta.copy() 324 | # TODO(#250): This will not work on Windows. 325 | meta2["target"] = os.path.dirname(target) 326 | if "b64_path_str" not in meta2: 327 | # TODO: This will not work on Windows. 328 | meta2["path"] = os.path.dirname(path) 329 | irods_utils.create_dirs(logger, session, meta2, **options) 330 | if put: 331 | event_handler.call( 332 | "on_data_obj_create", 333 | logger, 334 | upload_file, 335 | logger, 336 | session, 337 | meta, 338 | op, 339 | **options, 340 | ) 341 | else: 342 | event_handler.call( 343 | "on_data_obj_create", 344 | logger, 345 | register_file, 346 | logger, 347 | session, 348 | meta, 349 | **options, 350 | ) 351 | elif createRepl: 352 | options["regRepl"] = "" 353 | 354 | event_handler.call( 355 | "on_data_obj_create", 356 | logger, 357 | register_file, 358 | logger, 359 | session, 360 | meta, 361 | **options, 362 | ) 363 | elif content: 364 | if put: 365 | if Operation.PUT == op: 366 | logger.debug( 367 | f"PUT operation will ignore existing data object [{meta['target']}]" 368 | ) 369 | else: 370 | # PUT_SYNC and PUT_APPEND sync data on existing data objects. 371 | event_handler.call( 372 | "on_data_obj_modify", 373 | logger, 374 | sync_file, 375 | logger, 376 | session, 377 | meta, 378 | op, 379 | **options, 380 | ) 381 | else: 382 | event_handler.call( 383 | "on_data_obj_modify", 384 | logger, 385 | update_metadata, 386 | logger, 387 | session, 388 | meta, 389 | **options, 390 | ) 391 | else: 392 | event_handler.call( 393 | "on_data_obj_modify", 394 | logger, 395 | sync_file_meta, 396 | logger, 397 | session, 398 | meta, 399 | **options, 400 | ) 401 | 402 | irods_utils.start_timer() 403 | 404 | 405 | def sync_metadata_from_file(hdlr_mod, meta, logger, **options): 406 | sync_data_from_file(hdlr_mod, meta, logger, False, **options) 407 | 408 | 409 | def sync_dir_meta(hdlr_mod, logger, session, meta, **options): 410 | pass 411 | 412 | 413 | def sync_data_from_dir(hdlr_mod, meta, logger, content, **options): 414 | target = meta["target"] 415 | path = meta["path"] 416 | 417 | event_handler = custom_event_handler.custom_event_handler(meta) 418 | session = irods_utils.irods_session( 419 | event_handler.get_module(), meta, logger, **options 420 | ) 421 | exists = session.collections.exists(target) 422 | 423 | # TODO(#208): Should we default to REGISTER_SYNC? 
424 | op = event_handler.operation(session, **options) or Operation.REGISTER_SYNC 425 | if op == Operation.NO_OP: 426 | operation_name = "on_coll_modify" if exists else "on_coll_create" 427 | event_handler.call( 428 | operation_name, logger, no_op, logger, session, meta, **options 429 | ) 430 | else: 431 | if not exists: 432 | irods_utils.create_dirs(logger, session, meta, **options) 433 | else: 434 | event_handler.call( 435 | "on_coll_modify", 436 | logger, 437 | sync_dir_meta, 438 | logger, 439 | session, 440 | meta, 441 | **options, 442 | ) 443 | irods_utils.start_timer() 444 | 445 | 446 | def sync_metadata_from_dir(hdlr_mod, meta, logger, **options): 447 | sync_data_from_dir(hdlr_mod, meta, logger, False, **options) 448 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/irods/irods_utils.py: -------------------------------------------------------------------------------- 1 | from .. import custom_event_handler, sync_logging 2 | from ..redis_utils import get_redis 3 | from ..utils import DeleteMode, Operation 4 | 5 | from irods.exception import CollectionDoesNotExist, NetworkException 6 | from irods.models import Collection, DataObject, Resource 7 | from irods.session import iRODSSession 8 | 9 | import base64 10 | import json 11 | import os 12 | import redis_lock 13 | import ssl 14 | import threading 15 | 16 | irods_session_map = {} 17 | irods_session_timer_map = {} 18 | 19 | 20 | class disconnect_timer(object): 21 | def __init__(self, logger, interval, sess_map): 22 | self.logger = logger 23 | self.interval = interval 24 | self.timer = None 25 | self.sess_map = sess_map 26 | 27 | def callback(self): 28 | for k, v in self.sess_map.items(): 29 | self.logger.info("Cleaning up session [" + k + "]") 30 | v.cleanup() 31 | self.sess_map.clear() 32 | 33 | def cancel(self): 34 | if self.timer is not None: 35 | self.timer.cancel() 36 | 37 | def start(self): 38 | self.timer = threading.Timer(self.interval, self.callback) 39 | self.timer.start() 40 | 41 | 42 | def stop_timer(): 43 | for k, v in irods_session_timer_map.items(): 44 | v.cancel() 45 | 46 | 47 | def start_timer(): 48 | for k, v in irods_session_timer_map.items(): 49 | v.start() 50 | 51 | 52 | def irods_session(handler_module, meta, logger, **options): 53 | env_irods_host = os.environ.get("IRODS_HOST") 54 | env_irods_port = os.environ.get("IRODS_PORT") 55 | env_irods_user_name = os.environ.get("IRODS_USER_NAME") 56 | env_irods_zone_name = os.environ.get("IRODS_ZONE_NAME") 57 | env_irods_password = os.environ.get("IRODS_PASSWORD") 58 | 59 | env_file = os.environ.get("IRODS_ENVIRONMENT_FILE") 60 | 61 | kwargs = {} 62 | if all( 63 | [ 64 | env_irods_host, 65 | env_irods_port, 66 | env_irods_user_name, 67 | env_irods_zone_name, 68 | env_irods_password, 69 | ] 70 | ): 71 | kwargs["host"] = env_irods_host 72 | kwargs["port"] = env_irods_port 73 | kwargs["user"] = env_irods_user_name 74 | kwargs["zone"] = env_irods_zone_name 75 | kwargs["password"] = env_irods_password 76 | else: 77 | if not env_file: 78 | # TODO(#250): This will not work on Windows. 
79 | env_file = os.path.expanduser("~/.irods/irods_environment.json") 80 | 81 | kwargs["irods_env_file"] = env_file 82 | 83 | if hasattr(handler_module, "as_user"): 84 | client_zone, client_user = handler_module.as_user(meta, **options) 85 | kwargs["client_user"] = client_user 86 | kwargs["client_zone"] = client_zone 87 | 88 | key = json.dumps(kwargs) # todo add timestamp of env file to key 89 | 90 | if env_file: 91 | if not os.path.exists(env_file): 92 | raise FileNotFoundError( 93 | f"Specified iRODS client environment file [{env_file}] does not exist." 94 | ) 95 | 96 | with open(env_file) as irods_env: 97 | irods_env_as_json = json.load(irods_env) 98 | verify_server = irods_env_as_json.get("irods_ssl_verify_server") 99 | ca_file = irods_env_as_json.get("irods_ssl_ca_certificate_file") 100 | if verify_server and verify_server != "none" and ca_file: 101 | kwargs["ssl_context"] = ssl.create_default_context( 102 | purpose=ssl.Purpose.SERVER_AUTH, 103 | cafile=ca_file, 104 | capath=None, 105 | cadata=None, 106 | ) 107 | 108 | if key in irods_session_map: 109 | sess = irods_session_map.get(key) 110 | else: 111 | # TODO: #42 - pull out 10 into configuration 112 | for i in range(10): 113 | try: 114 | sess = iRODSSession(**kwargs) 115 | irods_session_map[key] = sess 116 | break 117 | except NetworkException: 118 | time.sleep(0.1) 119 | 120 | # =-=-=-=-=-=-=- 121 | # disconnect timer 122 | if key in irods_session_timer_map: 123 | timer = irods_session_timer_map[key] 124 | timer.cancel() 125 | irods_session_timer_map.pop(key, None) 126 | idle_sec = meta["idle_disconnect_seconds"] 127 | logger.info("iRODS Idle Time set to: " + str(idle_sec)) 128 | 129 | timer = disconnect_timer(logger, idle_sec, irods_session_map) 130 | irods_session_timer_map[key] = timer 131 | # =-=-=-=-=-=-=- 132 | 133 | return sess 134 | 135 | 136 | def validate_target_collection(meta, logger): 137 | # root cannot be the target collection 138 | destination_collection_logical_path = meta["target"] 139 | if destination_collection_logical_path == "/": 140 | raise Exception("Root may only contain collections which represent zones") 141 | 142 | 143 | def child_of(session, child_resc_name, resc_name): 144 | if child_resc_name == resc_name: 145 | return True 146 | else: 147 | while True: 148 | child_resc = session.resources.get(child_resc_name) 149 | parent_resc_id = child_resc.parent 150 | if parent_resc_id is None: 151 | break 152 | 153 | parent_resc_name = None 154 | for row in session.query(Resource.name).filter( 155 | Resource.id == parent_resc_id 156 | ): 157 | parent_resc_name = row[Resource.name] 158 | if parent_resc_name == resc_name: 159 | return True 160 | child_resc_name = parent_resc_name 161 | return False 162 | 163 | 164 | def create_dirs(logger, session, meta, **options): 165 | target = meta["target"] 166 | path = meta["path"] 167 | config = meta["config"] 168 | event_handler = custom_event_handler.custom_event_handler(meta) 169 | if target.startswith("/"): 170 | r = get_redis(config) 171 | if not session.collections.exists(target): 172 | with redis_lock.Lock(r, "create_dirs:" + target): 173 | if not session.collections.exists(target): 174 | meta2 = meta.copy() 175 | # TODO(#250): This will not work on Windows. 176 | meta2["target"] = os.path.dirname(target) 177 | meta2["path"] = os.path.dirname(path) 178 | # TODO: Does this need to happen after the create call? 
179 | create_dirs(logger, session, meta2, **options) 180 | 181 | event_handler.call( 182 | "on_coll_create", 183 | logger, 184 | create_dir, 185 | logger, 186 | session, 187 | meta, 188 | **options, 189 | ) 190 | else: 191 | raise Exception( 192 | "create_dirs: relative path; target:[" + target + "]; path:[" + path + "]" 193 | ) 194 | 195 | 196 | def create_dir(hdlr_mod, logger, session, meta, **options): 197 | target = meta["target"] 198 | path = meta["path"] 199 | logger.info("creating collection " + target) 200 | session.collections.create(target) 201 | 202 | 203 | def annotate_metadata_for_special_data_objs( 204 | meta, session, source_physical_fullpath, dest_dataobj_logical_fullpath 205 | ): 206 | def add_metadata_if_not_present(obj, key, val, unit=None): 207 | # TODO: If updating/syncing link items, we might want to update the readlink result... 208 | if key not in obj.metadata.keys(): 209 | obj.metadata.add(key, val, unit) 210 | 211 | b64_path_str = meta.get("b64_path_str") or meta.get("b64_path_str_charmap") 212 | if b64_path_str is not None: 213 | b64_reason = meta.get("b64_reason") 214 | if b64_reason in ("UnicodeEncodeError", "character_map"): 215 | add_metadata_if_not_present( 216 | session.data_objects.get(dest_dataobj_logical_fullpath), 217 | "irods::automated_ingest::{}".format(b64_reason), 218 | b64_path_str, 219 | "python3.base64.b64encode(full_path_of_source_file)", 220 | ) 221 | 222 | if meta["is_socket"]: 223 | add_metadata_if_not_present( 224 | session.data_objects.get(dest_dataobj_logical_fullpath), 225 | "socket_target", 226 | "socket", 227 | "automated_ingest", 228 | ) 229 | elif meta["is_link"]: 230 | add_metadata_if_not_present( 231 | session.data_objects.get(dest_dataobj_logical_fullpath), 232 | "link_target", 233 | os.path.join( 234 | os.path.dirname(source_physical_fullpath), 235 | os.readlink(source_physical_fullpath), 236 | ), 237 | "automated_ingest", 238 | ) 239 | 240 | 241 | def size(session, path, replica_num=None, resc_name=None): 242 | args = [ 243 | Collection.name == os.path.dirname(path), 244 | DataObject.name == os.path.basename(path), 245 | ] 246 | 247 | if replica_num is not None: 248 | args.append(DataObject.replica_number == replica_num) 249 | 250 | if resc_name is not None: 251 | args.append(DataObject.resource_name == resc_name) 252 | 253 | for row in session.query(DataObject.size).filter(*args): 254 | return int(row[DataObject.size]) 255 | 256 | 257 | def list_collection(meta, logger, logical_path): 258 | event_handler = custom_event_handler.custom_event_handler(meta) 259 | session = irods_session(event_handler.get_module(), meta, logger, **dict()) 260 | 261 | collection = session.collections.get(logical_path) 262 | 263 | return collection.subcollections, collection.data_objects 264 | 265 | 266 | def unregister_data_object(hdlr_mod, session, meta, **options): 267 | config = meta["config"] 268 | logging_config = config["log"] 269 | logger = sync_logging.get_sync_logger(logging_config) 270 | logger.debug(f"calling unregister for [{meta['target']}]") 271 | session.data_objects.unregister(meta["target"], **options) 272 | 273 | 274 | def trash_data_object(hdlr_mod, session, meta, **options): 275 | config = meta["config"] 276 | logging_config = config["log"] 277 | logger = sync_logging.get_sync_logger(logging_config) 278 | logger.debug(f"calling unlink (trash) for [{meta['target']}]") 279 | session.data_objects.unlink(meta["target"], **options) 280 | 281 | 282 | def unlink_data_object(hdlr_mod, session, meta, **options): 283 | config = 
meta["config"] 284 | logging_config = config["log"] 285 | logger = sync_logging.get_sync_logger(logging_config) 286 | logger.debug(f"calling unlink (no trash / force=True) for [{meta['target']}]") 287 | session.data_objects.unlink(meta["target"], force=True, **options) 288 | 289 | 290 | def get_delete_function(delete_mode): 291 | delete_mode_to_function = { 292 | DeleteMode.DO_NOT_DELETE: None, 293 | DeleteMode.UNREGISTER: unregister_data_object, 294 | DeleteMode.TRASH: trash_data_object, 295 | DeleteMode.NO_TRASH: unlink_data_object, 296 | } 297 | return delete_mode_to_function.get(delete_mode, None) 298 | 299 | 300 | def delete_data_object(hdlr_mod, meta, **options): 301 | logical_path = meta["target"] 302 | 303 | event_handler = custom_event_handler.custom_event_handler(meta) 304 | 305 | delete_mode = event_handler.delete_mode() 306 | if DeleteMode.DO_NOT_DELETE == delete_mode: 307 | # The event handler says "do not delete", so do not delete. 308 | return 309 | 310 | config = meta["config"] 311 | logging_config = config["log"] 312 | logger = sync_logging.get_sync_logger(logging_config) 313 | 314 | session = irods_session(event_handler.get_module(), meta, logger, **options) 315 | 316 | if not session.data_objects.exists(logical_path): 317 | # There is nothing to do if the data object does not exist. 318 | return 319 | 320 | delete_function = get_delete_function(delete_mode) 321 | if delete_function is None: 322 | raise RuntimeError(f"delete_mode [{delete_mode}] is not supported") 323 | 324 | event_handler.call( 325 | "on_data_obj_delete", logger, delete_function, session, meta, **options 326 | ) 327 | 328 | 329 | def unregister_collection(hdlr_mod, session, meta, **options): 330 | config = meta["config"] 331 | logging_config = config["log"] 332 | logger = sync_logging.get_sync_logger(logging_config) 333 | logger.debug(f"calling unregister for [{meta['target']}]") 334 | # We should only be removing an empty collection, so explicitly do not remove recursively or do a "force" remove. 335 | options["recurse"] = False 336 | options["force"] = False 337 | session.collections.unregister(meta["target"], **options) 338 | 339 | 340 | def delete_collection(hdlr_mod, meta, **options): 341 | logical_path = meta["target"] 342 | 343 | event_handler = custom_event_handler.custom_event_handler(meta) 344 | 345 | delete_mode = event_handler.delete_mode() 346 | if DeleteMode.DO_NOT_DELETE == delete_mode: 347 | # The event handler says "do not delete", so do not delete. 348 | return 349 | 350 | config = meta["config"] 351 | logging_config = config["log"] 352 | logger = sync_logging.get_sync_logger(logging_config) 353 | 354 | session = irods_session(event_handler.get_module(), meta, logger, **options) 355 | 356 | r = get_redis(config) 357 | with redis_lock.Lock(r, "delete_collection:" + logical_path): 358 | # This will raise CollectionDoesNotExist if logical_path does not exist. 359 | collection = session.collections.get(logical_path) 360 | 361 | if 0 != len(collection.data_objects) or 0 != len(collection.subcollections): 362 | logger.debug( 363 | f"Collection [{logical_path}] is not empty and will not be removed." 364 | ) 365 | return 366 | 367 | event_handler.call( 368 | "on_coll_delete", logger, unregister_collection, session, meta, **options 369 | ) 370 | 371 | # Attempt to remove the parent collection if it is found to be empty. 
372 | root_target_collection = meta["root_target_collection"] 373 | parent_collection = "/".join(logical_path.split("/")[:-1]) 374 | if parent_collection == root_target_collection: 375 | logger.info(f"Cannot remove root target collection [{root_target_collection}]") 376 | return 377 | with redis_lock.Lock(r, "delete_collection:" + parent_collection): 378 | # This will raise CollectionDoesNotExist if logical_path does not exist. 379 | collection = session.collections.get(parent_collection) 380 | if 0 != len(collection.data_objects) or 0 != len(collection.subcollections): 381 | logger.debug( 382 | f"Collection [{parent_collection}] is not empty and will not be removed." 383 | ) 384 | return 385 | event_handler.call( 386 | "on_coll_delete", logger, unregister_collection, session, meta, **options 387 | ) 388 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/irods_sync.py: -------------------------------------------------------------------------------- 1 | from uuid import uuid1 2 | from . import sync_actions 3 | import argparse 4 | import json 5 | import sys 6 | 7 | 8 | def get_config(args): 9 | return { 10 | "log": { 11 | "filename": getattr(args, "log_filename", None), 12 | "when": getattr(args, "log_when", None), 13 | "interval": getattr(args, "log_interval", None), 14 | "level": getattr(args, "log_level", None), 15 | }, 16 | "profile": { 17 | "filename": getattr(args, "profile_filename", None), 18 | "when": getattr(args, "profile_when", None), 19 | "interval": getattr(args, "profile_interval", None), 20 | "level": getattr(args, "profile_level", None), 21 | }, 22 | "redis": { 23 | "host": args.redis_host, 24 | "port": args.redis_port, 25 | "db": args.redis_db, 26 | }, 27 | } 28 | 29 | 30 | def get_celery_broker_info(): 31 | from os import environ 32 | 33 | env_url = environ["CELERY_BROKER_URL"] 34 | if env_url is None: 35 | host = "localhost" 36 | port = 6379 37 | db = 0 38 | else: 39 | url = env_url.split("://")[1].split(":") 40 | host = url[0] 41 | port = url[1].split("/")[0] 42 | db = url[1].split("/")[1] 43 | 44 | return host, port, db 45 | 46 | 47 | class character_map_argument_error(Exception): 48 | pass 49 | 50 | 51 | # Make sure, if a character_map method is defined for the given event handler, that it 52 | # returns a dictionary (or argument for construction of dictionary) appropriate within the 53 | # conventions laid out in the README. Also, within reason, check any characters explicitly 54 | # named for remapping. To satisfy the principle of least surprise, they should at least 55 | # be restricted to being strings of length one. 
56 | 57 | 58 | def check_event_handler(filename): 59 | namespace = {} 60 | if filename is not None: 61 | exec(open(filename, "r").read(), namespace, namespace) 62 | ev_hdlr_class = namespace["event_handler"] 63 | char_map_method = getattr(ev_hdlr_class, "character_map", None) 64 | error_message = "" 65 | if char_map_method: 66 | returned = char_map_method() 67 | try: 68 | char_mapper = dict(returned) 69 | except TypeError: 70 | error_message = "character_map() method must return a dict or iterable of key value tuples" 71 | raise character_map_argument_error(error_message) 72 | for key, value in char_mapper.items(): 73 | if ( 74 | isinstance(key, str) 75 | and len(key) > 1 76 | or isinstance(key, tuple) 77 | and any(len(s) > 1 for s in key) 78 | or isinstance(value, str) 79 | and len(value) > 1 80 | ): 81 | error_message = "character_map()'s returned object should denote only single-character substitutions" 82 | raise character_map_argument_error(error_message) 83 | 84 | 85 | def add_arguments(parser): 86 | host, port, db = get_celery_broker_info() 87 | 88 | parser.add_argument( 89 | "--log_filename", 90 | action="store", 91 | type=str, 92 | default=None, 93 | help="Specify name of log file.", 94 | ) 95 | parser.add_argument( 96 | "--log_when", 97 | action="store", 98 | type=str, 99 | default=None, 100 | help="Specify the type of log_interval (see TimedRotatingFileHandler).", 101 | ) 102 | parser.add_argument( 103 | "--log_interval", 104 | action="store", 105 | type=int, 106 | default=None, 107 | help="Specify the interval with which to rollover the ingest log file.", 108 | ) 109 | parser.add_argument( 110 | "--log_level", 111 | action="store", 112 | type=str, 113 | default=None, 114 | help="Specify minimum level of message to log (DEBUG, INFO, WARNING, ERROR).", 115 | ) 116 | parser.add_argument( 117 | "--profile_filename", 118 | action="store", 119 | type=str, 120 | default=None, 121 | help="Specify name of profile filename.", 122 | ) 123 | parser.add_argument( 124 | "--profile_when", 125 | action="store", 126 | type=str, 127 | default=None, 128 | help="Specify the type of profile_interval (see TimedRotatingFileHandler).", 129 | ) 130 | parser.add_argument( 131 | "--profile_interval", 132 | action="store", 133 | type=int, 134 | default=None, 135 | help="Specify the interval with which to rollover the ingest profile log file.", 136 | ) 137 | parser.add_argument( 138 | "--profile_level", 139 | action="store", 140 | type=str, 141 | default=None, 142 | help="Specify minimum level of message to log for profiling (DEBUG, INFO, WARNING, ERROR).", 143 | ) 144 | parser.add_argument( 145 | "--redis_host", 146 | action="store", 147 | type=str, 148 | default=host, 149 | help="Domain or IP address of Redis host.", 150 | ) 151 | parser.add_argument( 152 | "--redis_port", 153 | action="store", 154 | type=int, 155 | default=port, 156 | help="Port number for Redis.", 157 | ) 158 | parser.add_argument( 159 | "--redis_db", 160 | action="store", 161 | type=int, 162 | default=db, 163 | help="Redis DB number to use for ingest.", 164 | ) 165 | 166 | 167 | def handle_start(args): 168 | ex_file_arg = args.exclude_file_type 169 | if ex_file_arg != None: 170 | ex_arg_list = [x.strip() for x in ex_file_arg[0].split(",")] 171 | 172 | check_event_handler(args.event_handler) 173 | 174 | data = {} 175 | data["restart_queue"] = args.restart_queue 176 | data["path_queue"] = args.path_queue 177 | data["file_queue"] = args.file_queue 178 | data["target"] = args.target 179 | data["src_path"] = args.src_path 180 | 
data["interval"] = args.interval 181 | data["job_name"] = args.job_name if args.job_name else str(uuid1()) 182 | data["ignore_cache"] = args.ignore_cache 183 | data["initial_ingest"] = args.initial_ingest 184 | data["event_handler"] = args.event_handler 185 | data["config"] = get_config(args) 186 | data["synchronous"] = args.synchronous 187 | data["progress"] = args.progress 188 | data["profile"] = args.profile 189 | data["files_per_task"] = args.files_per_task 190 | data["s3_endpoint_domain"] = args.s3_endpoint_domain 191 | data["s3_region_name"] = args.s3_region_name 192 | data["s3_keypair"] = args.s3_keypair 193 | data["s3_proxy_url"] = args.s3_proxy_url 194 | data["s3_secure_connection"] = not args.s3_insecure_connection 195 | data["s3_multipart_chunksize_in_mib"] = args.s3_multipart_chunksize_in_mib 196 | data["exclude_file_type"] = ex_arg_list 197 | data["exclude_file_name"] = ["".join(r) for r in args.exclude_file_name] 198 | data["exclude_directory_name"] = ["".join(r) for r in args.exclude_directory_name] 199 | data["idle_disconnect_seconds"] = args.irods_idle_disconnect_seconds 200 | 201 | return sync_actions.start_job(data) 202 | 203 | 204 | def handle_stop(args): 205 | sync_actions.stop_job(args.job_name, get_config(args)) 206 | return 0 207 | 208 | 209 | def handle_watch(args): 210 | return sync_actions.monitor_job(args.job_name, True, get_config(args)) 211 | 212 | 213 | def handle_list(args): 214 | jobs = sync_actions.list_jobs(get_config(args)) 215 | print(json.dumps(jobs)) 216 | return 0 217 | 218 | 219 | def main(): 220 | parser = argparse.ArgumentParser(description="continuous synchronization utility") 221 | subparsers = parser.add_subparsers(help="subcommand help") 222 | 223 | parser_start = subparsers.add_parser( 224 | "start", 225 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 226 | help="start help", 227 | ) 228 | parser_start.add_argument( 229 | "src_path", 230 | metavar="SOURCE_DIRECTORY", 231 | type=str, 232 | help="Source directory or S3 folder to scan.", 233 | ) 234 | parser_start.add_argument( 235 | "target", 236 | metavar="TARGET_COLLECTION", 237 | type=str, 238 | help="Target iRODS collection for data objects (created if non-existent).", 239 | ) 240 | parser_start.add_argument( 241 | "-i", 242 | "--interval", 243 | action="store", 244 | type=int, 245 | default=None, 246 | help="Restart interval (in seconds). 
If absent, will only sync once.", 247 | ) 248 | parser_start.add_argument( 249 | "--file_queue", 250 | action="store", 251 | type=str, 252 | default="file", 253 | help="Name for the file queue.", 254 | ) 255 | parser_start.add_argument( 256 | "--path_queue", 257 | action="store", 258 | type=str, 259 | default="path", 260 | help="Name for the path queue.", 261 | ) 262 | parser_start.add_argument( 263 | "--restart_queue", 264 | action="store", 265 | type=str, 266 | default="restart", 267 | help="Name for the restart queue.", 268 | ) 269 | parser_start.add_argument( 270 | "--event_handler", 271 | action="store", 272 | type=str, 273 | default=None, 274 | help="Path to event handler file", 275 | ) 276 | parser_start.add_argument( 277 | "--job_name", 278 | action="store", 279 | type=str, 280 | default=None, 281 | help="Reference name for ingest job (defaults to generated uuid)", 282 | ) 283 | parser_start.add_argument( 284 | "--ignore_cache", 285 | action="store_true", 286 | default=False, 287 | help="Ignore last sync time in cache - like starting a new sync", 288 | ) 289 | parser_start.add_argument( 290 | "--initial_ingest", 291 | action="store_true", 292 | default=False, 293 | help="Use this flag on initial ingest to avoid check for data object paths already in iRODS.", 294 | ) 295 | parser_start.add_argument( 296 | "--synchronous", 297 | action="store_true", 298 | default=False, 299 | help="Block until sync job is completed.", 300 | ) 301 | parser_start.add_argument( 302 | "--progress", 303 | action="store_true", 304 | default=False, 305 | help="Show progress bar and task counts (must have --synchronous flag).", 306 | ) 307 | parser_start.add_argument( 308 | "--profile", 309 | action="store_true", 310 | default=False, 311 | help="Generate JSON file of system activity profile during ingest.", 312 | ) 313 | parser_start.add_argument( 314 | "--files_per_task", 315 | action="store", 316 | type=int, 317 | default="50", 318 | help="Number of paths to process in a given task on the queue.", 319 | ) 320 | parser_start.add_argument( 321 | "--s3_endpoint_domain", 322 | action="store", 323 | type=str, 324 | default="s3.amazonaws.com", 325 | help="S3 endpoint domain", 326 | ) 327 | parser_start.add_argument( 328 | "--s3_region_name", 329 | action="store", 330 | type=str, 331 | default="us-east-1", 332 | help="S3 region name", 333 | ) 334 | parser_start.add_argument( 335 | "--s3_keypair", 336 | action="store", 337 | type=str, 338 | default=None, 339 | help="Path to S3 keypair file", 340 | ) 341 | parser_start.add_argument( 342 | "--s3_proxy_url", 343 | action="store", 344 | type=str, 345 | default=None, 346 | help="URL to proxy for S3 access", 347 | ) 348 | parser_start.add_argument( 349 | "--s3_insecure_connection", 350 | action="store_true", 351 | default=False, 352 | help="Do not use SSL when connecting to S3 endpoint", 353 | ) 354 | parser_start.add_argument( 355 | "--s3_multipart_chunksize_in_mib", 356 | action="store", 357 | type=int, 358 | default=8, 359 | choices=range(5, 5001), 360 | metavar="[5-5000]", 361 | help="Chunk size in mebibytes for multipart S3 uploads. 
Minimum part size is 5 MiB and the maximum part size is 5000 MiB.", 362 | ) 363 | parser_start.add_argument( 364 | "--exclude_file_type", 365 | nargs=1, 366 | action="store", 367 | default="none", 368 | help="types of files to exclude: regular, directory, character, block, socket, pipe, link", 369 | ) 370 | parser_start.add_argument( 371 | "--exclude_file_name", 372 | type=list, 373 | nargs="+", 374 | action="store", 375 | default="none", 376 | help='a list of space-separated python regular expressions defining the file names to exclude such as "(\S+)exclude" "(\S+)\.hidden"', 377 | ) 378 | parser_start.add_argument( 379 | "--exclude_directory_name", 380 | type=list, 381 | nargs="+", 382 | action="store", 383 | default="none", 384 | help='a list of space-separated python regular expressions defining the directory names to exclude such as "(\S+)exclude" "(\S+)\.hidden"', 385 | ) 386 | parser_start.add_argument( 387 | "--irods_idle_disconnect_seconds", 388 | action="store", 389 | type=int, 390 | default=60, 391 | help="irods disconnect time in seconds", 392 | ) 393 | add_arguments(parser_start) 394 | 395 | parser_start.set_defaults(func=handle_start) 396 | 397 | parser_stop = subparsers.add_parser( 398 | "stop", formatter_class=argparse.ArgumentDefaultsHelpFormatter, help="stop help" 399 | ) 400 | parser_stop.add_argument("job_name", action="store", type=str, help="job name") 401 | add_arguments(parser_stop) 402 | parser_stop.set_defaults(func=handle_stop) 403 | 404 | parser_watch = subparsers.add_parser( 405 | "watch", 406 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 407 | help="watch help", 408 | ) 409 | parser_watch.add_argument("job_name", action="store", type=str, help="job name") 410 | add_arguments(parser_watch) 411 | parser_watch.set_defaults(func=handle_watch) 412 | 413 | parser_list = subparsers.add_parser( 414 | "list", formatter_class=argparse.ArgumentDefaultsHelpFormatter, help="list help" 415 | ) 416 | add_arguments(parser_list) 417 | parser_list.set_defaults(func=handle_list) 418 | 419 | args = parser.parse_args() 420 | sys.exit(args.func(args)) 421 | 422 | 423 | if __name__ == "__main__": 424 | main() 425 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/redis_key.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | import traceback 4 | 5 | MAX_RETRIES = 10 6 | 7 | 8 | # TODO: Consider compression/hashing of key_category and identifier 9 | class redis_key_handle(object): 10 | # def __init__(self, logger, redis_handle, key_category, identifier, delimiter=':/'): 11 | def __init__(self, redis_handle, key_category, identifier, delimiter=":/"): 12 | # self.logger = logger 13 | self.redis_handle = redis_handle 14 | self.category = key_category 15 | self.identifier = identifier 16 | self.delimiter = delimiter 17 | # TODO: Hard-coded value from .utils 18 | 19 | def retry(self, func, *args, max_retries=MAX_RETRIES): 20 | retries = 0 21 | while retries <= max_retries: 22 | try: 23 | res = func(*args) 24 | return res 25 | except Exception as err: 26 | retries += 1 27 | 28 | # logger.info('Retrying. 
retries=' + str(retries), max_retries=max_retries, func=func, args=args, err=err, stacktrace=traceback.extract_tb(err.__traceback__)) 29 | time.sleep(1) 30 | raise RuntimeError("max retries") 31 | 32 | def get_key(self): 33 | return str(self.category + self.delimiter + self.identifier) 34 | 35 | def get_value(self): 36 | if self.get_key() is None: 37 | return None 38 | return self.retry(self.redis_handle.get, self.get_key()) 39 | 40 | def set_value(self, value): 41 | self.retry(self.redis_handle.set, self.get_key(), value) 42 | 43 | def reset(self): 44 | self.retry(self.redis_handle.delete, self.get_key()) 45 | 46 | 47 | class incremental_redis_key_handle(redis_key_handle): 48 | def __init__(self, redis_handle, key_category, identifier, delimiter=":/"): 49 | super().__init__(redis_handle, key_category, identifier, delimiter) 50 | 51 | def get_value(self): 52 | val = super().get_value() 53 | if val is None: 54 | return val 55 | return int(val) 56 | 57 | def incrby(self, amount=1): 58 | self.retry(self.redis_handle.incrby, self.get_key(), amount) 59 | 60 | def incr(self): 61 | self.retry(self.redis_handle.incr, self.get_key()) 62 | 63 | def decrby(self, amount=1): 64 | self.retry(self.redis_handle.decrby, self.get_key(), amount) 65 | 66 | def decr(self): 67 | return self.retry(self.redis_handle.decr, self.get_key()) 68 | 69 | 70 | class json_redis_key_handle(redis_key_handle): 71 | def __init__(self, redis_handle, key_category, identifier, delimiter=":/"): 72 | super().__init__(redis_handle, key_category, identifier, delimiter) 73 | 74 | # def get_value(self): 75 | # return json.loads(self.retry(self.redis_handle.get, self.get_key().decode("utf-8"))) 76 | 77 | 78 | class list_redis_key_handle(redis_key_handle): 79 | def __init__(self, redis_handle, key_category, identifier, delimiter=":/"): 80 | super().__init__(redis_handle, key_category, identifier, delimiter) 81 | 82 | def get_value(self): 83 | val = super().get_value() 84 | if val is None: 85 | return val 86 | return list(val) 87 | 88 | def rpush(self, value): 89 | self.retry(self.redis_handle.rpush, self.get_key(), value) 90 | 91 | def lrange(self, start, end): 92 | return self.retry(self.redis_handle.lrange, self.get_key(), start, end) 93 | 94 | def llen(self): 95 | return self.retry(self.redis_handle.llen, self.get_key()) 96 | 97 | 98 | class float_redis_key_handle(redis_key_handle): 99 | def __init__(self, redis_handle, key_category, identifier, delimiter=":/"): 100 | super().__init__(redis_handle, key_category, identifier, delimiter) 101 | 102 | def get_value(self): 103 | val = super().get_value() 104 | if val is None: 105 | return val 106 | return float(val) 107 | 108 | 109 | # TODO(#292): python metaclasses - see PRC 110 | class sync_time_key_handle(float_redis_key_handle): 111 | """Float indicating the last time path was synced.""" 112 | 113 | def __init__(self, redis_handle, path): 114 | super().__init__(redis_handle, "sync_time", path) 115 | 116 | 117 | class cleanup_key_handle(json_redis_key_handle): 118 | """JSON object with list of event_handlers that need to be cleaned up.""" 119 | 120 | def __init__(self, redis_handle, job_name): 121 | super().__init__(redis_handle, "cleanup", job_name) 122 | 123 | 124 | class stop_key_handle(redis_key_handle): 125 | """Empty string indicating that the job job_name_to_stop is being stopped.""" 126 | 127 | def __init__(self, redis_handle, job_name_to_stop): 128 | super().__init__(redis_handle, "stop", job_name_to_stop) 129 | 130 | def get_value(self): 131 | val = super().get_value() 132 | 
if val is None: 133 | return val 134 | return str(val) 135 | 136 | 137 | class tasks_key_handle(incremental_redis_key_handle): 138 | """Integer indicating the task count for job_name.""" 139 | 140 | def __init__(self, redis_handle, job_name): 141 | super().__init__(redis_handle, "tasks", job_name) 142 | 143 | 144 | class count_key_handle(list_redis_key_handle): 145 | """List of task IDs associated with job_name.""" 146 | 147 | def __init__(self, redis_handle, job_name): 148 | super().__init__(redis_handle, "count", job_name) 149 | 150 | 151 | # TODO: What is the difference between this list and the set of stop_keys? 152 | class dequeue_key_handle(list_redis_key_handle): 153 | """List of tasks for a particular job_name.""" 154 | 155 | def __init__(self, redis_handle, job_name): 156 | super().__init__(redis_handle, "dequeue", job_name) 157 | 158 | 159 | class failures_key_handle(incremental_redis_key_handle): 160 | """Integer indicating the count of failed tasks for job_name.""" 161 | 162 | def __init__(self, redis_handle, job_name): 163 | super().__init__(redis_handle, "failures", job_name) 164 | 165 | 166 | class retries_key_handle(incremental_redis_key_handle): 167 | """Integer indicating the count of tasks which were retried for job_name.""" 168 | 169 | def __init__(self, redis_handle, job_name): 170 | super().__init__(redis_handle, "retries", job_name) 171 | 172 | 173 | class stopped_jobs_key_handle(json_redis_key_handle): 174 | """JSON object with list of sync_job dicts.""" 175 | 176 | def __init__(self, redis_handle): 177 | super().__init__(redis_handle, "irods_ingest_stopped_jobs", "") 178 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/redis_utils.py: -------------------------------------------------------------------------------- 1 | from redis import StrictRedis, ConnectionPool 2 | 3 | redis_connection_pool_map = {} 4 | 5 | 6 | def get_redis(config): 7 | redis_config = config["redis"] 8 | host = redis_config["host"] 9 | port = redis_config["port"] 10 | db = redis_config["db"] 11 | url = "redis://" + host + ":" + str(port) + "/" + str(db) 12 | pool = redis_connection_pool_map.get(url) 13 | if pool is None: 14 | pool = ConnectionPool(host=host, port=port, db=db) 15 | redis_connection_pool_map[url] = pool 16 | 17 | return StrictRedis(connection_pool=pool) 18 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/sync_actions.py: -------------------------------------------------------------------------------- 1 | from . 
import sync_logging 2 | from .irods import irods_utils 3 | from .redis_key import redis_key_handle 4 | from .redis_utils import get_redis 5 | from .sync_job import get_stopped_jobs_list, sync_job 6 | from .tasks import filesystem_tasks, s3_bucket_tasks 7 | 8 | from os.path import realpath 9 | from uuid import uuid1 10 | import json 11 | import progressbar 12 | import redis_lock 13 | import textwrap 14 | import time 15 | import uuid 16 | 17 | uuid_ = uuid.uuid4().hex 18 | 19 | 20 | def stop_job(job_name, config): 21 | logger = sync_logging.get_sync_logger(config["log"]) 22 | r = get_redis(config) 23 | with redis_lock.Lock(r, "lock:periodic"): 24 | job = sync_job(job_name, r) 25 | if job.cleanup_handle().get_value() is None: 26 | logger.error("job [{0}] does not exist".format(job_name)) 27 | raise Exception("job [{0}] does not exist".format(job_name)) 28 | job.stop() 29 | 30 | 31 | def list_jobs(config): 32 | r = get_redis(config) 33 | with redis_lock.Lock(r, "lock:periodic"): 34 | periodic_jobs = list( 35 | map(lambda job_id: job_id.decode("utf-8"), r.lrange("periodic", 0, -1)) 36 | ) 37 | singlepass_jobs = list( 38 | map(lambda job_id: job_id.decode("utf-8"), r.lrange("singlepass", 0, -1)) 39 | ) 40 | jobs_map = { 41 | "periodic": [sync_job(job_name, r).asdict() for job_name in periodic_jobs], 42 | "singlepass": [ 43 | sync_job(job_name, r).asdict() for job_name in singlepass_jobs 44 | ], 45 | "stopped": get_stopped_jobs_list(r), 46 | } 47 | return jobs_map 48 | 49 | 50 | def monitor_job(job_name, progress, config): 51 | logger = sync_logging.get_sync_logger(config["log"]) 52 | job = sync_job(job_name, get_redis(config)) 53 | if job.cleanup_handle().get_value() is None: 54 | logger.error("job [{0}] does not exist".format(job.name())) 55 | raise Exception("job [{0}] does not exist".format(job.name())) 56 | try: 57 | if not progress: 58 | while not job.done() or job.periodic(): 59 | time.sleep(1) 60 | if job.stopped(): 61 | logger.warning( 62 | f"Job [{job.name()}] was stopped and may not have finished." 63 | ) 64 | failures = job.failures_handle().get_value() 65 | if failures is not None and failures != 0: 66 | return -1 67 | return 0 68 | start_time = job.start_time_handle().get_value() 69 | if start_time is None: 70 | logger.error( 71 | f"Job [{job.name()}] has no start time. Cannot display progress." 
72 | ) 73 | return -1 74 | widgets = [ 75 | " [", 76 | progressbar.Variable("timer"), 77 | "] ", 78 | progressbar.Bar(), 79 | " (", 80 | progressbar.ETA(), 81 | ") ", 82 | progressbar.Variable("total"), 83 | " ", 84 | progressbar.Variable("remaining"), 85 | " ", 86 | progressbar.Variable("failed"), 87 | " ", 88 | progressbar.Variable("retried"), 89 | ] 90 | with progressbar.ProgressBar( 91 | max_value=1, widgets=widgets, redirect_stdout=True, redirect_stderr=True 92 | ) as bar: 93 | 94 | def update_pbar(): 95 | job_info = job.asdict() 96 | total_tasks = job_info["total_tasks"] 97 | remaining_tasks = job_info["remaining_tasks"] 98 | if total_tasks == 0: 99 | percentage = 0 100 | else: 101 | percentage = max( 102 | 0, min(1, (total_tasks - remaining_tasks) / total_tasks) 103 | ) 104 | bar.update( 105 | percentage, 106 | timer=job_info["elapsed_time"], 107 | total=total_tasks, 108 | remaining=remaining_tasks, 109 | failed=job_info["failed_tasks"], 110 | retried=job_info["retried_tasks"], 111 | ) 112 | 113 | while not job.done() or job.periodic(): 114 | update_pbar() 115 | time.sleep(1) 116 | if job.stopped(): 117 | logger.warning( 118 | f"Job [{job.name()}] was stopped and may not have finished." 119 | ) 120 | else: 121 | update_pbar() 122 | failures = job.failures_handle().get_value() 123 | if failures is not None and failures != 0: 124 | return -1 125 | else: 126 | return 0 127 | except KeyboardInterrupt: 128 | logger.info(f"KeyboardInterrupt stopped monitoring of job [{job.name()}].") 129 | return 0 130 | 131 | 132 | def start_job(data): 133 | config = data["config"] 134 | logging_config = config["log"] 135 | src_path = data["src_path"] 136 | job_name = data["job_name"] 137 | interval = data["interval"] 138 | restart_queue = data["restart_queue"] 139 | sychronous = data["synchronous"] 140 | progress = data["progress"] 141 | s3_region_name = data["s3_region_name"] 142 | s3_endpoint_domain = data["s3_endpoint_domain"] 143 | s3_keypair = data["s3_keypair"] 144 | s3_multipart_chunksize = data["s3_multipart_chunksize_in_mib"] 145 | logger = sync_logging.get_sync_logger(logging_config) 146 | data_copy = data.copy() 147 | 148 | if s3_keypair is not None: 149 | with open(s3_keypair) as f: 150 | data_copy["s3_access_key"] = f.readline().rstrip() 151 | data_copy["s3_secret_key"] = f.readline().rstrip() 152 | # set source 153 | src_abs = src_path 154 | main_task = s3_bucket_tasks.s3_bucket_main_task 155 | else: 156 | src_abs = realpath(src_path) 157 | main_task = filesystem_tasks.filesystem_main_task 158 | 159 | data_copy["root"] = src_abs 160 | data_copy["path"] = src_abs 161 | 162 | irods_utils.validate_target_collection(data_copy, logger) 163 | 164 | def store_event_handler(data, job): 165 | event_handler = data.get("event_handler") 166 | event_handler_data = data.get("event_handler_data") 167 | event_handler_path = data.get("event_handler_path") 168 | 169 | # investigate -- kubernetes 170 | if ( 171 | event_handler is None 172 | and event_handler_path is not None 173 | and event_handler_data is not None 174 | ): 175 | event_handler = "event_handler" + uuid1().hex 176 | hdlr2 = event_handler_path + "/" + event_handler + ".py" 177 | with open(hdlr2, "w") as f: 178 | f.write(event_handler_data) 179 | cleanup_list = [hdlr2.encode("utf-8")] 180 | data["event_handler"] = event_handler 181 | # if no argument is given, use default event_handler 182 | elif event_handler is None: 183 | # constructing redis_key and putting default event_handler into redis 184 | uuid_ = uuid.uuid4().hex 185 | 
event_handler_key = redis_key_handle( 186 | r, "custom_event_handler", job.name() + "::" + uuid_ 187 | ) 188 | content_string = textwrap.dedent( 189 | """ 190 | from irods_capability_automated_ingest.core import Core 191 | from irods_capability_automated_ingest.utils import Operation, DeleteMode 192 | class event_handler(Core): 193 | @staticmethod 194 | def operation(session, meta, *args, **options): 195 | return Operation.REGISTER_SYNC 196 | 197 | @staticmethod 198 | def delete_mode(meta): 199 | return DeleteMode.DO_NOT_DELETE""" 200 | ) 201 | event_handler_key.set_value(content_string) 202 | 203 | # putting redis_key into meta map 204 | data_copy["event_handler_key"] = event_handler_key.get_key() 205 | 206 | cleanup_list = [] 207 | else: 208 | # constructing redis_key and putting custom_event_handler into redis 209 | with open(event_handler, "r") as f: 210 | content_string = f.read() 211 | 212 | uuid_ = uuid.uuid4().hex 213 | event_handler_key = redis_key_handle( 214 | r, "custom_event_handler", job.name() + "::" + uuid_ 215 | ) 216 | event_handler_key.set_value(content_string) 217 | 218 | # putting redis_key into meta map 219 | data_copy["event_handler_key"] = event_handler_key.get_key() 220 | 221 | cleanup_list = [] 222 | job.cleanup_handle().set_value(json.dumps(cleanup_list)) 223 | 224 | r = get_redis(config) 225 | job = sync_job.from_meta(data_copy) 226 | with redis_lock.Lock(r, "lock:periodic"): 227 | if job.cleanup_handle().get_value() is not None: 228 | logger.error("job {0} already exists".format(job_name)) 229 | raise Exception("job {0} already exists".format(job_name)) 230 | 231 | store_event_handler(data_copy, job) 232 | 233 | if interval is not None: 234 | r.rpush("periodic", job_name.encode("utf-8")) 235 | 236 | main_task.s(data_copy).apply_async(queue=restart_queue, task_id=job_name) 237 | else: 238 | r.rpush("singlepass", job_name.encode("utf-8")) 239 | if not sychronous: 240 | main_task.s(data_copy).apply_async(queue=restart_queue) 241 | else: 242 | res = main_task.s(data_copy).apply() 243 | if res.failed(): 244 | print(res.traceback) 245 | job.cleanup() 246 | return -1 247 | else: 248 | return monitor_job(job_name, progress, config) 249 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/sync_job.py: -------------------------------------------------------------------------------- 1 | from . import redis_key 2 | from .celery import app 3 | from .redis_utils import get_redis 4 | 5 | import datetime 6 | import json 7 | import os 8 | import progressbar 9 | import time 10 | 11 | 12 | def add_stopped_job(redis_handle, stopped_job_dict): 13 | """Add the sync_job dict to the JSON array of stopped jobs tracked in the Redis database.""" 14 | stopped_jobs_handle = redis_key.stopped_jobs_key_handle(redis_handle) 15 | stopped_jobs_value = stopped_jobs_handle.get_value() 16 | if stopped_jobs_value is None: 17 | stopped_jobs_list = [] 18 | else: 19 | stopped_jobs_list = json.loads(stopped_jobs_value.decode("utf-8")) 20 | stopped_jobs_list.append(stopped_job_dict) 21 | # TODO(#297): Is it really the caller's responsibility to dump to a string? 22 | # stopped_jobs_handle is a json_redis_key_handle, so it ought to handle this for us... 
23 | stopped_jobs_handle.set_value(json.dumps(stopped_jobs_list)) 24 | 25 | 26 | def get_stopped_jobs_list(redis_handle): 27 | """Get the JSON array of stopped jobs tracked in the Redis database.""" 28 | stopped_jobs_value = redis_key.stopped_jobs_key_handle(redis_handle).get_value() 29 | if stopped_jobs_value is None: 30 | return [] 31 | return json.loads(stopped_jobs_value.decode("utf-8")) 32 | 33 | 34 | class sync_job(object): 35 | def __init__(self, job_name, redis_handle): 36 | self.job_name = job_name 37 | self.r = redis_handle 38 | 39 | @classmethod 40 | def from_meta(cls, meta): 41 | return cls(meta["job_name"], get_redis(meta["config"])) 42 | 43 | def name(self): 44 | return self.job_name 45 | 46 | def count_handle(self): 47 | return redis_key.count_key_handle(self.r, self.job_name) 48 | 49 | def dequeue_handle(self): 50 | return redis_key.dequeue_key_handle(self.r, self.job_name) 51 | 52 | def tasks_handle(self): 53 | return redis_key.tasks_key_handle(self.r, self.job_name) 54 | 55 | def failures_handle(self): 56 | return redis_key.failures_key_handle(self.r, self.job_name) 57 | 58 | def retries_handle(self): 59 | return redis_key.retries_key_handle(self.r, self.job_name) 60 | 61 | def stop_handle(self): 62 | return redis_key.stop_key_handle(self.r, self.job_name) 63 | 64 | def cleanup_handle(self): 65 | return redis_key.cleanup_key_handle(self.r, self.job_name) 66 | 67 | def done(self): 68 | task_count = self.tasks_handle().get_value() 69 | return task_count is None or task_count == 0 70 | 71 | def periodic(self): 72 | periodic_list = self.r.lrange("periodic", 0, -1) 73 | return self.job_name.encode("utf-8") in periodic_list 74 | 75 | def cleanup(self): 76 | # hdlr = get_with_key(r, cleanup_key, job_name, lambda bs: json.loads(bs.decode("utf-8"))) 77 | cleanup_list = self.cleanup_handle().get_value() 78 | if cleanup_list is not None: 79 | file_list = json.loads(cleanup_list.decode("utf-8")) 80 | for f in file_list: 81 | os.remove(f) 82 | 83 | if self.periodic(): 84 | self.r.lrem("periodic", 1, self.job_name) 85 | else: 86 | self.r.lrem("singlepass", 1, self.job_name) 87 | 88 | self.cleanup_handle().reset() 89 | 90 | def reset(self): 91 | self.count_handle().reset() 92 | self.dequeue_handle().reset() 93 | self.tasks_handle().reset() 94 | self.failures_handle().reset() 95 | self.retries_handle().reset() 96 | self.start_time_handle().reset() 97 | 98 | def interrupt(self, cli=True, terminate=True): 99 | self.stop_handle().set_value("") 100 | queued_tasks = list( 101 | map(lambda x: x.decode("utf-8"), self.count_handle().lrange(0, -1)) 102 | ) 103 | dequeued_tasks = set( 104 | map(lambda x: x.decode("utf-8"), self.dequeue_handle().lrange(0, -1)) 105 | ) 106 | 107 | tasks = [item for item in queued_tasks if item not in dequeued_tasks] 108 | if cli: 109 | tasks = progressbar.progressbar(tasks, max_value=len(tasks)) 110 | 111 | # stop active tasks for this job 112 | for task in tasks: 113 | app.control.revoke(task, terminate=terminate) 114 | 115 | # stop restart job 116 | app.control.revoke(self.job_name) 117 | self.stop_handle().reset() 118 | 119 | def start_time_handle(self): 120 | return redis_key.float_redis_key_handle( 121 | self.r, "irods_ingest_job_start_time", self.job_name 122 | ) 123 | 124 | def stop(self): 125 | add_stopped_job(self.r, self.asdict()) 126 | self.interrupt() 127 | self.cleanup() 128 | self.reset() 129 | 130 | def stopped(self): 131 | stopped_jobs_list = get_stopped_jobs_list(self.r) 132 | for job in stopped_jobs_list: 133 | if self.job_name == job["job_name"]: 
134 | return True 135 | return False 136 | 137 | def asdict(self): 138 | start_time = self.start_time_handle().get_value() or 0 139 | formatted_start_time = datetime.datetime.fromtimestamp( 140 | start_time, tz=datetime.timezone.utc 141 | ).isoformat(timespec="milliseconds") 142 | elapsed_time = time.time() - start_time if start_time else 0 143 | elapsed_time_str = str(datetime.timedelta(milliseconds=elapsed_time * 1000)) 144 | tasks = int(self.tasks_handle().get_value() or 0) 145 | total = self.count_handle().llen() 146 | failures = int(self.failures_handle().get_value() or 0) 147 | retries = int(self.retries_handle().get_value() or 0) 148 | return { 149 | "job_name": self.job_name, 150 | "total_tasks": total, 151 | "remaining_tasks": tasks, 152 | "failed_tasks": failures, 153 | "retried_tasks": retries, 154 | "elapsed_time": elapsed_time_str, 155 | "start_time": formatted_start_time, 156 | } 157 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/sync_logging.py: -------------------------------------------------------------------------------- 1 | import structlog 2 | import logging 3 | import logging.handlers 4 | from structlog import wrap_logger 5 | import datetime 6 | import time 7 | import sys 8 | 9 | irods_sync_logger = "irods_sync" 10 | 11 | 12 | def timestamper(logger, log_method, event_dict): 13 | utc_offset_sec = time.altzone if time.localtime().tm_isdst else time.timezone 14 | utc_offset = datetime.timedelta(seconds=-utc_offset_sec) 15 | event_dict["@timestamp"] = ( 16 | datetime.datetime.now() 17 | .replace(tzinfo=datetime.timezone(offset=utc_offset)) 18 | .isoformat(timespec="milliseconds") 19 | ) 20 | return event_dict 21 | 22 | 23 | logger_map = {} 24 | 25 | 26 | def create_sync_logger(logging_config): 27 | log_file = logging_config["filename"] 28 | when = logging_config["when"] 29 | interval = logging_config["interval"] 30 | level = logging_config["level"] 31 | 32 | logger = logging.getLogger( 33 | irods_sync_logger + "/" + get_sync_logger_key(logging_config) 34 | ) 35 | logger.propagate = False 36 | 37 | # logger = get_task_logger(irods_sync_logger) 38 | 39 | if level is not None: 40 | logger.setLevel(logging.getLevelName(level)) 41 | 42 | if log_file is not None: 43 | if when is not None: 44 | handler = logging.handlers.TimedRotatingFileHandler( 45 | log_file, when=when, interval=interval 46 | ) 47 | else: 48 | handler = logging.FileHandler(log_file) 49 | else: 50 | handler = logging.StreamHandler(sys.stdout) 51 | logger.addHandler(handler) 52 | 53 | return wrap_logger( 54 | logger, 55 | processors=[ 56 | structlog.stdlib.filter_by_level, 57 | structlog.stdlib.add_logger_name, 58 | structlog.stdlib.add_log_level, 59 | timestamper, 60 | structlog.processors.JSONRenderer(), 61 | ], 62 | ) 63 | 64 | 65 | def get_sync_logger(logging_config): 66 | key = get_sync_logger_key(logging_config) 67 | logger = logger_map.get(key) 68 | if logger is None: 69 | logger = create_sync_logger(logging_config) 70 | logger_map[key] = logger 71 | 72 | return logger 73 | 74 | 75 | def get_sync_logger_key(logging_config): 76 | filename = logging_config["filename"] 77 | if filename is None: 78 | filename = "" 79 | level = logging_config["level"] 80 | if level is None: 81 | level = "" 82 | return filename + "/" + level 83 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/task_queue.py: -------------------------------------------------------------------------------- 1 
| from . import sync_logging 2 | from .sync_job import sync_job 3 | from .custom_event_handler import custom_event_handler 4 | from uuid import uuid1 5 | 6 | 7 | class task_queue(object): 8 | def __init__(self, name): 9 | self.name = name 10 | 11 | def name(self): 12 | return self.name 13 | 14 | def add(self, task, meta): 15 | logger = sync_logging.get_sync_logger(meta["config"]["log"]) 16 | job = sync_job.from_meta(meta) 17 | if job.stop_handle().get_value() is None: 18 | logger.info( 19 | "incr_job_name", 20 | task=meta["task"], 21 | path=meta["path"], 22 | job_name=job.name(), 23 | ) 24 | job.tasks_handle().incr() 25 | task_id = str(uuid1()) 26 | timeout = custom_event_handler(meta).timeout() 27 | job.count_handle().rpush(task_id) 28 | task.s(meta).apply_async( 29 | queue=self.name(), task_id=task_id, soft_time_limit=timeout 30 | ) 31 | else: 32 | # A job by this name is currently being stopped 33 | logger.info( 34 | "async_job_name_stopping", 35 | task=meta["task"], 36 | path=meta["path"], 37 | job_name=job.name(), 38 | ) 39 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irods/irods_capability_automated_ingest/da87d89acab7136ac5610195b0cc5e5abd983795/irods_capability_automated_ingest/tasks/__init__.py -------------------------------------------------------------------------------- /irods_capability_automated_ingest/tasks/delete_tasks.py: -------------------------------------------------------------------------------- 1 | from .. import sync_logging 2 | from ..celery import app, RestartTask 3 | from ..custom_event_handler import custom_event_handler 4 | from ..irods import irods_utils 5 | from ..utils import enqueue_task 6 | from .irods_task import IrodsTask 7 | 8 | from irods.exception import ( 9 | CollectionDoesNotExist, 10 | DataObjectDoesNotExist, 11 | PycommandsException, 12 | ) 13 | 14 | 15 | def schedule_collections_for_removal(meta, list_of_collections_to_delete): 16 | if 0 == len(list_of_collections_to_delete): 17 | # This could be considered an error, but let's just treat it as a no-op. 18 | return 19 | meta_for_task = meta.copy() 20 | meta_for_task["queue_name"] = meta["path_queue"] 21 | meta_for_task["task"] = "delete_collection" 22 | for collection in list_of_collections_to_delete: 23 | meta_for_task["path"] = collection.path 24 | meta_for_task["target_collection"] = collection.path 25 | enqueue_task(delete_collection, meta_for_task) 26 | 27 | 28 | def schedule_data_objects_for_removal(meta, list_of_objects_to_delete): 29 | if 0 == len(list_of_objects_to_delete): 30 | # This could be considered an error, but let's just treat it as a no-op. 
31 | return 32 | meta_for_task = meta.copy() 33 | meta_for_task["queue_name"] = meta["file_queue"] 34 | meta_for_task["task"] = "delete_data_objects" 35 | removal_chunk = [] 36 | chunk_size = meta_for_task.get("files_per_task", 50) 37 | for obj in list_of_objects_to_delete: 38 | removal_chunk.append(obj.path) 39 | if len(removal_chunk) == chunk_size: 40 | meta_for_task["data_objects_to_delete"] = removal_chunk 41 | enqueue_task(delete_data_objects, meta_for_task) 42 | removal_chunk = [] 43 | if len(removal_chunk) > 0: 44 | meta_for_task["data_objects_to_delete"] = removal_chunk 45 | enqueue_task(delete_data_objects, meta_for_task) 46 | removal_chunk = [] 47 | 48 | 49 | @app.task(base=RestartTask) 50 | def delete_collection_task(meta): 51 | logical_path = meta["target_collection"] 52 | meta_for_task = meta.copy() 53 | meta_for_task["queue_name"] = meta["path_queue"] 54 | meta_for_task["task"] = "delete_collection" 55 | meta_for_task["path"] = logical_path 56 | meta_for_task["target_collection"] = logical_path 57 | enqueue_task(delete_collection, meta_for_task) 58 | 59 | 60 | @app.task(bind=True, base=IrodsTask) 61 | def delete_collection(self, meta): 62 | config = meta["config"] 63 | logging_config = config["log"] 64 | logger = sync_logging.get_sync_logger(logging_config) 65 | event_handler = custom_event_handler(meta) 66 | logical_path = meta["target_collection"] 67 | session = irods_utils.irods_session(event_handler.get_module(), meta, logger) 68 | meta_for_task = meta.copy() 69 | meta_for_task["task"] = "delete_collection" 70 | try: 71 | target_collection = session.collections.get(logical_path) 72 | except CollectionDoesNotExist: 73 | # Print an error message here because the exception doesn't tell you what doesn't exist. 74 | logger.error(f"Collection [{logical_path}] does not exist.") 75 | raise 76 | if 0 == len(target_collection.data_objects) and 0 == len( 77 | target_collection.subcollections 78 | ): 79 | logger.debug(f"Removing empty collection [{target_collection.path}].") 80 | meta_for_task["target"] = target_collection.path 81 | irods_utils.delete_collection(event_handler.get_module(), meta_for_task) 82 | return 83 | if meta.get("only_delete_collection"): 84 | logger.info( 85 | f"Collection [{logical_path}] could not be removed because it is not empty." 86 | ) 87 | return 88 | meta_for_task["delete_empty_parent_collection"] = target_collection.path 89 | # The subcollections should be scheduled for removal before the data objects because there could be deep 90 | # subcollections with many data objects. 91 | schedule_collections_for_removal(meta_for_task, target_collection.subcollections) 92 | # This instructs each task which deletes data objects to attempt to remove the parent collection. If this is not 93 | # done, the parent collection could remain after everything else has been removed in the parent collection. 94 | schedule_data_objects_for_removal(meta_for_task, target_collection.data_objects) 95 | # This collection does not schedule itself for removal, nor does it attempt to synchronously remove itself here. 96 | # This is because removing the subcollections and data objects are in asynchronous tasks which might take a very 97 | # long time to complete. As such, removal of the parent collection has been delegated to those tasks. The last task 98 | # to complete should remove the parent collection, whether it's a data object removal or a subcollection removal. 
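# Illustrative sketch of that fan-out (hypothetical logical paths, not taken from any
# test in this repository): removing an out-of-date collection /tempZone/home/rods/stale
# that still holds one subcollection and a handful of data objects proceeds roughly as
#
#     delete_collection("/tempZone/home/rods/stale")
#         schedule_collections_for_removal(...)   # one delete_collection task per subcollection
#         schedule_data_objects_for_removal(...)  # delete_data_objects tasks, up to files_per_task paths each
#
# and, as described above, whichever child task finishes last ends up removing the
# (by then empty) parent collection.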
99 | 100 | 101 | @app.task(bind=True, base=IrodsTask) 102 | def delete_data_objects(self, meta): 103 | config = meta["config"] 104 | logging_config = config["log"] 105 | logger = sync_logging.get_sync_logger(logging_config) 106 | meta_for_task = meta.copy() 107 | meta_for_task["task"] = "delete_data_object" 108 | logical_paths = meta_for_task["data_objects_to_delete"] 109 | if 0 == len(logical_paths): 110 | logger.warning("No data objects specified for removal - nothing to do.") 111 | return 112 | event_handler = custom_event_handler(meta) 113 | for logical_path in logical_paths: 114 | try: 115 | meta_for_task["target"] = logical_path 116 | irods_utils.delete_data_object(event_handler.get_module(), meta_for_task) 117 | except DataObjectDoesNotExist: 118 | logger.error( 119 | f"Data object [{logical_path}] does not exist, so it cannot be deleted." 120 | ) 121 | continue 122 | except PycommandsException as e: 123 | logger.error( 124 | f"Exception occurred while removing data object [{logical_path}]: {e}" 125 | ) 126 | continue 127 | # Synchronously attempt to delete the parent collection. Another task may have already done this depending on the 128 | # order of completion, or the collection may not be empty yet because there are more things being deleted. The 129 | # parent collection will be deleted either by a data object removal task or a subcollection removal task. 130 | parent_collection_path = meta.get("delete_empty_parent_collection") 131 | if parent_collection_path: 132 | logger.debug( 133 | f"Attempting to delete parent collection [{parent_collection_path}]." 134 | ) 135 | meta_for_delete = meta.copy() 136 | meta_for_delete["target"] = parent_collection_path 137 | try: 138 | irods_utils.delete_collection(event_handler.get_module(), meta_for_delete) 139 | except CollectionDoesNotExist: 140 | logger.warning( 141 | f"Failed to delete parent collection [{parent_collection_path}]: it no longer exists." 142 | ) 143 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/tasks/irods_task.py: -------------------------------------------------------------------------------- 1 | from .. 
import custom_event_handler, sync_logging 2 | from ..celery import app 3 | from ..sync_job import sync_job 4 | 5 | import traceback 6 | 7 | 8 | class IrodsTask(app.Task): 9 | def on_failure(self, exc, task_id, args, kwargs, einfo): 10 | meta = args[0] 11 | config = meta["config"] 12 | job = sync_job.from_meta(meta) 13 | logger = sync_logging.get_sync_logger(config["log"]) 14 | logger.error( 15 | "failed_task", 16 | task=meta["task"], 17 | path=meta["path"], 18 | job_name=job.name(), 19 | task_id=task_id, 20 | exc=exc, 21 | einfo=einfo, 22 | traceback=traceback.extract_tb(exc.__traceback__), 23 | ) 24 | job.failures_handle().incr() 25 | 26 | def on_retry(self, exc, task_id, args, kwargs, einfo): 27 | meta = args[0] 28 | config = meta["config"] 29 | job = sync_job.from_meta(meta) 30 | logger = sync_logging.get_sync_logger(config["log"]) 31 | logger.warning( 32 | "retry_task", 33 | task=meta["task"], 34 | path=meta["path"], 35 | job_name=job.name(), 36 | task_id=task_id, 37 | exc=exc, 38 | einfo=einfo, 39 | traceback=traceback.extract_tb(exc.__traceback__), 40 | ) 41 | job.retries_handle().incr() 42 | 43 | def on_success(self, retval, task_id, args, kwargs): 44 | meta = args[0] 45 | config = meta["config"] 46 | logger = sync_logging.get_sync_logger(config["log"]) 47 | job_name = meta["job_name"] 48 | logger.info( 49 | "succeeded_task", 50 | task=meta["task"], 51 | path=meta["path"], 52 | job_name=job_name, 53 | task_id=task_id, 54 | retval=retval, 55 | ) 56 | 57 | def after_return(self, status, retval, task_id, args, kwargs, einfo): 58 | meta = args[0] 59 | config = meta["config"] 60 | job = sync_job.from_meta(meta) 61 | logger = sync_logging.get_sync_logger(config["log"]) 62 | logger.info( 63 | "decr_job_name", 64 | task=meta["task"], 65 | path=meta["path"], 66 | job_name=job.name(), 67 | task_id=task_id, 68 | retval=retval, 69 | ) 70 | 71 | done = job.tasks_handle().decr() == 0 and not job.periodic() 72 | if done: 73 | job.cleanup() 74 | 75 | job.dequeue_handle().rpush(task_id) 76 | 77 | if done: 78 | event_handler = custom_event_handler.custom_event_handler(meta) 79 | if event_handler.hasattr("post_job"): 80 | module = event_handler.get_module() 81 | module.post_job(module, logger, meta) 82 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/tasks/s3_bucket_tasks.py: -------------------------------------------------------------------------------- 1 | from .. 
import sync_logging, utils 2 | from ..celery import app, RestartTask 3 | from ..char_map_util import translate_path 4 | from ..custom_event_handler import custom_event_handler 5 | from ..irods import s3_bucket 6 | from ..redis_key import sync_time_key_handle 7 | from ..redis_utils import get_redis 8 | from ..sync_job import sync_job 9 | from ..utils import enqueue_task, is_unicode_encode_error_path 10 | from .irods_task import IrodsTask 11 | 12 | from billiard import current_process 13 | from minio import Minio 14 | 15 | import base64 16 | import datetime 17 | import os 18 | import re 19 | import redis_lock 20 | import stat 21 | import time 22 | import traceback 23 | 24 | 25 | @app.task(base=RestartTask) 26 | def s3_bucket_main_task(meta): 27 | # Start periodic job on restart_queue 28 | job_name = meta["job_name"] 29 | restart_queue = meta["restart_queue"] 30 | interval = meta["interval"] 31 | if interval is not None: 32 | s3_bucket_main_task.s(meta).apply_async( 33 | task_id=job_name, queue=restart_queue, countdown=interval 34 | ) 35 | 36 | # Continue with singlepass job 37 | config = meta["config"] 38 | logging_config = config["log"] 39 | logger = sync_logging.get_sync_logger(logging_config) 40 | try: 41 | event_handler = custom_event_handler(meta) 42 | if event_handler.hasattr("pre_job"): 43 | module = event_handler.get_module() 44 | module.pre_job(module, logger, meta) 45 | 46 | logger.info("***************** restart *****************") 47 | job = sync_job.from_meta(meta) 48 | if not job.periodic() or job.done(): 49 | logger.info( 50 | "no tasks for this job and worker handling this task is not busy" 51 | ) 52 | 53 | job.reset() 54 | job.start_time_handle().set_value(time.time()) 55 | meta = meta.copy() 56 | meta["task"] = "s3_bucket_sync_path" 57 | meta["queue_name"] = meta["path_queue"] 58 | enqueue_task(s3_bucket_sync_path, meta) 59 | else: 60 | logger.info("tasks exist for this job or worker handling this task is busy") 61 | 62 | except OSError as err: 63 | logger.warning( 64 | "Warning: " + str(err), traceback=traceback.extract_tb(err.__traceback__) 65 | ) 66 | 67 | except Exception as err: 68 | logger.error( 69 | "Unexpected error: " + str(err), 70 | traceback=traceback.extract_tb(err.__traceback__), 71 | ) 72 | raise 73 | 74 | 75 | @app.task(bind=True, base=IrodsTask) 76 | def s3_bucket_sync_path(self, meta): 77 | path = meta["path"] 78 | config = meta["config"] 79 | logging_config = config["log"] 80 | 81 | logger = sync_logging.get_sync_logger(logging_config) 82 | 83 | event_handler = custom_event_handler(meta) 84 | 85 | proxy_url = meta.get("s3_proxy_url") 86 | if proxy_url is None: 87 | httpClient = None 88 | else: 89 | import urllib3 90 | 91 | httpClient = urllib3.ProxyManager( 92 | proxy_url, 93 | timeout=urllib3.Timeout.DEFAULT_TIMEOUT, 94 | cert_reqs="CERT_REQUIRED", 95 | retries=urllib3.Retry( 96 | total=5, backoff_factor=0.2, status_forcelist=[500, 502, 503, 504] 97 | ), 98 | ) 99 | endpoint_domain = meta.get("s3_endpoint_domain") 100 | s3_access_key = meta.get("s3_access_key") 101 | s3_secret_key = meta.get("s3_secret_key") 102 | s3_secure_connection = meta.get("s3_secure_connection", True) 103 | client = Minio( 104 | endpoint_domain, 105 | access_key=s3_access_key, 106 | secret_key=s3_secret_key, 107 | secure=s3_secure_connection, 108 | http_client=httpClient, 109 | ) 110 | 111 | try: 112 | logger.info("walk dir", path=path) 113 | # TODO: Remove shadowing here - use a different name 114 | meta = meta.copy() 115 | meta["task"] = "s3_bucket_sync_dir" 116 | chunk = {} 
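# (Illustrative shape only, with a hypothetical object name: each entry added to
# "chunk" further below maps a full S3 object path to a small stat-like dict, e.g.
#     chunk["prefix/data/file1.txt"] = {"is_link": False, "is_socket": False,
#                                       "mtime": 1700000000.0, "ctime": 1700000000.0,
#                                       "size": 1024}
# and a chunk is handed off to s3_bucket_sync_files once it reaches files_per_task entries.)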
117 | 118 | # Check to see whether the provided operation and delete_mode are compatible. 119 | delete_mode = event_handler.delete_mode() 120 | logger.debug(f"delete_mode: {delete_mode}") 121 | # TODO(#282): S3 bucket syncs do not support DeleteMode (yet) 122 | if utils.DeleteMode.DO_NOT_DELETE != delete_mode: 123 | raise RuntimeError( 124 | f"S3 bucket syncs do not support DeleteMode [{delete_mode}]. Only DeleteMode.DO_NOT_DELETE is supported." 125 | ) 126 | 127 | path_list = meta["path"].lstrip("/").split("/", 1) 128 | bucket_name = path_list[0] 129 | if len(path_list) == 1: 130 | prefix = "" 131 | else: 132 | prefix = path_list[1] 133 | meta["root"] = bucket_name 134 | meta["s3_prefix"] = prefix 135 | itr = client.list_objects(bucket_name, prefix=prefix, recursive=True) 136 | 137 | if meta["profile"]: 138 | profile_log = config.get("profile") 139 | profile_logger = sync_logging.get_sync_logger(profile_log) 140 | task_id = self.request.id 141 | 142 | profile_logger.info( 143 | "list_dir_prerun", 144 | event_id=task_id + ":list_dir", 145 | event_name="list_dir", 146 | hostname=self.request.hostname, 147 | index=current_process().index, 148 | ) 149 | itr = list(itr) 150 | if meta["profile"]: 151 | profile_logger.info( 152 | "list_dir_postrun", 153 | event_id=task_id + ":list_dir", 154 | event_name="list_dir", 155 | hostname=self.request.hostname, 156 | index=current_process().index, 157 | ) 158 | 159 | exclude_file_name = meta["exclude_file_name"] 160 | exclude_directory_name = meta["exclude_directory_name"] 161 | file_regex = [re.compile(r) for r in exclude_file_name] 162 | dir_regex = [re.compile(r) for r in exclude_directory_name] 163 | 164 | for obj in itr: 165 | obj_stats = {} 166 | 167 | full_path = obj.object_name 168 | full_path = obj.object_name 169 | 170 | if obj.object_name.endswith("/"): 171 | # TODO: Not sure what this means -- skip it? 
172 | # chunk[full_path] = {} 173 | continue 174 | 175 | # add object stat dict to the chunk dict 176 | obj_stats = { 177 | "is_link": False, 178 | "is_socket": False, 179 | "mtime": obj.last_modified.timestamp(), 180 | "ctime": obj.last_modified.timestamp(), 181 | "size": obj.size, 182 | } 183 | chunk[full_path] = obj_stats 184 | 185 | # Launch async job when enough objects are ready to be sync'd 186 | files_per_task = meta.get("files_per_task") 187 | if len(chunk) >= files_per_task: 188 | sync_files_meta = meta.copy() 189 | sync_files_meta["chunk"] = chunk 190 | sync_files_meta["queue_name"] = meta["file_queue"] 191 | enqueue_task(s3_bucket_sync_files, sync_files_meta) 192 | chunk.clear() 193 | 194 | if len(chunk) > 0: 195 | sync_files_meta = meta.copy() 196 | sync_files_meta["chunk"] = chunk 197 | sync_files_meta["queue_name"] = meta["file_queue"] 198 | enqueue_task(s3_bucket_sync_files, sync_files_meta) 199 | chunk.clear() 200 | 201 | except Exception as err: 202 | event_handler = custom_event_handler(meta) 203 | retry_countdown = event_handler.delay(self.request.retries + 1) 204 | max_retries = event_handler.max_retries() 205 | raise self.retry(max_retries=max_retries, exc=err, countdown=retry_countdown) 206 | 207 | 208 | @app.task(bind=True, base=IrodsTask) 209 | def s3_bucket_sync_dir(self, meta_input): 210 | meta = meta_input.copy() 211 | meta["entry_type"] = "dir" 212 | s3_bucket_sync_entry( 213 | self, meta, s3_bucket.sync_data_from_dir, s3_bucket.sync_metadata_from_dir 214 | ) 215 | 216 | 217 | @app.task(bind=True, base=IrodsTask) 218 | def s3_bucket_sync_files(self, meta_input): 219 | meta = meta_input.copy() 220 | meta["entry_type"] = "file" 221 | meta["task"] = "sync_file" 222 | for path, obj_stats in meta["chunk"].items(): 223 | meta["path"] = path 224 | meta["is_empty_dir"] = obj_stats.get("is_empty_dir") 225 | meta["is_link"] = obj_stats.get("is_link") 226 | meta["is_socket"] = obj_stats.get("is_socket") 227 | meta["mtime"] = obj_stats.get("mtime") 228 | meta["ctime"] = obj_stats.get("ctime") 229 | meta["size"] = obj_stats.get("size") 230 | s3_bucket_sync_entry( 231 | self, meta, s3_bucket.sync_data_from_file, s3_bucket.sync_metadata_from_file 232 | ) 233 | 234 | 235 | def s3_bucket_sync_entry(self, meta_input, datafunc, metafunc): 236 | meta = meta_input.copy() 237 | 238 | path = meta["path"] 239 | target = meta["target"] 240 | config = meta["config"] 241 | logging_config = config["log"] 242 | logger = sync_logging.get_sync_logger(logging_config) 243 | 244 | entry_type = meta["entry_type"] 245 | 246 | event_handler = custom_event_handler(meta) 247 | max_retries = event_handler.max_retries() 248 | 249 | lock = None 250 | 251 | logger.info("synchronizing " + entry_type + ". 
path = " + path) 252 | 253 | character_map = getattr(event_handler.get_module(), "character_map", None) 254 | path_requires_UnicodeEncodeError_handling = is_unicode_encode_error_path(path) 255 | 256 | # TODO: Pull out this logic into some functions 257 | if path_requires_UnicodeEncodeError_handling or character_map is not None: 258 | abspath = os.path.abspath(path) 259 | utf8_escaped_abspath = abspath.encode("utf8", "surrogateescape") 260 | b64_path_str = base64.b64encode(utf8_escaped_abspath) 261 | 262 | if path_requires_UnicodeEncodeError_handling: 263 | path = os.path.dirname(abspath) 264 | unicode_error_filename = "irods_UnicodeEncodeError_" + str( 265 | b64_path_str.decode("utf8") 266 | ) 267 | logger.warning( 268 | "s3_bucket_sync_entry raised UnicodeEncodeError while syncing path:" 269 | + str(utf8_escaped_abspath) 270 | ) 271 | meta["path"] = path 272 | meta["b64_path_str"] = b64_path_str 273 | meta["b64_reason"] = "UnicodeEncodeError" 274 | meta["unicode_error_filename"] = unicode_error_filename 275 | sync_key = str(b64_path_str.decode("utf8")) + ":" + target 276 | else: 277 | sync_key = path + ":" + target 278 | 279 | try: 280 | r = get_redis(config) 281 | lock = redis_lock.Lock(r, "sync_" + entry_type + ":" + sync_key) 282 | lock.acquire() 283 | 284 | sync_time_handle = sync_time_key_handle(r, sync_key) 285 | ignore_redis_cache = meta.get("ignore_cache", False) 286 | sync_time = None if ignore_redis_cache else sync_time_handle.get_value() 287 | 288 | mtime = meta["mtime"] 289 | ctime = meta["ctime"] 290 | 291 | if sync_time is not None and mtime < sync_time and ctime < sync_time: 292 | logger.info( 293 | "succeeded_" + entry_type + "_has_not_changed", 294 | task=meta["task"], 295 | path=path, 296 | ) 297 | return 298 | 299 | t = datetime.datetime.now().timestamp() 300 | logger.info( 301 | "synchronizing " + entry_type, path=path, t0=sync_time, t=t, ctime=ctime 302 | ) 303 | meta2 = meta.copy() 304 | if path == meta["root"]: 305 | if path_requires_UnicodeEncodeError_handling: 306 | # TODO(#250): This may not work on Windows... 307 | target2 = os.path.join(target, meta["unicode_error_filename"]) 308 | else: 309 | target2 = target 310 | else: 311 | # Strip prefix from S3 path 312 | prefix = meta["s3_prefix"] 313 | reg_path = path[path.index(prefix) + len(prefix) :].strip("/") 314 | # Construct S3 "logical path" 315 | target2 = "/".join([meta["target"], reg_path]) 316 | # Construct S3 "physical path" as: /bucket/objectname 317 | meta2["path"] = f"/{meta['root']}/{path}" 318 | 319 | # If the event handler has a character_map function, it should have returned a 320 | # structure (either a dict or a list/tuple of key-value tuples) to be used for 321 | # instantiating a collections.OrderedDict object. This object will dictate how 322 | # the logical path's characters are remapped. The re-mapping is performed 323 | # independently for each path element of the collection hierarchy. 
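# Illustrative example (hypothetical mapping and paths, not taken from this repository):
# with character_map() returning {" ": "_"}, a computed target such as
#     /tempZone/home/rods/my docs/report 1.txt
# is translated element by element to
#     /tempZone/home/rods/my_docs/report_1.txt
# and, because the logical name changed, b64_reason / b64_path_str_charmap are recorded
# below so the original name can be attached to the object as metadata.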
324 | 325 | if not path_requires_UnicodeEncodeError_handling and character_map is not None: 326 | translated_path = translate_path(target2, character_map()) 327 | # arrange for AVU to be attached only when logical name changes 328 | if translated_path != target2: 329 | target2 = translated_path 330 | meta2["b64_reason"] = "character_map" 331 | meta2["b64_path_str_charmap"] = b64_path_str 332 | 333 | meta2["target"] = target2 334 | 335 | if sync_time is None or mtime >= sync_time: 336 | datafunc(event_handler.get_module(), meta2, logger, True) 337 | logger.info("succeeded", task=meta["task"], path=path) 338 | else: 339 | metafunc(event_handler.get_module(), meta2, logger) 340 | logger.info("succeeded_metadata_only", task=meta["task"], path=path) 341 | sync_time_handle.set_value(str(t)) 342 | except Exception as err: 343 | event_handler = custom_event_handler(meta) 344 | retry_countdown = event_handler.delay(self.request.retries + 1) 345 | raise self.retry(max_retries=max_retries, exc=err, countdown=retry_countdown) 346 | finally: 347 | if lock is not None: 348 | lock.release() 349 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/irods/irods_capability_automated_ingest/da87d89acab7136ac5610195b0cc5e5abd983795/irods_capability_automated_ingest/test/__init__.py -------------------------------------------------------------------------------- /irods_capability_automated_ingest/test/test_lib.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from irods.session import iRODSSession 4 | 5 | from irods_capability_automated_ingest.redis_utils import get_redis 6 | 7 | # This is a global in order to take advantage of "caching" the Redis configuration. 8 | # Modify get_redis_config if changes are needed. 9 | redis_config = {} 10 | 11 | 12 | # TODO(#286): Derive from the environment? 13 | def get_redis_config(host="redis", port=6379, db=0): 14 | global redis_config 15 | if redis_config: 16 | return redis_config 17 | redis_config = {"redis": {"host": host, "port": port, "db": db}} 18 | return redis_config 19 | 20 | 21 | def clear_redis(): 22 | get_redis(get_redis_config()).flushdb() 23 | 24 | 25 | def get_test_irods_client_environment_dict(): 26 | # TODO(#286): Derive from the environment? 
27 | return { 28 | "host": os.environ.get("IRODS_HOST"), 29 | "port": os.environ.get("IRODS_PORT"), 30 | "user": os.environ.get("IRODS_USER_NAME"), 31 | "zone": os.environ.get("IRODS_ZONE_NAME"), 32 | "password": os.environ.get("IRODS_PASSWORD"), 33 | } 34 | 35 | 36 | def irmtrash(): 37 | # TODO(irods/python-irodsclient#182): Needs irmtrash endpoint 38 | with iRODSSession(**get_test_irods_client_environment_dict()) as session: 39 | rods_trash_path = "/".join( 40 | ["", session.zone, "trash", "home", session.username] 41 | ) 42 | rods_trash_coll = session.collections.get(rods_trash_path) 43 | for coll in rods_trash_coll.subcollections: 44 | delete_collection_if_exists(coll.path, recurse=True, force=True) 45 | 46 | 47 | def delete_collection_if_exists(coll, recurse=True, force=False): 48 | with iRODSSession(**get_test_irods_client_environment_dict()) as session: 49 | if session.collections.exists(coll): 50 | session.collections.remove(coll, recurse=recurse, force=force) 51 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/test/test_s3_bucket_scan.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import io 4 | import os 5 | import signal 6 | import shutil 7 | import subprocess 8 | import tempfile 9 | import textwrap 10 | import time 11 | 12 | from irods.data_object import irods_dirname, irods_basename 13 | from irods.exception import CollectionDoesNotExist 14 | from irods.meta import iRODSMeta 15 | from irods.models import Collection, DataObject 16 | from irods.session import iRODSSession 17 | 18 | from irods_capability_automated_ingest.celery import app 19 | from irods_capability_automated_ingest.redis_utils import get_redis 20 | from irods_capability_automated_ingest.sync_job import sync_job 21 | from irods_capability_automated_ingest.utils import DeleteMode, Operation 22 | import irods_capability_automated_ingest.examples 23 | 24 | from minio import Minio 25 | 26 | from . import test_lib 27 | 28 | # TODO(#286): Derive from the environment? 29 | # This must be set as an environment variable in order for the Celery workers to communicate with the broker. 30 | # Update this value if the hostname, port, or database for the Redis service needs to change. 31 | os.environ["CELERY_BROKER_URL"] = "redis://redis:6379/0" 32 | 33 | 34 | def start_workers(n=2, args=[]): 35 | if not args: 36 | args = ["-l", "info", "-Q", "restart,path,file"] 37 | workers = subprocess.Popen( 38 | [ 39 | "celery", 40 | "-A", 41 | "irods_capability_automated_ingest", 42 | "worker", 43 | "-c", 44 | str(n), 45 | # This option is needed because the worker coordination takes too long for running the tests between 46 | # standing up the workers and their being ready to execute tasks. 47 | "--without-mingle", 48 | ] 49 | + args 50 | ) 51 | return workers 52 | 53 | 54 | def wait_for_job_to_finish(workers, job_name, timeout=60): 55 | r = get_redis(test_lib.get_redis_config()) 56 | t0 = time.time() 57 | while timeout is None or time.time() - t0 < timeout: 58 | restart = r.llen("restart") 59 | i = app.control.inspect() 60 | act = i.active() 61 | if act is None: 62 | active = 0 63 | else: 64 | active = sum(map(len, act.values())) 65 | job_done = sync_job(job_name, r).done() 66 | if restart != 0 or active != 0 or not job_done: 67 | time.sleep(1) 68 | else: 69 | return 70 | # If we escape the loop, that means the job timed out. 
71 | raise TimeoutError( 72 | f"Timed out after [{timeout}] seconds waiting for job [{job_name}] to complete." 73 | ) 74 | 75 | 76 | class test_s3_sync_operations(unittest.TestCase): 77 | @classmethod 78 | def setUpClass(cls): 79 | cls.restart_queue_name = "s3_sync_restart" 80 | cls.path_queue_name = "s3_sync_path" 81 | cls.file_queue_name = "s3_sync_file" 82 | test_lib.clear_redis() 83 | test_lib.irmtrash() 84 | cls.workers = start_workers( 85 | args=[ 86 | "-l", 87 | "info", 88 | "-Q", 89 | f"{cls.restart_queue_name},{cls.path_queue_name},{cls.file_queue_name}", 90 | ] 91 | ) 92 | cls.irods_session = iRODSSession( 93 | **test_lib.get_test_irods_client_environment_dict() 94 | ) 95 | cls.job_name = "test_s3_sync_job" 96 | # TODO(#286): Derive this from the environment... 97 | cls.s3_endpoint_domain = "minio:19000" 98 | # TODO(#286): Derive these from the environment... 99 | cls.s3_access_key = "irods" 100 | cls.s3_secret_key = "irodsadmin" 101 | f = tempfile.NamedTemporaryFile("w+t", delete=False) 102 | # TODO(#264): This will not work on Windows... 103 | f.write(f"{cls.s3_access_key}\n{cls.s3_secret_key}") 104 | f.close() 105 | cls.s3_keypair_path = f.name 106 | # Establish a connection with Minio that persists for every test 107 | cls.minio_client = Minio( 108 | cls.s3_endpoint_domain, 109 | access_key=cls.s3_access_key, 110 | secret_key=cls.s3_secret_key, 111 | secure=False, 112 | ) 113 | cls.bucket_name = "test-s3-put-sync-operation-bucket" 114 | cls.source_path = f"/{cls.bucket_name}" 115 | cls.minio_client.make_bucket(cls.bucket_name) 116 | cls.objects_list = { 117 | "/".join(["shallow_subfolder", "shallow_object.txt"]), 118 | "/".join(["deep_subfolder", "a", "b", "c", "object_c.txt"]), 119 | "/".join(["deep_subfolder", "x", "y", "z", "object_z.txt"]), 120 | "/".join(["top_level_object.txt"]), 121 | } 122 | 123 | @classmethod 124 | def tearDownClass(cls): 125 | test_lib.clear_redis() 126 | test_lib.irmtrash() 127 | cls.irods_session.cleanup() 128 | cls.workers.send_signal(signal.SIGINT) 129 | cls.workers.wait() 130 | cls.minio_client.remove_bucket(cls.bucket_name) 131 | 132 | def create_objects(self, objects_list): 133 | for obj in objects_list: 134 | # The prefix is everything between the bucket name and the "basename" of the object "path". 
135 | self.minio_client.put_object( 136 | self.bucket_name, obj, data=io.BytesIO(obj.encode()), length=len(obj) 137 | ) 138 | 139 | def setUp(self): 140 | self.create_objects(self.objects_list) 141 | self.destination_collection = "/".join( 142 | [ 143 | "", 144 | self.irods_session.zone, 145 | "home", 146 | self.irods_session.username, 147 | "s3_sync_collection", 148 | ] 149 | ) 150 | 151 | def tearDown(self): 152 | objects = list(self.minio_client.list_objects(self.bucket_name, recursive=True)) 153 | for obj in objects: 154 | self.minio_client.remove_object(self.bucket_name, obj.object_name) 155 | test_lib.delete_collection_if_exists( 156 | self.destination_collection, recurse=True, force=True 157 | ) 158 | 159 | @staticmethod 160 | def get_event_handler(operation): 161 | operation_strings = { 162 | Operation.NO_OP: "NO_OP", 163 | Operation.REGISTER_SYNC: "REGISTER_SYNC", 164 | Operation.REGISTER_AS_REPLICA_SYNC: "REGISTER_AS_REPLICA_SYNC", 165 | Operation.PUT: "PUT", 166 | Operation.PUT_SYNC: "PUT_SYNC", 167 | Operation.PUT_APPEND: "PUT_APPEND", 168 | } 169 | return textwrap.dedent( 170 | f""" 171 | from irods_capability_automated_ingest.core import Core 172 | from irods_capability_automated_ingest.utils import DeleteMode, Operation 173 | class event_handler(Core): 174 | @staticmethod 175 | def operation(session, meta, **options): 176 | return Operation.{operation_strings[operation]} 177 | """ 178 | ) 179 | 180 | def run_sync( 181 | self, 182 | source_path, 183 | destination_collection, 184 | event_handler_path, 185 | job_name=None, 186 | ignore_cache=False, 187 | files_per_task=1, 188 | log_level=None, 189 | queue_names=tuple(), 190 | expected_failure_count=None, 191 | ): 192 | sync_script = "irods_capability_automated_ingest.irods_sync" 193 | # Construct an invocation of the sync script with various options. 194 | command = [ 195 | "python", 196 | "-m", 197 | sync_script, 198 | "start", 199 | source_path, 200 | destination_collection, 201 | "--event_handler", 202 | event_handler_path, 203 | "--files_per_task", 204 | str(files_per_task), 205 | "--s3_keypair", 206 | self.s3_keypair_path, 207 | "--s3_endpoint_domain", 208 | self.s3_endpoint_domain, 209 | "--s3_insecure_connection", 210 | ] 211 | if ignore_cache: 212 | command.append("--ignore_cache") 213 | if log_level: 214 | command.extend(["--log_level", log_level]) 215 | # The test workers watch non-default queue names so that no other Celery workers which happen to be watching 216 | # the same Redis database will pick up the work. 217 | if not queue_names: 218 | queue_names = tuple( 219 | [self.restart_queue_name, self.path_queue_name, self.file_queue_name] 220 | ) 221 | command.extend(["--restart_queue", queue_names[0]]) 222 | command.extend(["--path_queue", queue_names[1]]) 223 | command.extend(["--file_queue", queue_names[2]]) 224 | # job_name is required so that we can track the sync job and its failed tasks even after it has completed. 225 | if not job_name: 226 | job_name = self.job_name 227 | command.extend(["--job_name", job_name]) 228 | # Now, schedule the job... 229 | proc = subprocess.Popen(command) 230 | proc.wait() 231 | # ...and then wait for the workers to complete the tasks. 232 | try: 233 | wait_for_job_to_finish(self.workers, job_name) 234 | except TimeoutError as e: 235 | self.fail(e) 236 | # Assert that the expected number of failed tasks for this job are found. A value of None means no tasks 237 | # failed for this job. 
238 |         self.assertEqual(
239 |             sync_job(job_name, get_redis(test_lib.get_redis_config()))
240 |             .failures_handle()
241 |             .get_value(),
242 |             expected_failure_count,
243 |         )
244 | 
245 |     def assert_ingested_contents_exist_in_irods(self):
246 |         for obj in self.objects_list:
247 |             self.assertTrue(
248 |                 self.irods_session.data_objects.exists(
249 |                     "/".join([self.destination_collection, obj])
250 |                 )
251 |             )
252 | 
253 |     def test_s3_with_put(self):
254 |         operation = Operation.PUT
255 |         new_object_name = "test_s3_with_put"
256 |         event_handler_contents = test_s3_sync_operations.get_event_handler(operation)
257 |         with tempfile.NamedTemporaryFile() as tf:
258 |             event_handler_path = tf.name
259 |             with open(event_handler_path, "w") as f:
260 |                 f.write(event_handler_contents)
261 |             # Run the first sync and confirm that everything was ingested properly.
262 |             self.run_sync(
263 |                 self.source_path, self.destination_collection, event_handler_path
264 |             )
265 |             self.assert_ingested_contents_exist_in_irods()
266 |             try:
267 |                 self.minio_client.put_object(
268 |                     self.bucket_name,
269 |                     new_object_name,
270 |                     data=io.BytesIO(new_object_name.encode()),
271 |                     length=len(new_object_name),
272 |                 )
273 |                 self.run_sync(
274 |                     self.source_path, self.destination_collection, event_handler_path
275 |                 )
276 |                 self.assert_ingested_contents_exist_in_irods()
277 |                 self.assertTrue(
278 |                     self.irods_session.data_objects.exists(
279 |                         "/".join([self.destination_collection, new_object_name])
280 |                     )
281 |                 )
282 |             finally:
283 |                 self.minio_client.remove_object(self.bucket_name, new_object_name)
284 | 
285 |     def test_s3_with_put_sync(self):
286 |         operation = Operation.PUT_SYNC
287 |         new_object_name = "test_s3_with_put_sync"
288 |         event_handler_contents = test_s3_sync_operations.get_event_handler(operation)
289 |         with tempfile.NamedTemporaryFile() as tf:
290 |             event_handler_path = tf.name
291 |             with open(event_handler_path, "w") as f:
292 |                 f.write(event_handler_contents)
293 |             # Run the first sync and confirm that everything was ingested properly.
294 |             self.run_sync(
295 |                 self.source_path, self.destination_collection, event_handler_path
296 |             )
297 |             self.assert_ingested_contents_exist_in_irods()
298 |             try:
299 |                 self.minio_client.put_object(
300 |                     self.bucket_name,
301 |                     new_object_name,
302 |                     data=io.BytesIO(new_object_name.encode()),
303 |                     length=len(new_object_name),
304 |                 )
305 |                 self.run_sync(
306 |                     self.source_path, self.destination_collection, event_handler_path
307 |                 )
308 |                 self.assert_ingested_contents_exist_in_irods()
309 |                 self.assertTrue(
310 |                     self.irods_session.data_objects.exists(
311 |                         "/".join([self.destination_collection, new_object_name])
312 |                     )
313 |                 )
314 |             finally:
315 |                 self.minio_client.remove_object(self.bucket_name, new_object_name)
316 | 
317 |     def test_s3_with_put_append(self):
318 |         operation = Operation.PUT_APPEND
319 |         new_object_name = "test_s3_with_put_append"
320 |         event_handler_contents = test_s3_sync_operations.get_event_handler(operation)
321 |         with tempfile.NamedTemporaryFile() as tf:
322 |             event_handler_path = tf.name
323 |             with open(event_handler_path, "w") as f:
324 |                 f.write(event_handler_contents)
325 |             # Run the first sync and confirm that everything was ingested properly.
326 | self.run_sync( 327 | self.source_path, self.destination_collection, event_handler_path 328 | ) 329 | self.assert_ingested_contents_exist_in_irods() 330 | try: 331 | self.minio_client.put_object( 332 | self.bucket_name, 333 | new_object_name, 334 | data=io.BytesIO(new_object_name.encode()), 335 | length=len(new_object_name), 336 | ) 337 | self.run_sync( 338 | self.source_path, self.destination_collection, event_handler_path 339 | ) 340 | self.assert_ingested_contents_exist_in_irods() 341 | self.assertTrue( 342 | self.irods_session.data_objects.exists( 343 | "/".join([self.destination_collection, new_object_name]) 344 | ) 345 | ) 346 | finally: 347 | self.minio_client.remove_object(self.bucket_name, new_object_name) 348 | 349 | def test_s3_with_register_sync(self): 350 | operation = Operation.REGISTER_SYNC 351 | new_object_name = "test_s3_with_register_sync" 352 | event_handler_contents = test_s3_sync_operations.get_event_handler(operation) 353 | with tempfile.NamedTemporaryFile() as tf: 354 | event_handler_path = tf.name 355 | with open(event_handler_path, "w") as f: 356 | f.write(event_handler_contents) 357 | # Run the first sync and confirm that everything was ingested properly. 358 | self.run_sync( 359 | self.source_path, self.destination_collection, event_handler_path 360 | ) 361 | self.assert_ingested_contents_exist_in_irods() 362 | try: 363 | self.minio_client.put_object( 364 | self.bucket_name, 365 | new_object_name, 366 | data=io.BytesIO(new_object_name.encode()), 367 | length=len(new_object_name), 368 | ) 369 | self.run_sync( 370 | self.source_path, self.destination_collection, event_handler_path 371 | ) 372 | self.assert_ingested_contents_exist_in_irods() 373 | self.assertTrue( 374 | self.irods_session.data_objects.exists( 375 | "/".join([self.destination_collection, new_object_name]) 376 | ) 377 | ) 378 | finally: 379 | self.minio_client.remove_object(self.bucket_name, new_object_name) 380 | 381 | def test_s3_with_register_as_replica_sync(self): 382 | operation = Operation.REGISTER_AS_REPLICA_SYNC 383 | new_object_name = "test_s3_with_register_as_replica_sync" 384 | event_handler_contents = test_s3_sync_operations.get_event_handler(operation) 385 | with tempfile.NamedTemporaryFile() as tf: 386 | event_handler_path = tf.name 387 | with open(event_handler_path, "w") as f: 388 | f.write(event_handler_contents) 389 | # Run the first sync and confirm that everything was ingested properly. 
390 | self.run_sync( 391 | self.source_path, self.destination_collection, event_handler_path 392 | ) 393 | self.assert_ingested_contents_exist_in_irods() 394 | try: 395 | self.minio_client.put_object( 396 | self.bucket_name, 397 | new_object_name, 398 | data=io.BytesIO(new_object_name.encode()), 399 | length=len(new_object_name), 400 | ) 401 | self.run_sync( 402 | self.source_path, self.destination_collection, event_handler_path 403 | ) 404 | self.assert_ingested_contents_exist_in_irods() 405 | self.assertTrue( 406 | self.irods_session.data_objects.exists( 407 | "/".join([self.destination_collection, new_object_name]) 408 | ) 409 | ) 410 | finally: 411 | self.minio_client.remove_object(self.bucket_name, new_object_name) 412 | 413 | def test_register_to_deep_nonexistent_subcollection_does_not_hang_forever__issue_124( 414 | self, 415 | ): 416 | operation = Operation.REGISTER_SYNC 417 | event_handler_contents = test_s3_sync_operations.get_event_handler(operation) 418 | # The destination collection needs to have enough path elements to exceed the number of path elements in 419 | # the "path" to the S3 object. 420 | nested_destination_collection = "/".join( 421 | [self.destination_collection, "a", "b", "c", "d", "e"] 422 | ) 423 | with tempfile.NamedTemporaryFile() as tf: 424 | event_handler_path = tf.name 425 | with open(event_handler_path, "w") as f: 426 | f.write(event_handler_contents) 427 | # Run the first sync and confirm that everything was ingested properly. 428 | self.run_sync( 429 | self.source_path, nested_destination_collection, event_handler_path 430 | ) 431 | for obj in self.objects_list: 432 | self.assertTrue( 433 | self.irods_session.data_objects.exists( 434 | "/".join([nested_destination_collection, obj]) 435 | ) 436 | ) 437 | 438 | 439 | def main(): 440 | unittest.main() 441 | 442 | 443 | if __name__ == "__main__": 444 | main() 445 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/utils.py: -------------------------------------------------------------------------------- 1 | from . 
import sync_logging 2 | from .sync_job import sync_job 3 | from .custom_event_handler import custom_event_handler 4 | from uuid import uuid1 5 | 6 | from enum import Enum 7 | import os 8 | import stat 9 | 10 | 11 | class Operation(Enum): 12 | REGISTER_SYNC = 0 13 | REGISTER_AS_REPLICA_SYNC = 1 14 | PUT = 2 15 | PUT_SYNC = 3 16 | PUT_APPEND = 4 17 | NO_OP = 5 18 | 19 | 20 | class DeleteMode(Enum): 21 | DO_NOT_DELETE = 0 22 | UNREGISTER = 1 23 | TRASH = 2 24 | NO_TRASH = 3 25 | 26 | 27 | def delete_mode_is_compatible_with_operation(delete_mode, operation): 28 | operation_to_acceptable_delete_modes = { 29 | Operation.NO_OP: [ 30 | DeleteMode.DO_NOT_DELETE, 31 | ], 32 | Operation.REGISTER_SYNC: [ 33 | DeleteMode.DO_NOT_DELETE, 34 | DeleteMode.UNREGISTER, 35 | ], 36 | Operation.REGISTER_AS_REPLICA_SYNC: [ 37 | DeleteMode.DO_NOT_DELETE, 38 | DeleteMode.UNREGISTER, 39 | ], 40 | Operation.PUT: [ 41 | DeleteMode.DO_NOT_DELETE, 42 | ], 43 | Operation.PUT_SYNC: [ 44 | DeleteMode.DO_NOT_DELETE, 45 | DeleteMode.TRASH, 46 | DeleteMode.NO_TRASH, 47 | ], 48 | Operation.PUT_APPEND: [ 49 | DeleteMode.DO_NOT_DELETE, 50 | DeleteMode.TRASH, 51 | DeleteMode.NO_TRASH, 52 | ], 53 | } 54 | return delete_mode in operation_to_acceptable_delete_modes.get(operation, []) 55 | 56 | 57 | def enqueue_task(task, meta): 58 | logger = sync_logging.get_sync_logger(meta["config"]["log"]) 59 | job = sync_job.from_meta(meta) 60 | if job.stop_handle().get_value() is None: 61 | logger.info( 62 | "incr_job_name", task=meta["task"], path=meta["path"], job_name=job.name() 63 | ) 64 | job.tasks_handle().incr() 65 | task_id = str(uuid1()) 66 | timeout = custom_event_handler(meta).timeout() 67 | job.count_handle().rpush(task_id) 68 | task.s(meta).apply_async( 69 | queue=meta["queue_name"], task_id=task_id, soft_time_limit=timeout 70 | ) 71 | else: 72 | # A job by this name is currently being stopped 73 | logger.info( 74 | "async_job_name_stopping", 75 | task=meta["task"], 76 | path=meta["path"], 77 | job_name=job.name(), 78 | ) 79 | 80 | 81 | # Attempt to encode full physical path on local filesystem 82 | # Special handling required for non-encodable strings which raise UnicodeEncodeError 83 | def is_unicode_encode_error_path(path): 84 | try: 85 | _ = path.encode("utf8") 86 | except UnicodeEncodeError: 87 | return True 88 | return False 89 | -------------------------------------------------------------------------------- /irods_capability_automated_ingest/version.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.6.0" 2 | -------------------------------------------------------------------------------- /profile/README.md: -------------------------------------------------------------------------------- 1 | === log profile === 2 | 3 | `--profile` `--profile_filename ` `--profile_level INFO` 4 | 5 | === elasticsearch === 6 | 7 | `config/elasticsearch.yml` 8 | 9 | add 10 | 11 | ``` 12 | http.cors.enabled: true 13 | http.cors.allow-origin: "*" 14 | ``` 15 | 16 | === ingest === 17 | 18 | ``` 19 | pip install elasticsearch 20 | ``` 21 | 22 | ``` 23 | python profile.py [ --elasticsearch_host ] [ --additional_key ] 24 | ``` 25 | 26 | === visualize === 27 | 28 | firefox profile.html 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /profile/profile.css: -------------------------------------------------------------------------------- 1 | .vis-item.sync_file { 2 | background-color: #ff8888; 3 | border-color: red; 4 | } 5 | 6 | 7 | 
.vis-item.sync_dir { 8 | background-color: #88ff88; 9 | border-color: green; 10 | } 11 | 12 | .vis-item.sync_path { 13 | background-color: #8888ff; 14 | border-color: blue; 15 | } 16 | 17 | .vis-item.restart { 18 | background-color: grey; 19 | border-color: yellow; 20 | } 21 | 22 | .vis-item.list_dir { 23 | background-color: #ff8800; 24 | border-color: orange; 25 | } -------------------------------------------------------------------------------- /profile/profile.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | index: 13 |
14 | min: 15 | 16 | max: 17 | 18 |
19 | start: 20 | duration (ms): 21 | #events: 22 | 23 |
24 | 25 | 26 | -------------------------------------------------------------------------------- /profile/profile.js: -------------------------------------------------------------------------------- 1 | function drawChart(){ 2 | let startDate = new Date(document.getElementById('start').value) 3 | if (isNaN(startDate)) 4 | startDate = undefined 5 | let duration = parseInt(document.getElementById('duration').value) 6 | let finishDate = startDate == undefined || duration == undefined ? undefined : new Date(startDate.valueOf() + duration) 7 | let index = document.getElementById('index').value; 8 | if (isNaN(finishDate)) 9 | finishDate = undefined 10 | drawChart2(index, startDate, finishDate); 11 | } 12 | 13 | function getMinAndMaxDate() { 14 | var index = document.getElementById('index').value 15 | let json = { 16 | aggs: { 17 | minDate : { 18 | min : { 19 | field : "start" 20 | } 21 | }, 22 | maxDate : { 23 | max : { 24 | field : "finish" 25 | } 26 | } 27 | } 28 | } 29 | $.ajax({ 30 | type: "POST", 31 | contentType: "application/json", 32 | dataType: "json", 33 | url: "http://localhost:9200/" + index + "/_search?size=0", 34 | data: JSON.stringify(json) 35 | }).done(results => { 36 | let minDate = document.getElementById("minDate") 37 | let maxDate = document.getElementById("maxDate") 38 | minDate.innerHTML = results["aggregations"]["minDate"]["value_as_string"] 39 | maxDate.innerHTML = results["aggregations"]["maxDate"]["value_as_string"] 40 | }) 41 | 42 | } 43 | 44 | function getMin() { 45 | let minDate = document.getElementById("minDate") 46 | return minDate.innerHTML 47 | } 48 | 49 | function getMax() { 50 | let minDate = document.getElementById("maxDate") 51 | return minDate.innerHTML 52 | } 53 | 54 | function setStart(value) { 55 | let startDate = document.getElementById('start') 56 | startDate.value = value 57 | } 58 | 59 | function setFinish(value) { 60 | let startDate = document.getElementById('finish') 61 | startDate.value = value 62 | } 63 | 64 | function groupName(obj) { 65 | let index = obj["index"] 66 | let indexString = "" 67 | if (index < 10) { 68 | indexString = "0" + index 69 | } else { 70 | indexString = "" + index 71 | } 72 | return obj["hostname"]+"/" + indexString 73 | } 74 | 75 | function drawChart2(index, startDate, finishDate) { 76 | const batchsize = 10000 77 | const hits = [] 78 | const json = { 79 | size: batchsize, 80 | query: { 81 | bool: { 82 | should: [ 83 | { 84 | range:{ 85 | start: { 86 | gte: startDate, 87 | lte: finishDate 88 | } 89 | } 90 | }, { 91 | range: { 92 | finish: { 93 | gte: startDate, 94 | lte: finishDate 95 | } 96 | } 97 | } 98 | ], 99 | minimum_should_match: 1 100 | } 101 | } 102 | } 103 | 104 | const handleResults = (sid, remaining, data) => { 105 | data.forEach(h => { 106 | hits.push(h["_source"]) 107 | remaining-- 108 | }) 109 | if(remaining !== 0) { 110 | scroll(sid, remaining) 111 | } else { 112 | showTable(startDate, finishDate, hits) 113 | } 114 | } 115 | 116 | const scroll = (sid, remaining) => { 117 | let json = { 118 | scroll: "1m", 119 | scroll_id: sid 120 | } 121 | $.ajax({ 122 | type: "POST", 123 | contentType: "application/json", 124 | dataType: "json", 125 | url: "http://localhost:9200/_search/scroll", 126 | data: JSON.stringify(json) 127 | }).done(results => { 128 | handleResults(results["_scroll_id"], remaining, results["hits"]["hits"]) 129 | }).fail((a,b,c) => { 130 | console.log(b) 131 | console.log(c) 132 | }) 133 | } 134 | 135 | $.ajax({ 136 | type: "POST", 137 | contentType: "application/json", 138 | dataType: "json", 139 | 
url: "http://localhost:9200/" + index + "/_search?scroll=1m", 140 | data: JSON.stringify(json) 141 | }).done(results => { 142 | const data = results["hits"] 143 | const total = data["total"] 144 | handleResults(results["_scroll_id"], total, data["hits"]) 145 | }).fail((a,b,c) => { 146 | console.log(b) 147 | console.log(c) 148 | }) 149 | } 150 | 151 | function showTable(startDate, finishDate, hits){ 152 | const container = document.getElementById("visualization") 153 | const groupNames0 = new Set() 154 | 155 | hits.forEach(obj => { 156 | groupNames0.add(groupName(obj)) 157 | }) 158 | 159 | const groupNames = Array.from(groupNames0).sort() 160 | const groups = new vis.DataSet() 161 | const groupMap = {} 162 | for(let g = 0; g < groupNames.length; g++) { 163 | groups.add({id: g, content: groupNames[g]}) 164 | groupMap[groupNames[g]] = g 165 | } 166 | 167 | const colorMap = {} 168 | colorMap["irods_capability_automated_ingest.sync_task.sync_file"] = 'sync_file'; 169 | colorMap["irods_capability_automated_ingest.sync_task.sync_dir"] = 'sync_dir'; 170 | colorMap["irods_capability_automated_ingest.sync_task.sync_path"] = 'sync_path'; 171 | colorMap["irods_capability_automated_ingest.sync_task.restart"] = 'restart'; 172 | colorMap["list_dir"] = 'list_dir'; 173 | 174 | let count = hits.length 175 | document.getElementById("numEvents").innerHTML = count 176 | 177 | const items = new vis.DataSet() 178 | hits.forEach((obj, index) => { 179 | let task_id = obj["event_id"] 180 | let task_name = obj["event_name"] 181 | let start=obj["start"] 182 | let finish=obj["finish"] 183 | let path=obj["path"] 184 | let target=obj["target"] 185 | let taskStartDate = new Date(start) 186 | let taskEndDate = new Date(finish) 187 | items.add({ 188 | id: index, 189 | group: groupMap[groupName(obj)], 190 | content: task_id, 191 | title: `${task_id}
path: ${path}
target: ${target}
start: ${taskStartDate}
finish: ${taskEndDate}`, 192 | start: taskStartDate, 193 | end: taskEndDate, 194 | className: colorMap[task_name] 195 | }) 196 | }) 197 | 198 | let options = { 199 | tooltip: { 200 | overflowMethod: "cap" 201 | }, 202 | moveable: true, 203 | zoomable: true, 204 | selectable: false, 205 | showCurrentTime: false, 206 | stack: false, 207 | groupOrder: "content" 208 | } 209 | if(startDate !== undefined) { 210 | options["min"] = startDate 211 | options["start"] = startDate 212 | options["end"] = new Date(startDate.valueOf() + 1000) 213 | } 214 | if(finishDate !== undefined) { 215 | options["max"] = finishDate 216 | } 217 | 218 | container.innerHTML = "" 219 | let timeline = new vis.Timeline(container) 220 | timeline.setOptions(options) 221 | timeline.setGroups(groups) 222 | timeline.setItems(items) 223 | } 224 | -------------------------------------------------------------------------------- /profile/profile.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | from elasticsearch import Elasticsearch 4 | from elasticsearch.helpers import bulk 5 | import argparse 6 | 7 | parser = argparse.ArgumentParser(description='Ingest profile data into Elasticsearch') 8 | parser.add_argument('input_file', metavar='INPUT FILE', type=str, 9 | help='input file') 10 | parser.add_argument('--elasticsearch_host', metavar='ELASTICSEARCH HOST', type=str, default="localhost", 11 | help='elasticsearch host') 12 | parser.add_argument('elasticsearch_index', metavar='ELASTICSEARCH INDEX', type=str, 13 | help='elasticsearch index') 14 | parser.add_argument('--additional_key', dest='keys', action='store', nargs="*", default=[], 15 | help='additional key') 16 | 17 | args = parser.parse_args() 18 | 19 | input_file = args.input_file 20 | keys = args.keys 21 | output = args.elasticsearch_host 22 | index = args.elasticsearch_index 23 | 24 | es = Elasticsearch(output) 25 | 26 | try: 27 | es.indices.create(index, body={ 28 | "mappings": { 29 | "document": { 30 | "properties": { 31 | "hostname": { 32 | "type": "keyword" 33 | } 34 | } 35 | } 36 | } 37 | }) 38 | except Exception as e: 39 | print(e) 40 | 41 | def task_action(): 42 | 43 | task_buf = {} 44 | task_counter = {} 45 | 46 | i = 0 47 | with open(input_file, "r") as f: 48 | 49 | line = f.readline().rstrip("\n") 50 | while line != "": 51 | obj = json.loads(line) 52 | 53 | event_id = obj["event_id"] 54 | # print(obj) 55 | buf = task_buf.get(event_id) 56 | if buf is None: 57 | task_buf[event_id] = obj 58 | else: 59 | del task_buf[event_id] 60 | if obj["event"] == "task_prerun": 61 | start = obj["@timestamp"] 62 | finish = buf["@timestamp"] 63 | else: 64 | start = buf["@timestamp"] 65 | finish = obj["@timestamp"] 66 | 67 | event_name = obj["event_name"] 68 | di = { 69 | "start": start, 70 | "finish": finish, 71 | "hostname": obj["hostname"], 72 | "index": obj["index"], 73 | "event_name": event_name, 74 | "event_id": obj["event_id"], 75 | "path": obj.get("path"), 76 | "target": obj.get("target") 77 | } 78 | 79 | for key in keys: 80 | di[key] = obj[key] 81 | 82 | d = { 83 | "_index": index, 84 | "_type": "document", 85 | "_source": di 86 | } 87 | i += 1 88 | print(i) 89 | if event_name in task_counter: 90 | task_counter[event_name] += 1 91 | else: 92 | task_counter[event_name] = 1 93 | yield d 94 | line = f.readline().rstrip("\n") 95 | if len(task_buf) != 0: 96 | print(task_buf) 97 | 98 | print(task_counter) 99 | 100 | 101 | bulk(es, task_action()) 102 | 
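
profile.py above pairs the `task_prerun`/`task_postrun` records written when the ingest tool runs with the `--profile` options described in profile/README.md, matching them on `event_id` and indexing one Elasticsearch document per task. Per its argparse setup it is invoked as `python profile.py <profile log> <elasticsearch index> [--elasticsearch_host <host>] [--additional_key <key> ...]`. The sketch below illustrates that pairing; every field value is invented for illustration and is not taken from a real profile log.

```python
# Illustrative sketch only: the record values below are made up. Real records
# come from the ingest tool's --profile log; profile.py matches them on event_id.
import json

prerun = {
    "event": "task_prerun",
    "event_id": "0f2c9a7e-example",
    "event_name": "irods_capability_automated_ingest.sync_task.sync_file",
    "@timestamp": "2024-10-14T12:00:00.000Z",
    "hostname": "worker-1",
    "index": 3,
    "path": "/data/landing_zone/file1.dat",
    "target": "/tempZone/home/rods/landing_zone/file1.dat",
}
postrun = dict(prerun)
postrun["event"] = "task_postrun"
postrun["@timestamp"] = "2024-10-14T12:00:01.250Z"

# task_action() buffers the first record it sees for an event_id; when the
# partner record arrives, it emits one document whose start/finish come from
# the two @timestamp values, plus the fields profile.js uses for grouping.
document = {
    "start": prerun["@timestamp"],
    "finish": postrun["@timestamp"],
    "hostname": postrun["hostname"],
    "index": postrun["index"],
    "event_name": postrun["event_name"],
    "event_id": postrun["event_id"],
    "path": postrun.get("path"),
    "target": postrun.get("target"),
}
print(json.dumps(document, indent=2))
```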
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | amqp==5.2.0 2 | argon2-cffi==23.1.0 3 | argon2-cffi-bindings==21.2.0 4 | billiard==4.2.1 5 | celery==5.4.0 6 | certifi==2024.8.30 7 | cffi==1.17.1 8 | click==8.1.7 9 | click-didyoumean==0.3.1 10 | click-plugins==1.1.1 11 | click-repl==0.3.0 12 | defusedxml==0.7.1 13 | irods-capability-automated-ingest==0.6.0 14 | kombu==5.4.2 15 | minio==7.2.10 16 | prettytable==3.11.0 17 | progressbar2==4.5.0 18 | prompt_toolkit==3.0.48 19 | pycparser==2.22 20 | pycryptodome==3.21.0 21 | python-dateutil==2.9.0.post0 22 | python-irodsclient==2.2.0 23 | python-redis-lock==4.0.0 24 | python-utils==3.9.0 25 | redis==4.6.0 26 | six==1.16.0 27 | structlog==24.4.0 28 | typing_extensions==4.12.2 29 | tzdata==2024.2 30 | urllib3==2.2.3 31 | vine==5.1.0 32 | wcwidth==0.2.13 33 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import codecs 3 | from os import path 4 | 5 | # Get package version 6 | version = {} 7 | here = path.abspath(path.dirname(__file__)) 8 | with open(path.join(here, "irods_capability_automated_ingest/version.py")) as f: 9 | exec(f.read(), version) 10 | 11 | # Get the long description from the README file 12 | with codecs.open(path.join(here, "README.md"), "r", "utf-8") as f: 13 | long_description = f.read() 14 | 15 | setup( 16 | name="irods-capability-automated-ingest", 17 | version=version["__version__"], 18 | description="Implement filesystem scanners and landing zones", 19 | long_description=long_description, 20 | long_description_content_type="text/markdown", 21 | url="https://github.com/irods/irods_capability_automated_ingest", 22 | author="iRODS Consortium", 23 | author_email="support@irods.org", 24 | license="BSD", 25 | python_requires=">=3.8,", 26 | classifiers=[ 27 | "Development Status :: 4 - Beta", 28 | "License :: OSI Approved :: BSD License", 29 | "Natural Language :: English", 30 | "Operating System :: POSIX :: Linux", 31 | "Programming Language :: Python", 32 | "Programming Language :: Python :: 3 :: Only", 33 | "Programming Language :: Python :: 3", 34 | "Programming Language :: Python :: 3.8", 35 | "Programming Language :: Python :: 3.9", 36 | "Programming Language :: Python :: 3.10", 37 | "Programming Language :: Python :: 3.11", 38 | "Programming Language :: Python :: 3.12", 39 | ], 40 | keywords="irods automated ingest landingzone filesystem", 41 | packages=find_packages(), 42 | install_requires=[ 43 | "minio", 44 | "python-irodsclient<3.0.0", 45 | "python-redis-lock>=3.2.0", 46 | "redis>=3.4.1, <5.0.0", 47 | "celery[redis]<6.0.0", 48 | "structlog>=18.1.0", 49 | "progressbar2", 50 | ], 51 | setup_requires=["setuptools>=38.6.0"], 52 | entry_points={ 53 | "console_scripts": [ 54 | "irods_capability_automated_ingest=irods_capability_automated_ingest.irods_sync:main" 55 | ], 56 | }, 57 | project_urls={ 58 | "Bug Reports": "https://github.com/irods/irods_capability_automated_ingest/issues", 59 | "Source": "https://github.com/irods/irods_capability_automated_ingest", 60 | }, 61 | ) 62 | 
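
Tying the pieces above together: utils.py defines the `Operation` and `DeleteMode` enums and `delete_mode_is_compatible_with_operation`, and an ingest job is customized by an event handler module like those under irods_capability_automated_ingest/examples/. Below is a minimal sketch of such a module. The `operation` signature follows the handler generated in test_s3_bucket_scan.py; `to_resource` is modeled on the `*_with_resc_name.py` example filenames, whose contents are not reproduced in this listing, so treat its exact signature as an assumption, and the resource name `demoResc` is a placeholder.

```python
# Minimal event handler sketch. "demoResc" is a placeholder resource name, and
# to_resource() is an assumed hook modeled on examples/*_with_resc_name.py
# (those files are not shown above).
from irods_capability_automated_ingest.core import Core
from irods_capability_automated_ingest.utils import DeleteMode, Operation


class event_handler(Core):
    @staticmethod
    def operation(session, meta, **options):
        # PUT_SYNC uploads new files and re-uploads changed ones; utils.py
        # marks it as compatible with the TRASH and NO_TRASH delete modes.
        return Operation.PUT_SYNC

    @staticmethod
    def to_resource(session, meta, **options):
        return "demoResc"
```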
--------------------------------------------------------------------------------
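
Finally, a sketch of scheduling a sync job against a local directory, mirroring how `run_sync()` in test_s3_bucket_scan.py drives the tool (minus the S3-specific flags). The source directory, destination collection, event handler path, and job name are placeholders, and, as in the tests, `CELERY_BROKER_URL` must point the scheduler and the Celery workers at the same Redis broker.

```python
# Illustrative sketch: all paths and names are placeholders. The flags mirror
# those assembled by run_sync() in test_s3_bucket_scan.py above.
import subprocess

command = [
    "python",
    "-m",
    "irods_capability_automated_ingest.irods_sync",
    "start",
    "/data/landing_zone",                # source directory to scan
    "/tempZone/home/rods/landing_zone",  # destination iRODS collection
    "--event_handler",
    "/path/to/event_handler.py",
    "--job_name",
    "landing_zone_sync",
    "--files_per_task",
    "50",
    "--log_level",
    "INFO",
]
subprocess.run(command, check=True)
```

setup.py also installs a console script named `irods_capability_automated_ingest` pointing at the same `irods_sync:main` entry point, so once the package is installed the `python -m ...` prefix can be replaced by that command.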