├── Makefile.am ├── config.cfg.dist.rados ├── config.cfg.dist.s3 ├── .gitignore ├── src ├── Makefile.am ├── lhsmtool_rados.h ├── lhsmtool_s3.h ├── ct_common.h ├── ct_common.c ├── lhsmtool_rados.c └── lhsmtool_s3.c ├── patches └── libs3_low_speed_limit.patch ├── configure.ac ├── README.md └── LICENSE /Makefile.am: -------------------------------------------------------------------------------- 1 | AUTOMAKE_OPTIONS = foreign 2 | SUBDIRS = src 3 | -------------------------------------------------------------------------------- /config.cfg.dist.rados: -------------------------------------------------------------------------------- 1 | cluster_name = "ceph"; 2 | user_name = "client.admin"; 3 | pool_name = "hsm"; 4 | config_file = "/etc/ceph/ceph.conf"; 5 | chunk_size = 4194304; 6 | -------------------------------------------------------------------------------- /config.cfg.dist.s3: -------------------------------------------------------------------------------- 1 | access_key = ""; 2 | secret_key = ""; 3 | host = "localhost:80"; 4 | bucket_prefix = "lustre_hsm"; 5 | bucket_count = 8; 6 | chunk_size = 104857600; 7 | ssl = false 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | Makefile 2 | Makefile.in 3 | aclocal.m4 4 | autom4te.cache/ 5 | config.cfg 6 | config.log 7 | config.status 8 | configure 9 | depcomp 10 | install-sh 11 | missing 12 | src/.deps/ 13 | src/copytoolRados 14 | src/copytoolS3 15 | src/*.o 16 | src/*.a 17 | -------------------------------------------------------------------------------- /src/Makefile.am: -------------------------------------------------------------------------------- 1 | #CFLAGS = --pedantic -Wall 2 | #LDFLAGS = 3 | AM_CFLAGS = -Wall -I@LDIR@/libcfs/include -I@LDIR@/lustre/include -I@LDIR@/lnet/include 4 | 5 | noinst_LIBRARIES = libct.a 6 | libct_a_SOURCES = ct_common.c ct_common.h 7 | 8 | bin_PROGRAMS = 9 
| if ENABLE_S3 10 | bin_PROGRAMS += copytoolS3 11 | endif 12 | copytoolS3_SOURCES = lhsmtool_s3.c lhsmtool_s3.h ct_common.h 13 | copytoolS3_LDADD = libct.a 14 | 15 | if ENABLE_RADOS 16 | bin_PROGRAMS += copytoolRados 17 | endif 18 | copytoolRados_SOURCES = lhsmtool_rados.c lhsmtool_rados.h ct_common.h 19 | copytoolRados_LDADD = libct.a 20 | -------------------------------------------------------------------------------- /patches/libs3_low_speed_limit.patch: -------------------------------------------------------------------------------- 1 | diff --git a/src/request.c b/src/request.c 2 | index 2c2a384..77b1319 100644 3 | --- a/src/request.c 4 | +++ b/src/request.c 5 | @@ -907,8 +907,8 @@ static S3Status setup_curl(Request *request, 6 | // less than 1K per second for more than 15 seconds. 7 | // xxx todo - make these configurable 8 | // xxx todo - allow configurable max send and receive speed 9 | - curl_easy_setopt_safe(CURLOPT_LOW_SPEED_LIMIT, 1024); 10 | - curl_easy_setopt_safe(CURLOPT_LOW_SPEED_TIME, 15); 11 | + //curl_easy_setopt_safe(CURLOPT_LOW_SPEED_LIMIT, 1024); 12 | + //curl_easy_setopt_safe(CURLOPT_LOW_SPEED_TIME, 15); 13 | 14 | // Append standard headers 15 | #define append_standard_header(fieldName) \ 16 | -------------------------------------------------------------------------------- /configure.ac: -------------------------------------------------------------------------------- 1 | AC_PREREQ(2.59) 2 | AC_INIT(lustre-s3-copytool, 0.1) 3 | AC_CONFIG_SRCDIR([src/lhsmtool_s3.c]) 4 | AM_INIT_AUTOMAKE 5 | 6 | #Enable silent build by default 7 | m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) 8 | 9 | #AC_PROG_MKDIR_P 10 | AC_PROG_CC 11 | AC_PROG_RANLIB 12 | 13 | AC_CHECK_LIB(lz4, LZ4_compressBound) 14 | AC_CHECK_LIB(pthread, pthread_exit) 15 | AC_CHECK_LIB(config, config_init) 16 | AC_CHECK_LIB(crypto, MD5_Init) 17 | AC_CHECK_LIB(lustreapi, llapi_error) 18 | 19 | AC_ARG_WITH( [lustre], AS_HELP_STRING([--with-lustre=],[indicate alternative location for 
lustre sources]), 20 | LDIR="$withval", LDIR="lustre-release") 21 | AC_CHECK_FILE("$LDIR/lustre/include/lustre/lustre_idl.h",, 22 | AC_MSG_ERROR("Could not find Lustre source tree. Use --with-lustre=")) 23 | 24 | AC_ARG_ENABLE( [rados], [AS_HELP_STRING([--enable-rados], [build with rados])], 25 | [with_rados=yes],[]) 26 | AC_ARG_ENABLE( [s3], [AS_HELP_STRING([--enable-s3], [build with s3])], 27 | [with_s3=yes],[]) 28 | 29 | AS_IF([test "x$with_rados" == "xyes"], [ 30 | AC_CHECK_LIB([rados], [rados_create2]) 31 | ]) 32 | AS_IF([test "x$with_s3" == "xyes"], [ 33 | AC_CHECK_LIB([s3], [S3_initialize]) 34 | ]) 35 | 36 | AM_CONDITIONAL([ENABLE_S3], [test "$with_s3" = "yes"]) 37 | AM_CONDITIONAL([ENABLE_RADOS], [test "$with_rados" = "yes"]) 38 | 39 | AC_SUBST(LDIR) 40 | AC_CONFIG_FILES(Makefile src/Makefile) 41 | AC_OUTPUT 42 | 43 | -------------------------------------------------------------------------------- /src/lhsmtool_rados.h: -------------------------------------------------------------------------------- 1 | /* 2 | * GPL HEADER START 3 | * 4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 | * 6 | * This program is free software; you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License version 2 only, 8 | * as published by the Free Software Foundation. 9 | * 10 | * This program is distributed in the hope that it will be useful, but 11 | * WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | * General Public License version 2 for more details (a copy is included 14 | * in the LICENSE file that accompanied this code). 
15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * version 2 along with this program; If not, see 18 | * http://www.gnu.org/licenses/gpl-2.0.htm 19 | * 20 | * GPL HEADER END 21 | */ 22 | /* 23 | * Copyright (c) 2015, 2016, Universite Laval 24 | * Authors: Simon Guilbault, Frederick Lefebvre 25 | * 26 | * 27 | * Part of this file include code from file lhsmtool_posix.c (licensed under 28 | * a GPLv2 license) that can be found in Lustre's git repository here : 29 | * git://git.hpdd.intel.com/fs/lustre-release.git 30 | */ 31 | /* HSM copytool program for rados (ceph) object storage. 32 | * 33 | * An HSM copytool daemon acts on action requests from Lustre to copy files 34 | * to and from an HSM archive system. 35 | * 36 | */ 37 | 38 | #ifndef _GNU_SOURCE 39 | #define _GNU_SOURCE 40 | #endif 41 | 42 | #include 43 | #include 44 | #include 45 | #include 46 | #include 47 | 48 | // number of ascii characters needed to represent the size of a big file 49 | #define TOTALLENGTH 24 50 | 51 | #define RADOS_PARAMETER_LENGTH 128 52 | #define OID_LENGTH 128 53 | 54 | static int err_major; 55 | //static int err_minor; 56 | 57 | char cluster_name[RADOS_PARAMETER_LENGTH]; 58 | char user_name[RADOS_PARAMETER_LENGTH]; 59 | char pool_name[RADOS_PARAMETER_LENGTH]; 60 | char rados_config_file[RADOS_PARAMETER_LENGTH]; 61 | rados_t cluster; 62 | rados_ioctx_t io; 63 | 64 | static int init_rados(); 65 | 66 | static void usage(const char *name, int rc); 67 | 68 | static int ct_parseopts(int argc, char * const *argv); 69 | 70 | static int ct_archive_data(struct hsm_copyaction_private *hcp, const char *src, 71 | const char *dst, int src_fd, 72 | const struct hsm_action_item *hai, long hal_flags); 73 | 74 | static int ct_restore_data(struct hsm_copyaction_private *hcp, const char *src, 75 | const char *dst, int dst_fd, 76 | const struct hsm_action_item *hai, long hal_flags); 77 | -------------------------------------------------------------------------------- 
/src/lhsmtool_s3.h: -------------------------------------------------------------------------------- 1 | /* 2 | * GPL HEADER START 3 | * 4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 | * 6 | * This program is free software; you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License version 2 only, 8 | * as published by the Free Software Foundation. 9 | * 10 | * This program is distributed in the hope that it will be useful, but 11 | * WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | * General Public License version 2 for more details (a copy is included 14 | * in the LICENSE file that accompanied this code). 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * version 2 along with this program; If not, see 18 | * http://www.gnu.org/licenses/gpl-2.0.htm 19 | * 20 | * GPL HEADER END 21 | */ 22 | /* 23 | * Copyright (c) 2015, 2016, Universite Laval 24 | * Authors: Simon Guilbault, Frederick Lefebvre 25 | * 26 | * 27 | * Part of this file include code from file lhsmtool_posix.c (licensed under 28 | * a GPLv2 license) that can be found in Lustre's git repository here : 29 | * git://git.hpdd.intel.com/fs/lustre-release.git 30 | */ 31 | /* HSM copytool program for S3 object storage. 32 | * 33 | * An HSM copytool daemon acts on action requests from Lustre to copy files 34 | * to and from an HSM archive system. 
35 | * 36 | */ 37 | 38 | #ifndef _GNU_SOURCE 39 | #define _GNU_SOURCE 40 | #endif 41 | 42 | #include 43 | #include 44 | #include 45 | #include 46 | #include 47 | 48 | 49 | #define RETRYCOUNT 5 50 | 51 | #define MD5_ASCII 32+1 52 | 53 | //struct ct_s3_options { 54 | // int o_copy_attrs; 55 | // int o_daemonize; 56 | // int o_dry_run; 57 | // int o_abort_on_error; 58 | // int o_verbose; 59 | // int o_archive_cnt; 60 | // int o_archive_id[LL_HSM_MAX_ARCHIVE]; 61 | // int o_report_int; 62 | // char *o_config; 63 | // char *o_event_fifo; 64 | // char *o_mnt; 65 | // int o_mnt_fd; 66 | //}; 67 | 68 | char access_key[S3_MAX_KEY_SIZE]; 69 | char secret_key[S3_MAX_KEY_SIZE]; 70 | char host[S3_MAX_HOSTNAME_SIZE]; 71 | char bucket_prefix[S3_MAX_BUCKET_NAME_SIZE]; 72 | 73 | int bucket_count; 74 | 75 | #ifndef MIN_SLEEP_SECOND 76 | #define MIN_SLEEP_SECOND 1 77 | #endif 78 | 79 | S3BucketContext bucketContext = 80 | { 81 | host, 82 | bucket_prefix, 83 | S3ProtocolHTTP, 84 | S3UriStylePath, 85 | access_key, 86 | secret_key 87 | }; 88 | 89 | typedef struct put_object_callback_data 90 | { 91 | long long unsigned int contentLength; 92 | long long unsigned int buffer_offset; 93 | S3Status status; 94 | char *buffer; 95 | } put_object_callback_data; 96 | 97 | typedef struct get_object_callback_data 98 | { 99 | long long unsigned int buffer_offset; 100 | long long unsigned int totalLength; 101 | long long unsigned int chunk_size; 102 | long long unsigned int contentLength; 103 | char *buffer; 104 | S3Status status; 105 | char md5[MD5_ASCII]; 106 | } get_object_callback_data; 107 | 108 | S3Status responsePropertiesCallback(const S3ResponseProperties *properties, 109 | void *callbackData); 110 | static void getResponseCompleteCallback(S3Status status, 111 | const S3ErrorDetails *error, 112 | void *callbackData); 113 | 114 | static void putResponseCompleteCallback(S3Status status, 115 | const S3ErrorDetails *error, 116 | void *callbackData); 117 | S3ResponseHandler getResponseHandler = 
118 | { 119 | &responsePropertiesCallback, 120 | &getResponseCompleteCallback 121 | }; 122 | 123 | S3ResponseHandler putResponseHandler = 124 | { 125 | &responsePropertiesCallback, 126 | &putResponseCompleteCallback 127 | }; 128 | 129 | S3ResponseHandler headResponseHandler = 130 | { 131 | &responsePropertiesCallback, 132 | &getResponseCompleteCallback 133 | }; 134 | 135 | S3ResponseHandler deleteResponseHandler = 136 | { 137 | &responsePropertiesCallback, 138 | &getResponseCompleteCallback 139 | }; 140 | 141 | static int putObjectDataCallback(int bufferSize, char *buffer, 142 | void *callbackData); 143 | 144 | static S3Status getObjectDataCallback(int bufferSize, const char *buffer, 145 | void *callbackData); 146 | 147 | static void getBucketName(int bucketNameSize, char *bucketName, 148 | char *objectName); 149 | 150 | static int get_s3_object(char *objectName, get_object_callback_data *data, 151 | S3GetObjectHandler *getObjectHandler); 152 | 153 | static void usage(const char *name, int rc); 154 | 155 | static int ct_parseopts(int argc, char * const *argv); 156 | 157 | static int ct_archive_data(struct hsm_copyaction_private *hcp, const char *src, 158 | const char *dst, int src_fd, 159 | const struct hsm_action_item *hai, long hal_flags); 160 | 161 | static int ct_restore_data(struct hsm_copyaction_private *hcp, const char *src, 162 | const char *dst, int dst_fd, 163 | const struct hsm_action_item *hai, long hal_flags); 164 | 165 | static int ct_s3_cleanup(void); 166 | 167 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | lustre-s3-copytool 2 | ================== 3 | A fork of this tool, called [Estuary](https://git.ichec.ie/performance/storage/estuary) is now available for a better S3 support. Estuary is preferred for new deployment. 
4 | 5 | Features 6 | -------- 7 | 8 | - Compression with LZ4 9 | - Striping across multiple objects and buckets 10 | - Verifying checksums when restoring 11 | 12 | Patching libs3 13 | -------------- 14 | The copytool will segfault if libs3 is not patched; the fault mostly occurs in 15 | large file restores (Writing on Lustre/Reading on S3). 16 | 17 | This patch removes the low speed limit, which would otherwise cause a timeout 18 | if the transfer is slower than 1kb/s for 15 seconds. 19 | 20 | ``` 21 | $ git clone https://github.com/bji/libs3.git 22 | $ cd libs3 23 | $ patch -p1 < ~/lustre-obj-copytool/patches/libs3_low_speed_limit.patch 24 | ``` 25 | 26 | Building the copytool 27 | --------------------- 28 | After downloading the sources from the git repository, the autotools must be 29 | run to prepare the build environment : 30 | 31 | ``` 32 | $ aclocal 33 | $ automake --add-missing 34 | $ autoconf 35 | ``` 36 | 37 | The build can then be configured with the `configure` script. The location 38 | of the lustre source tree to compile against must be specified as well as 39 | the type of object interfaces to enable (s3 and/or rados): 40 | 41 | ``` 42 | $ ./configure --with-lustre= --enable-s3 43 | ``` 44 | 45 | After running `make`, the binary of the copytool will be in ./src/ 46 | 47 | Configuration 48 | ------------- 49 | ### General 50 | 51 | | Parameter | Type | Description | 52 | |-----------|------|-------------| 53 | | Chunk_size | Int | This represents the size of the largest object stored. A large file in Lustre will be striped across multiple objects if the file size > chunk_size. Because compression is used, this parameter needs to be set according to the available memory. Each thread will use twice the chunk_size. For incompressible data, each object will take a few extra bytes. | 54 | 55 | ### copytoolS3 56 | A basic configuration is available in config.cfg.dist.s3, this file can be copied 57 | as config.cfg. 
The path of the config file can also be passed as a runtime 58 | parameter. 59 | 60 | | Parameter | Type | Description | 61 | |-----------|------|-------------| 62 | | access_key | String | AWS access key. | 63 | | secret_key | String | AWS Secret key. | 64 | | host | String | Hostname of the S3 endpoint. | 65 | | bucket_count | Int | The number of buckets used to spread the indexing load. With radosgw, PUT operations will slow down proportionally to the number of objects in the same bucket. If a bucket_count > 2 is used, an ID will be appended to the bucket_prefix. | 66 | | bucket_prefix | String | This prefix will be prepended to each bucketID. For example, if the bucket_prefix is __hsm__, then each bucket will be named hsm_0, hsm_1, hsm_2 ... | 67 | | ssl | Bool | If the S3 endpoint should use SSL. | 68 | 69 | ### copytoolRados 70 | A basic configuration is available in config.cfg.dist.rados, this file can be copied 71 | as config.cfg. The path of the config file can also be passed as a runtime 72 | parameter. 73 | 74 | | Parameter | Type | Description | 75 | |-----------|------|-------------| 76 | | cluster_name | String | To select between multiple Ceph clusters | 77 | | user_name | String | Select a key allowed to connect to the Ceph cluster and the HSM pool. | 78 | | pool_name | String | A dedicated pool to store the objects created by HSM, can be erasure encoded. | 79 | | config_file | String | Path to ceph.conf | 80 | | chunk_size | Int | A safe value used in Ceph RBD is 4MiB but it could be increased for this copytool. 
| 81 | 82 | Lustre HSM 83 | ---------- 84 | ### Quick howto 85 | Enable HSM on the MDS server 86 | 87 | ``` 88 | # lctl set_param mdt.lustre-MDT0000.hsm.max_requests=10 89 | # lctl set_param mdt.lustre-MDT0000.hsm_control=enabled 90 | ``` 91 | 92 | Start the copytool on a DTN node 93 | 94 | ``` 95 | # ./copytoolS3 /lustre/ 96 | 1456506103.926649 copytool_d[31507]: mount_point=/lustre/ 97 | 1456506103.932785 copytool_d[31507]: waiting for message from kernel 98 | ``` 99 | You can use `lfs hsm_state` to get the current status of a file 100 | 101 | Move a file to HSM and remove it from Lustre 102 | 103 | ``` 104 | # lfs hsm_state test 105 | test: (0x00000000) 106 | # lfs hsm_archive test 107 | # lfs hsm_state 108 | test: (0x00000009) exists archived, archive_id:1 109 | # lfs hsm_release test 110 | # lfs hsm_state test 111 | test: (0x0000000d) released exists archived, archive_id:1 112 | ``` 113 | 114 | Restore the file implicitly 115 | 116 | ``` 117 | # md5sum test 118 | 33e3e3bdb7f6f847e06ae2a8abad0b85 test 119 | # lfs hsm_state test 120 | test: (0x00000009) exists archived, archive_id:1 121 | ``` 122 | 123 | Remove the file from S3 124 | 125 | ``` 126 | # lfs hsm_remove test 127 | # lfs hsm_state test 128 | test: (0x00000000), archive_id:1 129 | ``` 130 | 131 | ### Example with radosgw (S3) 132 | Install radosgw as usual (http://docs.ceph.com/docs/master/radosgw/). The pool __.rgw.buckets__ can be erasure coded. 133 | 134 | Create the user used by HSM 135 | 136 | ``` 137 | # radosgw-admin user create --uid=lustre_hsm --display-name="lustre_hsm" 138 | [...] 139 | # radosgw-admin user modify --uid=lustre_hsm --max_buckets=10000 140 | [...] 
141 | ``` 142 | 143 | Grab the access_key and the secret_key from the previous command 144 | 145 | Install and configure `s3cmd` (can also use any other s3 compatible tool) 146 | 147 | Create some buckets with `s3cmd` 148 | 149 | ``` 150 | # for i in {0..256} ; do s3cmd mb s3://lustre_hsm_$i ; done 151 | Bucket 's3://lustre_hsm_0/' created 152 | [...] 153 | ``` 154 | 155 | Update the config.cfg 156 | 157 | Start `copytoolS3` 158 | Try it using the quick howto above 159 | 160 | ### Example with rados (native ceph protocol) 161 | 162 | Create a pool dedicated to the HSM storage, this can be a erasure coded pool. 163 | 164 | ``` 165 | # ceph osd pool create hsm 1024 166 | ``` 167 | Update the config.cfg 168 | 169 | start `copytoolRados` 170 | 171 | Try it using the quick howto above 172 | -------------------------------------------------------------------------------- /src/ct_common.h: -------------------------------------------------------------------------------- 1 | /* 2 | * GPL HEADER START 3 | * 4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 | * 6 | * This program is free software; you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License version 2 only, 8 | * as published by the Free Software Foundation. 9 | * 10 | * This program is distributed in the hope that it will be useful, but 11 | * WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | * General Public License version 2 for more details (a copy is included 14 | * in the LICENSE file that accompanied this code). 
15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * version 2 along with this program; If not, see 18 | * http://www.gnu.org/licenses/gpl-2.0.htm 19 | * 20 | * GPL HEADER END 21 | */ 22 | /* 23 | * Copyright (c) 2015, 2016, Universite Laval 24 | * Authors: Simon Guilbault, Frederick Lefebvre 25 | * 26 | * 27 | * Part of this file include code from file lhsmtool_posix.c (licensed under 28 | * a GPLv2 license) that can be found in Lustre's git repository here : 29 | * git://git.hpdd.intel.com/fs/lustre-release.git 30 | */ 31 | /* 32 | * A library to encapsulate functions and data structures to be reuse 33 | * by HSM copytool daemons for Lustre 34 | */ 35 | 36 | #ifndef _GNU_SOURCE 37 | #define _GNU_SOURCE 38 | #endif 39 | 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | 46 | #define ONE_MB 0x100000 47 | 48 | // number of ascii characters needed to represent the size of a big file 49 | #define TOTALLENGTH 24 50 | 51 | #define MD5_ASCII 32+1 52 | 53 | /* HSM hash subdir permissions */ 54 | #define DIR_PERM S_IRWXU 55 | /* HSM hash file permissions */ 56 | #define FILE_PERM (S_IRUSR | S_IWUSR)/* HSM hash subdir permissions */ 57 | #define DIR_PERM S_IRWXU 58 | /* HSM hash file permissions */ 59 | #define FILE_PERM (S_IRUSR | S_IWUSR) 60 | 61 | #ifndef _CMD_NAME 62 | #define _CMD_NAME 63 | char cmd_name[PATH_MAX]; 64 | char fs_name[MAX_OBD_NAME + 1]; 65 | #endif 66 | 67 | int chunk_size; 68 | static int err_major; 69 | //static int err_minor; 70 | 71 | #define CT_ERROR(_rc, _format, ...) \ 72 | llapi_error(LLAPI_MSG_ERROR, _rc, \ 73 | "%f %s[%ld]: "_format, \ 74 | ct_now(), cmd_name, syscall(SYS_gettid), ## __VA_ARGS__) 75 | 76 | #define CT_DEBUG(_format, ...) \ 77 | llapi_error(LLAPI_MSG_DEBUG | LLAPI_MSG_NO_ERRNO, 0, \ 78 | "%f %s[%ld]: "_format, \ 79 | ct_now(), cmd_name, syscall(SYS_gettid), ## __VA_ARGS__) 80 | 81 | #define CT_WARN(_format, ...) 
\ 82 | llapi_error(LLAPI_MSG_WARN | LLAPI_MSG_NO_ERRNO, 0, \ 83 | "%f %s[%ld]: "_format, \ 84 | ct_now(), cmd_name, syscall(SYS_gettid), ## __VA_ARGS__) 85 | 86 | #define CT_TRACE(_format, ...) \ 87 | llapi_error(LLAPI_MSG_INFO | LLAPI_MSG_NO_ERRNO, 0, \ 88 | "%f %s[%ld]: "_format, \ 89 | ct_now(), cmd_name, syscall(SYS_gettid), ## __VA_ARGS__) 90 | 91 | #ifndef MIN_SLEEP_SECOND 92 | #define MIN_SLEEP_SECOND 1 93 | #endif 94 | 95 | /* Progress reporting period */ 96 | #define REPORT_INTERVAL_DEFAULT 30 97 | 98 | struct ct_options { 99 | int o_copy_attrs; 100 | int o_daemonize; 101 | int o_dry_run; 102 | int o_abort_on_error; 103 | int o_verbose; 104 | int o_archive_cnt; 105 | int o_archive_id[LL_HSM_MAX_ARCHIVE]; 106 | int o_report_int; 107 | char *o_config; 108 | char *o_event_fifo; 109 | char *o_mnt; 110 | int o_mnt_fd; 111 | }; 112 | 113 | struct ct_th_data { 114 | struct hsm_action_item *hai; 115 | long hal_flags; 116 | }; 117 | 118 | /* 119 | * Basic struct to store a file's stripe size and stripe count 120 | */ 121 | typedef struct strippingInfo 122 | { 123 | __u32 lmm_stripe_size; 124 | __u16 lmm_stripe_count; 125 | } strippingInfo; 126 | 127 | /* hsm_copytool_private will hold an open FD on the lustre mount point 128 | * for us. This is to make sure it doesn't drop out from under us (and 129 | * remind the admin to shutdown the copytool before unmounting). */ 130 | struct hsm_copytool_private *ctdata; 131 | 132 | 133 | /* 134 | * ct_archive, ct_restore, ct_remove & ct_cancel are declared here but must be defined 135 | * by the implemented by the user of libct as they are specific to a 136 | * given archival platform. 
137 | */ 138 | int ct_archive(const struct hsm_action_item *hai, const long hal_flags); 139 | int ct_restore(const struct hsm_action_item *hai, const long hal_flags); 140 | int ct_remove(const struct hsm_action_item *hai, const long hal_flags); 141 | int ct_cancel(const struct hsm_action_item *hai, const long hal_flags); 142 | 143 | int should_retry(int *retry_count); 144 | 145 | // djb2 hash function for strings 146 | // http://www.cse.yorku.ca/~oz/hash.html 147 | unsigned long hash(char* str); 148 | 149 | /* 150 | * Return current time in sec since epoch 151 | */ 152 | inline double ct_now(void); 153 | 154 | /* 155 | * For a given FD, find out the stripe size and stripe count and 156 | * return them in a pre-allocated strippingInfo struct 157 | */ 158 | int ct_save_stripe(int src_fd, const char *src, strippingInfo *params); 159 | 160 | int ct_path_lustre(char *buf, int sz, const char *mnt, const lustre_fid *fid); 161 | int ct_path_archive(char *buf, int sz, const lustre_fid *fid); 162 | bool ct_is_retryable(int err); 163 | 164 | /* 165 | * Notify the coordinator that an action was completed 166 | */ 167 | int ct_action_done(struct hsm_copyaction_private **phcp, 168 | const struct hsm_action_item *hai, int hp_flags, int ct_rc); 169 | 170 | /* 171 | * Notify the coordinator that an action is starting 172 | * ct_begin is only a wrapper around ct_begin_restore 173 | */ 174 | int ct_begin(struct hsm_copyaction_private **phcp, const struct hsm_action_item *hai); 175 | int ct_begin_restore(struct hsm_copyaction_private **phcp, 176 | const struct hsm_action_item *hai, 177 | int mdt_index, int open_flags); 178 | 179 | /* 180 | * Trigger cleanup when receiving signal 181 | */ 182 | void handler(int signal); 183 | 184 | int ct_setup(void); 185 | 186 | int ct_cleanup(void); 187 | 188 | int ct_process_item(struct hsm_action_item *hai, const long hal_flags); 189 | 190 | void *ct_thread(void *data); 191 | 192 | int ct_process_item_async(const struct hsm_action_item *hai, long 
hal_flags); 193 | 194 | /* Daemon waits for messages from the kernel; run it in the background. */ 195 | int ct_run(void); 196 | -------------------------------------------------------------------------------- /src/ct_common.c: -------------------------------------------------------------------------------- 1 | /* 2 | * GPL HEADER START 3 | * 4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 | * 6 | * This program is free software; you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License version 2 only, 8 | * as published by the Free Software Foundation. 9 | * 10 | * This program is distributed in the hope that it will be useful, but 11 | * WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | * General Public License version 2 for more details (a copy is included 14 | * in the LICENSE file that accompanied this code). 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * version 2 along with this program; If not, see 18 | * http://www.gnu.org/licenses/gpl-2.0.htm 19 | * 20 | * GPL HEADER END 21 | */ 22 | /* 23 | * Copyright (c) 2015, 2016, Universite Laval 24 | * Authors: Simon Guilbault, Frederick Lefebvre 25 | * 26 | * 27 | * Part of this file include code from file lhsmtool_posix.c (licensed under 28 | * a GPLv2 license) that can be found in Lustre's git repository here : 29 | * git://git.hpdd.intel.com/fs/lustre-release.git 30 | */ 31 | /* 32 | * A library to encapsulate functions and data structures to be reuse 33 | * by HSM copytool daemons for Lustre 34 | */ 35 | 36 | #ifndef _GNU_SOURCE 37 | #define _GNU_SOURCE 38 | #endif 39 | 40 | #include "ct_common.h" 41 | #include 42 | #include 43 | #include 44 | #include 45 | #include 46 | #include 47 | #include 48 | #include 49 | #include 50 | 51 | /* everything else is zeroed */ 52 | struct ct_options ct_opt = { 53 | .o_verbose = 
LLAPI_MSG_INFO, 54 | .o_report_int = REPORT_INTERVAL_DEFAULT, 55 | .o_config = "config.cfg", 56 | }; 57 | 58 | int should_retry(int *retry_count) 59 | { 60 | if ((*retry_count)--) { 61 | // Sleep before next retry; start out with a 1 second sleep 62 | static int retrySleepInterval = MIN_SLEEP_SECOND; 63 | sleep(retrySleepInterval); 64 | // Next sleep 1 second longer 65 | ++retrySleepInterval; 66 | return 1; 67 | } 68 | 69 | return 0; 70 | } 71 | 72 | // djb2 hash function for strings 73 | // http://www.cse.yorku.ca/~oz/hash.html 74 | unsigned long hash(char* str) 75 | { 76 | unsigned long hash = 5381; 77 | int c; 78 | while ((c = *str++) != 0) 79 | hash = ((hash << 5) + hash) + c; /* hash * 33 + c */ 80 | return hash; 81 | } 82 | 83 | 84 | inline double ct_now(void) 85 | { 86 | struct timeval tv; 87 | gettimeofday(&tv, NULL); 88 | return tv.tv_sec + 0.000001 * tv.tv_usec; 89 | } 90 | 91 | int ct_save_stripe(int src_fd, const char *src, strippingInfo *params) 92 | { 93 | char lov_buf[XATTR_SIZE_MAX]; 94 | struct lov_user_md *lum; 95 | int rc; 96 | ssize_t xattr_size; 97 | 98 | assert(src && params); 99 | 100 | CT_TRACE("saving stripe info of '%s'", src); 101 | 102 | xattr_size = fgetxattr(src_fd, XATTR_LUSTRE_LOV, lov_buf, sizeof(lov_buf)); 103 | if (xattr_size < 0) { 104 | rc = -errno; 105 | CT_ERROR(rc, "cannot get stripe info on '%s'", src); 106 | return rc; 107 | } 108 | 109 | lum = (struct lov_user_md *)lov_buf; 110 | 111 | params->lmm_stripe_size = lum->lmm_stripe_size; 112 | params->lmm_stripe_count = lum->lmm_stripe_count; 113 | 114 | return 0; 115 | } 116 | 117 | int ct_path_lustre(char *buf, int sz, const char *mnt, 118 | const lustre_fid *fid) 119 | { 120 | return snprintf(buf, sz, "%s/%s/fid/"DFID_NOBRACE, mnt, 121 | dot_lustre_name, PFID(fid)); 122 | } 123 | 124 | int ct_path_archive(char *buf, int sz, const lustre_fid *fid) 125 | { 126 | __u64 sequence_id = (fid)->f_seq; 127 | __u32 object_id = (fid)->f_oid; 128 | __u32 version = (fid)->f_ver; 129 | 
return snprintf(buf, sz, "%016llx_%08x_%08x", 130 | sequence_id, object_id, version); 131 | } 132 | 133 | bool ct_is_retryable(int err) 134 | { 135 | return err == -ETIMEDOUT; 136 | } 137 | 138 | int ct_action_done(struct hsm_copyaction_private **phcp, 139 | const struct hsm_action_item *hai, int hp_flags, int ct_rc) 140 | { 141 | struct hsm_copyaction_private *hcp; 142 | char lstr[PATH_MAX]; 143 | int rc; 144 | 145 | assert(hai); 146 | 147 | CT_TRACE("Action completed, notifying coordinator " 148 | "cookie="LPX64", FID="DFID", hp_flags=%d err=%d", 149 | hai->hai_cookie, PFID(&hai->hai_fid), 150 | hp_flags, -ct_rc); 151 | 152 | ct_path_lustre(lstr, sizeof(lstr), ct_opt.o_mnt, &hai->hai_fid); 153 | 154 | if (phcp == NULL || *phcp == NULL) { 155 | rc = llapi_hsm_action_begin(&hcp, ctdata, hai, -1, 0, true); 156 | if (rc < 0) { 157 | CT_ERROR(rc, "llapi_hsm_action_begin() on '%s' failed", 158 | lstr); 159 | return rc; 160 | } 161 | phcp = &hcp; 162 | } 163 | 164 | rc = llapi_hsm_action_end(phcp, &hai->hai_extent, hp_flags, abs(ct_rc)); 165 | if (rc == -ECANCELED) 166 | CT_ERROR(rc, "completed action on '%s' has been canceled: " 167 | "cookie="LPX64", FID="DFID, lstr, hai->hai_cookie, 168 | PFID(&hai->hai_fid)); 169 | else if (rc < 0) 170 | CT_ERROR(rc, "llapi_hsm_action_end() on '%s' failed", lstr); 171 | else 172 | CT_TRACE("llapi_hsm_action_end() on '%s' ok (rc=%d)", 173 | lstr, rc); 174 | 175 | return rc; 176 | } 177 | 178 | void handler(int signal) 179 | { 180 | psignal(signal, "exiting"); 181 | /* If we don't clean up upon interrupt, umount thinks there's a ref 182 | * and doesn't remove us from mtab (EINPROGRESS). The lustre client 183 | * does successfully unmount and the mount is actually gone, but the 184 | * mtab entry remains. So this just makes mtab happier. 
*/ 185 | llapi_hsm_copytool_unregister(&ctdata); 186 | 187 | /* Also remove fifo upon signal as during normal/error exit */ 188 | if (ct_opt.o_event_fifo != NULL) 189 | llapi_hsm_unregister_event_fifo(ct_opt.o_event_fifo); 190 | _exit(1); 191 | } 192 | 193 | int ct_begin_restore(struct hsm_copyaction_private **phcp, 194 | const struct hsm_action_item *hai, 195 | int mdt_index, int open_flags) 196 | { 197 | char src[PATH_MAX]; 198 | int rc; 199 | 200 | assert(hai); 201 | 202 | rc = llapi_hsm_action_begin(phcp, ctdata, hai, mdt_index, open_flags, 203 | false); 204 | if (rc < 0) { 205 | ct_path_lustre(src, sizeof(src), ct_opt.o_mnt, &hai->hai_fid); 206 | CT_ERROR(rc, "llapi_hsm_action_begin() on '%s' failed", src); 207 | } 208 | 209 | return rc; 210 | } 211 | 212 | int ct_begin(struct hsm_copyaction_private **phcp, const struct hsm_action_item *hai) 213 | { 214 | /* Restore takes specific parameters. Call the same function w/ default 215 | * values for all other operations. */ 216 | return ct_begin_restore(phcp, hai, -1, 0); 217 | } 218 | 219 | int ct_setup(void) 220 | { 221 | int rc; 222 | 223 | /* set llapi message level */ 224 | llapi_msg_set_level(ct_opt.o_verbose); 225 | 226 | rc = llapi_search_fsname(ct_opt.o_mnt, fs_name); 227 | if (rc < 0) { 228 | CT_ERROR(rc, "cannot find a Lustre filesystem mounted at '%s'", 229 | ct_opt.o_mnt); 230 | return rc; 231 | } 232 | 233 | ct_opt.o_mnt_fd = open(ct_opt.o_mnt, O_RDONLY); 234 | if (ct_opt.o_mnt_fd < 0) { 235 | rc = -errno; 236 | CT_ERROR(rc, "cannot open mount point at '%s'", 237 | ct_opt.o_mnt); 238 | return rc; 239 | } 240 | 241 | return rc; 242 | } 243 | 244 | int ct_cleanup(void) 245 | { 246 | int rc; 247 | 248 | if (ct_opt.o_mnt_fd >= 0) { 249 | rc = close(ct_opt.o_mnt_fd); 250 | if (rc < 0) { 251 | rc = -errno; 252 | CT_ERROR(rc, "cannot close mount point"); 253 | return rc; 254 | } 255 | } 256 | 257 | return 0; 258 | } 259 | 260 | int ct_process_item(struct hsm_action_item *hai, const long hal_flags) 261 | { 
262 | int rc = 0; 263 | assert(hai); 264 | 265 | if (ct_opt.o_verbose >= LLAPI_MSG_INFO || ct_opt.o_dry_run) { 266 | /* Print the original path */ 267 | char fid[128]; 268 | char path[PATH_MAX]; 269 | long long recno = -1; 270 | int linkno = 0; 271 | 272 | sprintf(fid, DFID, PFID(&hai->hai_fid)); 273 | CT_TRACE("'%s' action %s reclen %d, cookie="LPX64, 274 | fid, hsm_copytool_action2name(hai->hai_action), 275 | hai->hai_len, hai->hai_cookie); 276 | rc = llapi_fid2path(ct_opt.o_mnt, fid, path, 277 | sizeof(path), &recno, &linkno); 278 | if (rc < 0) 279 | CT_ERROR(rc, "cannot get path of FID %s", fid); 280 | else 281 | CT_TRACE("processing file '%s'", path); 282 | } 283 | 284 | switch (hai->hai_action) { 285 | /* set err_major, minor inside these functions */ 286 | case HSMA_ARCHIVE: 287 | rc = ct_archive(hai, hal_flags); 288 | break; 289 | case HSMA_RESTORE: 290 | rc = ct_restore(hai, hal_flags); 291 | break; 292 | case HSMA_REMOVE: 293 | rc = ct_remove(hai, hal_flags); 294 | break; 295 | case HSMA_CANCEL: 296 | rc = ct_cancel(hai, hal_flags); 297 | break; 298 | default: 299 | rc = -EINVAL; 300 | CT_ERROR(rc, "unknown action %d, on '%s'", hai->hai_action, 301 | ct_opt.o_mnt); 302 | ct_action_done(NULL, hai, 0, rc); 303 | } 304 | 305 | return 0; 306 | } 307 | 308 | void *ct_thread(void *data) 309 | { 310 | struct ct_th_data *cttd = data; 311 | int rc; 312 | 313 | rc = ct_process_item(cttd->hai, cttd->hal_flags); 314 | 315 | free(cttd->hai); 316 | free(cttd); 317 | pthread_exit((void *)(intptr_t)rc); 318 | } 319 | 320 | int ct_process_item_async(const struct hsm_action_item *hai, 321 | long hal_flags) 322 | { 323 | pthread_attr_t attr; 324 | pthread_t thread; 325 | struct ct_th_data *data; 326 | int rc; 327 | assert(hai); 328 | 329 | data = malloc(sizeof(*data)); 330 | if (data == NULL) 331 | return -ENOMEM; 332 | 333 | data->hai = malloc(hai->hai_len); 334 | if (data->hai == NULL) { 335 | free(data); 336 | return -ENOMEM; 337 | } 338 | 339 | memcpy(data->hai, hai, 
hai->hai_len); 340 | data->hal_flags = hal_flags; 341 | 342 | rc = pthread_attr_init(&attr); 343 | if (rc != 0) { 344 | CT_ERROR(rc, "pthread_attr_init failed for '%s' service", 345 | ct_opt.o_mnt); 346 | free(data->hai); 347 | free(data); 348 | return -rc; 349 | } 350 | 351 | pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); 352 | 353 | rc = pthread_create(&thread, &attr, ct_thread, data); 354 | if (rc != 0) 355 | CT_ERROR(rc, "cannot create thread for '%s' service", 356 | ct_opt.o_mnt); 357 | 358 | pthread_attr_destroy(&attr); 359 | return 0; 360 | } 361 | 362 | /* Daemon waits for messages from the kernel; run it in the background. */ 363 | int ct_run(void) 364 | { 365 | int rc; 366 | 367 | if (ct_opt.o_daemonize) { 368 | rc = daemon(1, 1); 369 | if (rc < 0) { 370 | rc = -errno; 371 | CT_ERROR(rc, "cannot daemonize"); 372 | return rc; 373 | } 374 | } 375 | 376 | setbuf(stdout, NULL); 377 | 378 | if (ct_opt.o_event_fifo != NULL) { 379 | rc = llapi_hsm_register_event_fifo(ct_opt.o_event_fifo); 380 | if (rc < 0) { 381 | CT_ERROR(rc, "failed to register event fifo"); 382 | return rc; 383 | } 384 | llapi_error_callback_set(llapi_hsm_log_error); 385 | } 386 | 387 | rc = llapi_hsm_copytool_register(&ctdata, ct_opt.o_mnt, 388 | ct_opt.o_archive_cnt, 389 | ct_opt.o_archive_id, 0); 390 | if (rc < 0) { 391 | CT_ERROR(rc, "cannot start copytool interface"); 392 | return rc; 393 | } 394 | 395 | signal(SIGINT, handler); 396 | signal(SIGTERM, handler); 397 | 398 | while (1) { 399 | struct hsm_action_list *hal; 400 | struct hsm_action_item *hai; 401 | int msgsize; 402 | int i = 0; 403 | 404 | CT_TRACE("waiting for message from kernel"); 405 | 406 | rc = llapi_hsm_copytool_recv(ctdata, &hal, &msgsize); 407 | if (rc == -ESHUTDOWN) { 408 | CT_TRACE("shutting down"); 409 | break; 410 | } 411 | else if (rc < 0) { 412 | CT_WARN("cannot receive action list: %s", 413 | strerror(-rc)); 414 | err_major++; 415 | if (ct_opt.o_abort_on_error) 416 | break; 417 | else 418 | 
continue; 419 | } 420 | 421 | CT_TRACE("copytool fs=%s archive#=%d item_count=%d", 422 | hal->hal_fsname, hal->hal_archive_id, hal->hal_count); 423 | 424 | if (strcmp(hal->hal_fsname, fs_name) != 0) { 425 | rc = -EINVAL; 426 | CT_ERROR(rc, "'%s' invalid fs name, expecting: %s", 427 | hal->hal_fsname, fs_name); 428 | err_major++; 429 | if (ct_opt.o_abort_on_error) 430 | break; 431 | else 432 | continue; 433 | } 434 | 435 | hai = hai_first(hal); 436 | while (++i <= hal->hal_count) { 437 | if ((char *)hai - (char *)hal > msgsize) { 438 | rc = -EPROTO; 439 | CT_ERROR(rc, 440 | "'%s' item %d past end of message!", 441 | ct_opt.o_mnt, i); 442 | err_major++; 443 | break; 444 | } 445 | rc = ct_process_item_async(hai, hal->hal_flags); 446 | if (rc < 0) 447 | CT_ERROR(rc, "'%s' item %d process", 448 | ct_opt.o_mnt, i); 449 | if (ct_opt.o_abort_on_error && err_major) 450 | break; 451 | hai = hai_next(hai); 452 | } 453 | 454 | if (ct_opt.o_abort_on_error && err_major) 455 | break; 456 | } 457 | 458 | llapi_hsm_copytool_unregister(&ctdata); 459 | if (ct_opt.o_event_fifo != NULL) 460 | llapi_hsm_unregister_event_fifo(ct_opt.o_event_fifo); 461 | 462 | return rc; 463 | } 464 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. 
This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 
49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. 
You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 
113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. 
You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 
165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. 
If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. 
If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 
292 | 293 | {description} 294 | Copyright (C) {year} {fullname} 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 
331 | 332 | {signature of Ty Coon}, 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | -------------------------------------------------------------------------------- /src/lhsmtool_rados.c: -------------------------------------------------------------------------------- 1 | /* 2 | * GPL HEADER START 3 | * 4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 | * 6 | * This program is free software; you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License version 2 only, 8 | * as published by the Free Software Foundation. 9 | * 10 | * This program is distributed in the hope that it will be useful, but 11 | * WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | * General Public License version 2 for more details (a copy is included 14 | * in the LICENSE file that accompanied this code). 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * version 2 along with this program; If not, see 18 | * http://www.gnu.org/licenses/gpl-2.0.htm 19 | * 20 | * GPL HEADER END 21 | */ 22 | /* 23 | * Copyright (c) 2015, 2016, Universite Laval 24 | * Authors: Simon Guilbault, Frederick Lefebvre 25 | * 26 | * 27 | * Part of this file include code from file lhsmtool_posix.c (licensed under 28 | * a GPLv2 license) that can be found in Lustre's git repository here : 29 | * git://git.hpdd.intel.com/fs/lustre-release.git 30 | */ 31 | /* HSM copytool program for rados (Ceph) object storage. 
32 | * 33 | * An HSM copytool daemon acts on action requests from Lustre to copy files 34 | * to and from an HSM archive system. 35 | * 36 | */ 37 | 38 | #ifndef _GNU_SOURCE 39 | #define _GNU_SOURCE 40 | #endif 41 | 42 | #include "lhsmtool_rados.h" 43 | #include "ct_common.h" 44 | 45 | #include 46 | #include 47 | #include 48 | #include 49 | #include 50 | #include 51 | #include 52 | #include 53 | #include 54 | #include 55 | #include 56 | #include 57 | #include 58 | #include 59 | #include 60 | #include 61 | #include 62 | #include 63 | #include 64 | #include 65 | #include 66 | #include /* To get strlcpy */ 67 | 68 | extern struct ct_options ct_opt; 69 | 70 | static void usage(const char *name, int rc) 71 | { 72 | //TODO correct the usage help for rados 73 | fprintf(stdout, 74 | " Usage: %s [options]... \n" 75 | "The Lustre HSM S3 copy tool can be used as a daemon or " 76 | "as a command line tool\n" 77 | "The Lustre HSM daemon acts on action requests from Lustre\n" 78 | "to copy files to and from an HSM archive system.\n" 79 | " --daemon Daemon mode, run in background\n" 80 | " Options:\n" 81 | "The Lustre HSM tool performs administrator-type actions\n" 82 | "on a Lustre HSM archive.\n" 83 | " --abort-on-error Abort operation on major error\n" 84 | " -A, --archive <#> Archive number (repeatable)\n" 85 | " -c, --config Path to the config file\n" 86 | " --dry-run Don't run, just show what would be done\n" 87 | " -f, --event-fifo Write events stream to fifo\n" 88 | " -q, --quiet Produce less verbose output\n" 89 | " -u, --update-interval Interval between progress reports sent\n" 90 | " to Coordinator\n" 91 | " -v, --verbose Produce more verbose output\n", 92 | cmd_name); 93 | 94 | exit(rc); 95 | } 96 | 97 | static int ct_parseopts(int argc, char * const *argv) 98 | { 99 | struct option long_opts[] = { 100 | {"abort-on-error", no_argument, &ct_opt.o_abort_on_error, 1}, 101 | {"abort_on_error", no_argument, &ct_opt.o_abort_on_error, 1}, 102 | {"archive", required_argument, 
NULL, 'A'}, 103 | {"config", required_argument, NULL, 'c'}, 104 | {"daemon", no_argument, &ct_opt.o_daemonize, 1}, 105 | {"event-fifo", required_argument, NULL, 'f'}, 106 | {"event_fifo", required_argument, NULL, 'f'}, 107 | {"dry-run", no_argument, &ct_opt.o_dry_run, 1}, 108 | {"help", no_argument, NULL, 'h'}, 109 | {"quiet", no_argument, NULL, 'q'}, 110 | {"rebind", no_argument, NULL, 'r'}, 111 | {"update-interval", required_argument, NULL, 'u'}, 112 | {"update_interval", required_argument, NULL, 'u'}, 113 | {"verbose", no_argument, NULL, 'v'}, 114 | {0, 0, 0, 0} 115 | }; 116 | int c, rc; 117 | config_t cfg; 118 | const char *config_str; 119 | 120 | optind = 0; 121 | while ((c = getopt_long(argc, argv, "A:b:c:f:hp:qu:v", 122 | long_opts, NULL)) != -1) { 123 | switch (c) { 124 | case 'A': 125 | if ((ct_opt.o_archive_cnt >= LL_HSM_MAX_ARCHIVE) || 126 | (atoi(optarg) >= LL_HSM_MAX_ARCHIVE)) { 127 | rc = -E2BIG; 128 | CT_ERROR(rc, "archive number must be less" 129 | "than %zu", LL_HSM_MAX_ARCHIVE); 130 | return rc; 131 | } 132 | ct_opt.o_archive_id[ct_opt.o_archive_cnt] = atoi(optarg); 133 | ct_opt.o_archive_cnt++; 134 | break; 135 | case 'b': /* -b and -c have both a number with unit as arg */ 136 | case 'c': 137 | ct_opt.o_config = optarg; 138 | break; 139 | case 'f': 140 | ct_opt.o_event_fifo = optarg; 141 | break; 142 | case 'h': 143 | usage(argv[0], 0); 144 | case 'q': 145 | ct_opt.o_verbose--; 146 | break; 147 | case 'u': 148 | ct_opt.o_report_int = atoi(optarg); 149 | if (ct_opt.o_report_int < 0) { 150 | rc = -EINVAL; 151 | CT_ERROR(rc, "bad value for -%c '%s'", c, optarg); 152 | return rc; 153 | } 154 | break; 155 | case 'v': 156 | ct_opt.o_verbose++; 157 | break; 158 | case 0: 159 | break; 160 | default: 161 | return -EINVAL; 162 | } 163 | } 164 | 165 | if (argc != optind + 1) { 166 | rc = -EINVAL; 167 | CT_ERROR(rc, "no mount point specified"); 168 | return rc; 169 | } 170 | 171 | ct_opt.o_mnt = argv[optind]; 172 | ct_opt.o_mnt_fd = -1; 173 | 174 | 
CT_TRACE("mount_point=%s", ct_opt.o_mnt); 175 | 176 | config_init(&cfg); 177 | if (! config_read_file(&cfg, ct_opt.o_config)) { 178 | CT_ERROR(-1, "error while reading config file\r\n%s:%d - %s", 179 | config_error_file(&cfg), 180 | config_error_line(&cfg), 181 | config_error_text(&cfg)); 182 | return -1; 183 | } 184 | 185 | if (config_lookup_int(&cfg, "chunk_size", &chunk_size)) { 186 | if (chunk_size < 0) { 187 | CT_ERROR(-1, "chunk_size cannot be negative"); 188 | return -1; 189 | } 190 | } 191 | else { 192 | CT_ERROR(-1, "could not find chunk_size"); 193 | return -1; 194 | } 195 | 196 | if (config_lookup_string(&cfg, "cluster_name", &config_str)) { 197 | strncpy(cluster_name, config_str, sizeof(cluster_name)); 198 | } 199 | else { 200 | CT_ERROR(-EINVAL, "could not find cluster_name"); 201 | return -EINVAL; 202 | } 203 | 204 | if (config_lookup_string(&cfg, "user_name", &config_str)) { 205 | strncpy(user_name, config_str, sizeof(user_name)); 206 | } 207 | else { 208 | CT_ERROR(-EINVAL, "could not find user_name"); 209 | return -EINVAL; 210 | } 211 | 212 | if (config_lookup_string(&cfg, "pool_name", &config_str)) { 213 | strncpy(pool_name, config_str, sizeof(pool_name)); 214 | } 215 | else { 216 | CT_ERROR(-EINVAL, "could not find pool_name"); 217 | return -EINVAL; 218 | } 219 | 220 | if (config_lookup_string(&cfg, "config_file", &config_str)) { 221 | strncpy(rados_config_file, config_str, sizeof(rados_config_file)); 222 | } 223 | else { 224 | CT_ERROR(-EINVAL, "could not find config_file"); 225 | return -EINVAL; 226 | } 227 | 228 | return init_rados(); 229 | } 230 | 231 | static int init_rados(){ 232 | int err; 233 | err = rados_create2(&cluster, cluster_name, user_name, 0); 234 | 235 | if (err < 0) { 236 | CT_ERROR(-EINVAL, "Couldn't create the ceph cluster handle: %s", strerror(-err)); 237 | return -EINVAL; 238 | } 239 | else{ 240 | CT_TRACE("Ceph cluster handle created"); 241 | } 242 | 243 | /* Read a Ceph configuration file to configure the cluster handle. 
*/ 244 | err = rados_conf_read_file(cluster, rados_config_file); 245 | if (err < 0) { 246 | CT_ERROR(-EINVAL, "Couldn't read the ceph config file: %s", strerror(-err)); 247 | return -EINVAL; 248 | } 249 | 250 | err = rados_connect(cluster); 251 | if (err < 0) { 252 | CT_ERROR(-EINVAL, "Cannot connect to ceph cluster: %s", strerror(-err)); 253 | return -EINVAL; 254 | } 255 | else{ 256 | CT_TRACE("Connected to the ceph cluster"); 257 | } 258 | 259 | err = rados_ioctx_create(cluster, pool_name, &io); 260 | if (err < 0) { 261 | CT_ERROR(-EINVAL, "Cannot open rados pool %s: %s", pool_name, strerror(-err)); 262 | rados_shutdown(cluster); 263 | return -EINVAL; 264 | } 265 | else{ 266 | CT_TRACE("IO context created"); 267 | } 268 | return 0; 269 | } 270 | 271 | static int ct_archive_data(struct hsm_copyaction_private *hcp, const char *src, 272 | const char *dst, int src_fd, 273 | const struct hsm_action_item *hai, long hal_flags) 274 | { 275 | struct hsm_extent he; 276 | __u64 file_offset = hai->hai_extent.offset; 277 | struct stat src_st; 278 | char *uncompress_buf = NULL; 279 | char *compress_buf = NULL; 280 | __u64 write_total = 0; 281 | __u64 length = hai->hai_extent.length; 282 | time_t last_report_time; 283 | int rc = 0; 284 | double start_ct_now = ct_now(); 285 | time_t now; 286 | int compression_bound = LZ4_compressBound(chunk_size); 287 | 288 | // Archiving a file from Lustre to the object store 289 | CT_TRACE("Archiving %s to %s", src, dst); 290 | if (fstat(src_fd, &src_st) < 0) { 291 | rc = -errno; 292 | CT_ERROR(rc, "cannot stat '%s'", src); 293 | return rc; 294 | } 295 | 296 | if (!S_ISREG(src_st.st_mode)) { 297 | rc = -EINVAL; 298 | CT_ERROR(rc, "'%s' is not a regular file", src); 299 | return rc; 300 | } 301 | 302 | if (hai->hai_extent.offset > (__u64)src_st.st_size) { 303 | rc = -EINVAL; 304 | CT_ERROR(rc, "Trying to start reading past end ("LPU64" > " 305 | "%jd) of '%s' source file", hai->hai_extent.offset, 306 | (intmax_t)src_st.st_size, src); 307 | 
return rc; 308 | } 309 | 310 | strippingInfo stripping_params; 311 | stripping_params.lmm_stripe_count = 1; 312 | stripping_params.lmm_stripe_size = ONE_MB; 313 | 314 | if (ct_save_stripe(src_fd, src, &stripping_params)) { 315 | return -1; 316 | } 317 | 318 | /* Don't read beyond a given extent */ 319 | if (length > src_st.st_size - hai->hai_extent.offset) 320 | length = src_st.st_size - hai->hai_extent.offset; 321 | 322 | last_report_time = time(NULL); 323 | 324 | he.offset = file_offset; 325 | he.length = 0; 326 | rc = llapi_hsm_action_progress(hcp, &he, length, 0); 327 | if (rc < 0) { 328 | /* Action has been canceled or something wrong 329 | * is happening. Stop copying data. */ 330 | CT_ERROR(rc, "progress ioctl for copy '%s'->'%s' failed", 331 | src, dst); 332 | goto out; 333 | } 334 | 335 | errno = 0; 336 | 337 | uncompress_buf = malloc(chunk_size); 338 | if (uncompress_buf == NULL) { 339 | rc = -ENOMEM; 340 | goto out; 341 | } 342 | 343 | compress_buf = malloc(compression_bound); 344 | if (compress_buf == NULL) { 345 | rc = -ENOMEM; 346 | goto out; 347 | } 348 | 349 | int chunk_id = -1; 350 | 351 | do { 352 | // Uploading to object store 353 | 354 | if (chunk_id == -1) { 355 | CT_TRACE("start copy of "LPU64" bytes from '%s' to '%s'", 356 | length, src, dst); 357 | } 358 | 359 | // size of the current chunk, limited by chunk_size 360 | long long unsigned int chunk; 361 | 362 | if (length - write_total > chunk_size) { 363 | // upper bound is the chunk_size 364 | chunk = chunk_size; 365 | } 366 | else { 367 | // limited by the file 368 | chunk = length - write_total; 369 | } 370 | 371 | chunk_id = file_offset / chunk_size; 372 | 373 | double before_lustre_read = ct_now(); 374 | pread(src_fd, uncompress_buf, chunk, file_offset); 375 | CT_TRACE("Reading a chunk from %s of %llu bytes offset %llu from lustre took %fs", 376 | src, chunk, file_offset, ct_now() - before_lustre_read); 377 | 378 | double before_compression = ct_now(); 379 | int compressed_size = 
LZ4_compress_default(uncompress_buf, compress_buf, chunk, compression_bound); 380 | CT_TRACE("Compressing a chunk from %s took %fs and the compressed size is %i bytes", 381 | src, ct_now() - before_compression, compressed_size); 382 | 383 | if (compressed_size <= 0) { 384 | CT_ERROR(-1, "Compression error"); 385 | rc = -1; 386 | goto out; 387 | } 388 | 389 | char dst_chunk_s[OID_LENGTH]; 390 | snprintf(dst_chunk_s, sizeof(dst_chunk_s), "%s.%i", dst, chunk_id); 391 | 392 | double before_checksum = ct_now(); 393 | unsigned char md5[MD5_DIGEST_LENGTH]; 394 | char md5_s[MD5_ASCII]; 395 | MD5_CTX mdContext; 396 | MD5_Init (&mdContext); 397 | MD5_Update (&mdContext, compress_buf, compressed_size); 398 | MD5_Final (md5, &mdContext); 399 | int i; 400 | 401 | for(i = 0; i < MD5_DIGEST_LENGTH; i++){ 402 | sprintf(&md5_s[i*2], "%02x", md5[i]); 403 | } 404 | CT_TRACE("Checksum of %s took %fs", dst_chunk_s, ct_now() - before_checksum); 405 | 406 | double before_rados_write = ct_now(); 407 | rc = rados_write_full(io, dst_chunk_s, compress_buf, compressed_size); 408 | CT_TRACE("Rados write of %s took %fs", 409 | dst_chunk_s, ct_now() - before_rados_write); 410 | 411 | if (rc < 0) { 412 | CT_ERROR(rc, "rados_write_full error"); 413 | goto out; 414 | } 415 | 416 | double before_rados_metadata_write = ct_now(); 417 | rc = rados_setxattr(io, dst_chunk_s, "md5", md5_s, MD5_ASCII); 418 | if (rc < 0) { 419 | CT_ERROR(rc, "rados_setxattr error for md5"); 420 | goto out; 421 | } 422 | 423 | const char compression_algo[] = "lz4"; 424 | rc = rados_setxattr(io, dst_chunk_s, "compression", compression_algo, 425 | sizeof(compression_algo)); 426 | if (rc < 0) { 427 | CT_ERROR(rc, "rados_setxattr error for compression"); 428 | goto out; 429 | } 430 | 431 | if(chunk_id == 0){ 432 | // store some metadata on the first object 433 | char totallength_s[TOTALLENGTH]; 434 | char chunksize_s[TOTALLENGTH]; 435 | int totallength_l = snprintf(totallength_s, sizeof(totallength_s), "%llu", length); 436 | int 
chunksize_l = snprintf(chunksize_s, sizeof(chunksize_s), "%i", chunk_size); 437 | 438 | rc = rados_setxattr(io, dst_chunk_s, "totallength", totallength_s, totallength_l); 439 | if (rc < 0) { 440 | CT_ERROR(rc, "rados_setxattr error for totallength"); 441 | goto out; 442 | } 443 | 444 | rc = rados_setxattr(io, dst_chunk_s, "chunksize", chunksize_s, chunksize_l); 445 | if (rc < 0) { 446 | CT_ERROR(rc, "rados_setxattr error for chunksize"); 447 | goto out; 448 | } 449 | } 450 | 451 | CT_TRACE("Rados metadata write of %s took %fs", 452 | dst_chunk_s, ct_now() - before_rados_metadata_write); 453 | 454 | he.offset = file_offset; 455 | he.length = chunk; 456 | 457 | now = time(NULL); 458 | if (now >= last_report_time + ct_opt.o_report_int) { 459 | last_report_time = now; 460 | CT_TRACE("sending progress report for archiving %s", src); 461 | rc = llapi_hsm_action_progress(hcp, &he, length, 0); 462 | if (rc < 0) { 463 | /* Action has been canceled or something wrong 464 | * is happening. Stop copying data. 
*/ 465 | CT_ERROR(rc, "progress ioctl for copy '%s'->'%s' failed", 466 | src, dst); 467 | goto out; 468 | } 469 | } 470 | 471 | write_total += chunk; 472 | file_offset += chunk; 473 | } while (file_offset < length); 474 | rc = 0; 475 | 476 | // We need to delete every chunk of higher chunk_id if they 477 | // exists, this can happen if the new file is smaller 478 | // TODO only delete objects if this is a dirty write 479 | 480 | chunk_id += 1; 481 | do { 482 | char dst_s[OID_LENGTH]; 483 | 484 | snprintf(dst_s, sizeof(dst_s), "%s.%i", dst, chunk_id); 485 | 486 | CT_TRACE("Checking if chunk %i exists", chunk_id); 487 | char buffer = 0; // read the first byte to check if the object exist 488 | rc = rados_read(io, dst_s, &buffer, 1, 0); 489 | 490 | if (rc == 1) { 491 | // Object exist, we need to delete it because the new file is smaller 492 | CT_TRACE("Deleting chunk %i", chunk_id); 493 | rc = rados_remove(io, dst_s); 494 | if(rc < 0){ 495 | CT_ERROR(rc, "Error while deleting chunk %i", chunk_id); 496 | goto out; 497 | } 498 | } 499 | else{ 500 | // a error means the object does not exist, so we don't need to delete it 501 | rc = 0; 502 | break; 503 | } 504 | 505 | chunk_id++; 506 | } while (true); 507 | 508 | out: 509 | if (uncompress_buf != NULL) 510 | free(uncompress_buf); 511 | if (compress_buf != NULL) 512 | free(compress_buf); 513 | 514 | CT_TRACE("copied "LPU64" bytes in %f seconds", 515 | length, ct_now() - start_ct_now); 516 | 517 | return rc; 518 | } 519 | 520 | static int ct_restore_data(struct hsm_copyaction_private *hcp, const char *src, 521 | const char *dst, int dst_fd, 522 | const struct hsm_action_item *hai, long hal_flags) 523 | { 524 | struct hsm_extent he; 525 | __u64 file_offset = hai->hai_extent.offset; 526 | struct stat dst_st; 527 | __u64 write_total = 0; 528 | __u64 length = hai->hai_extent.length; 529 | time_t last_report_time; 530 | time_t now; 531 | int rc = 0; 532 | double start_ct_now = ct_now(); 533 | 534 | // Restore a file from the 
object store back to Lustre 535 | 536 | CT_TRACE("Restoring %s to %s", src, dst); 537 | if (fstat(dst_fd, &dst_st) < 0) { 538 | rc = -errno; 539 | CT_ERROR(rc, "cannot stat '%s'", dst); 540 | return rc; 541 | } 542 | 543 | if (!S_ISREG(dst_st.st_mode)) { 544 | rc = -EINVAL; 545 | CT_ERROR(rc, "'%s' is not a regular file", dst); 546 | return rc; 547 | } 548 | 549 | he.offset = file_offset; 550 | he.length = 0; 551 | rc = llapi_hsm_action_progress(hcp, &he, length, 0); 552 | if (rc < 0) { 553 | /* Action has been canceled or something wrong 554 | * is happening. Stop copying data. */ 555 | CT_ERROR(rc, "progress ioctl for copy '%s'->'%s' failed", 556 | src, dst); 557 | goto out; 558 | } 559 | 560 | errno = 0; 561 | 562 | last_report_time = time(NULL); 563 | 564 | // Metadata from the first chunk 565 | char src_chunk_s[OID_LENGTH]; 566 | snprintf(src_chunk_s, sizeof(src_chunk_s), "%s.0", src); 567 | char* err; 568 | 569 | char totallength_s[TOTALLENGTH]; 570 | char chunksize_s[TOTALLENGTH]; 571 | 572 | int totallength_l = rados_getxattr(io, src_chunk_s, "totallength", totallength_s, sizeof(totallength_s)); 573 | if(totallength_l < 0){ 574 | rc = -1; 575 | goto out; 576 | } 577 | totallength_s[totallength_l] = '\0'; 578 | length = strtoll(totallength_s, &err, 10); 579 | if(*err){ 580 | printf("Error while parsing totallength, non-covertible part: %s\r\n", err); 581 | rc = -1; 582 | goto out; 583 | } 584 | 585 | int chunksize_l = rados_getxattr(io, src_chunk_s, "chunksize", chunksize_s, sizeof(chunksize_s)); 586 | if(chunksize_l < 0){ 587 | rc = -1; 588 | goto out; 589 | } 590 | chunksize_s[chunksize_l] = '\0'; 591 | int object_chunk_size = strtoll(chunksize_s, &err, 10); 592 | if(*err){ 593 | printf("Error while parsing chunksize, non-covertible part: %s\r\n", err); 594 | rc = -1; 595 | goto out; 596 | } 597 | 598 | do { 599 | // Downloading from the object store 600 | snprintf(src_chunk_s, sizeof(src_chunk_s), "%s.%llu", src, file_offset / object_chunk_size); 601 | 
602 | char *compress_buf = NULL; 603 | int maximum_compressed_size = LZ4_compressBound(object_chunk_size); 604 | compress_buf = malloc(maximum_compressed_size); 605 | if (compress_buf == NULL) { 606 | rc = -ENOMEM; 607 | goto out; 608 | } 609 | 610 | double before_rados_read = ct_now(); 611 | int compressed_size = rados_read(io, src_chunk_s, compress_buf, maximum_compressed_size, 0); 612 | CT_TRACE("Rados read of %s took %fs", src_chunk_s, ct_now() - before_rados_read); 613 | 614 | char *uncompress_buf = NULL; 615 | uncompress_buf = malloc(object_chunk_size); 616 | if (uncompress_buf == NULL) { 617 | rc = -ENOMEM; 618 | goto out; 619 | } 620 | 621 | double before_checksum = ct_now(); 622 | unsigned char md5[MD5_DIGEST_LENGTH]; 623 | char md5_xattr[MD5_ASCII]; 624 | char md5_computed[MD5_ASCII]; 625 | MD5_CTX mdContext; 626 | MD5_Init (&mdContext); 627 | MD5_Update (&mdContext, compress_buf, compressed_size); 628 | MD5_Final (md5, &mdContext); 629 | int i; 630 | 631 | for(i = 0; i < MD5_DIGEST_LENGTH; i++){ 632 | sprintf(&md5_computed[i*2], "%02x", md5[i]); 633 | } 634 | CT_TRACE("Checksum of %s took %fs", src_chunk_s, ct_now() - before_checksum); 635 | 636 | int md5_l = rados_getxattr(io, src_chunk_s, "md5", md5_xattr, sizeof(md5_xattr)); 637 | if(md5_l < 0){ 638 | rc = -1; 639 | goto out; 640 | } 641 | 642 | if(strcmp(md5_computed, md5_xattr) != 0){ 643 | CT_ERROR(-EIO, "Bad MD5 checksum for %s, computed %s, expected %s", 644 | src_chunk_s, md5_computed, md5_xattr); 645 | return -EIO; 646 | } 647 | 648 | double before_decompression = ct_now(); 649 | int decompressed_size = LZ4_decompress_safe(compress_buf, uncompress_buf, compressed_size, object_chunk_size); 650 | CT_TRACE("Decompressing a chunk from %s of %i bytes took %fs and the uncompressed size is %i bytes", 651 | src, compressed_size, ct_now() - before_decompression, decompressed_size); 652 | if (decompressed_size < 0) { 653 | CT_ERROR(-1, "Decompression error"); 654 | rc = -1; 655 | goto out; 656 | } 657 | 
658 | double before_lustre_write = ct_now(); 659 | pwrite(dst_fd, uncompress_buf, decompressed_size, file_offset); 660 | CT_TRACE("Writing a chunk from %s of %i bytes offset %llu to lustre took %fs", 661 | src_chunk_s, object_chunk_size, file_offset, ct_now() - before_lustre_write); 662 | 663 | if (uncompress_buf != NULL) 664 | free(uncompress_buf); 665 | if (compress_buf != NULL) 666 | free(compress_buf); 667 | 668 | write_total += decompressed_size; 669 | file_offset += decompressed_size; 670 | he.offset = file_offset; 671 | he.length = decompressed_size; 672 | 673 | now = time(NULL); 674 | if (now >= last_report_time + ct_opt.o_report_int) { 675 | last_report_time = now; 676 | CT_TRACE("sending progress report for restoring %s", src); 677 | rc = llapi_hsm_action_progress(hcp, &he, length, 0); 678 | if (rc < 0) { 679 | /* Action has been canceled or something wrong 680 | * is happening. Stop copying data. */ 681 | CT_ERROR(rc, "progress ioctl for copy '%s'->'%s' failed", 682 | src, dst); 683 | goto out; 684 | } 685 | } 686 | rc = 0; 687 | } while (file_offset < length); 688 | 689 | if (hai->hai_action == HSMA_RESTORE) { 690 | /* 691 | * truncate restored file 692 | * size is taken from the archive this is done to support 693 | * restore after a force release which leaves the file with the 694 | * wrong size (can big bigger than the new size) 695 | * make sure the file is on disk before reporting success. 
696 | */ 697 | rc = ftruncate(dst_fd, length); 698 | if (rc < 0) { 699 | rc = -errno; 700 | CT_ERROR(rc, "cannot truncate '%s' to size %llu", 701 | dst, length); 702 | err_major++; 703 | } 704 | } 705 | 706 | out: 707 | CT_TRACE("copied "LPU64" bytes in %f seconds", 708 | length, ct_now() - start_ct_now); 709 | 710 | return rc; 711 | } 712 | 713 | int ct_archive(const struct hsm_action_item *hai, const long hal_flags) 714 | { 715 | struct hsm_copyaction_private *hcp = NULL; 716 | char src[PATH_MAX]; 717 | char dst[PATH_MAX] = ""; 718 | int rc; 719 | int rcf = 0; 720 | int hp_flags = 0; 721 | int open_flags; 722 | int src_fd = -1; 723 | 724 | rc = ct_begin(&hcp, hai); 725 | if (rc < 0) 726 | goto end_ct_archive; 727 | 728 | /* we fill archive so: 729 | * source = data FID 730 | * destination = lustre FID 731 | */ 732 | ct_path_lustre(src, sizeof(src), ct_opt.o_mnt, &hai->hai_dfid); 733 | ct_path_archive(dst, sizeof(dst), &hai->hai_fid); 734 | 735 | CT_TRACE("archiving '%s' to '%s'", src, dst); 736 | 737 | if (ct_opt.o_dry_run) { 738 | rc = 0; 739 | goto end_ct_archive; 740 | } 741 | 742 | src_fd = llapi_hsm_action_get_fd(hcp); 743 | if (src_fd < 0) { 744 | rc = src_fd; 745 | CT_ERROR(rc, "cannot open '%s' for read", src); 746 | goto end_ct_archive; 747 | } 748 | 749 | open_flags = O_WRONLY | O_NOFOLLOW; 750 | /* If extent is specified, don't truncate an old archived copy */ 751 | open_flags |= ((hai->hai_extent.length == -1) ? 
O_TRUNC : 0) | O_CREAT; 752 | 753 | rc = ct_archive_data(hcp, src, dst, src_fd, hai, hal_flags); 754 | if (rc < 0) { 755 | CT_ERROR(rc, "data copy failed from '%s' to '%s'", src, dst); 756 | goto end_ct_archive; 757 | } 758 | 759 | CT_TRACE("data archiving for '%s' to '%s' done", src, dst); 760 | 761 | end_ct_archive: 762 | err_major++; 763 | 764 | unlink(dst); 765 | if (ct_is_retryable(rc)) 766 | hp_flags |= HP_FLAG_RETRY; 767 | 768 | rcf = rc; 769 | 770 | if (!(src_fd < 0)) 771 | close(src_fd); 772 | 773 | rc = ct_action_done(&hcp, hai, hp_flags, rcf); 774 | 775 | return rc; 776 | } 777 | 778 | int ct_restore(const struct hsm_action_item *hai, const long hal_flags) 779 | { 780 | struct hsm_copyaction_private *hcp = NULL; 781 | struct lu_fid dfid; 782 | char src[PATH_MAX]; 783 | char dst[PATH_MAX]; 784 | int rc; 785 | int hp_flags = 0; 786 | int dst_fd = -1; 787 | int mdt_index = -1; 788 | int open_flags = 0; 789 | /* we fill lustre so: 790 | * source = lustre FID in the backend 791 | * destination = data FID = volatile file 792 | */ 793 | 794 | /* build backend file name from released file FID */ 795 | ct_path_archive(src, sizeof(src), &hai->hai_fid); 796 | 797 | rc = llapi_get_mdt_index_by_fid(ct_opt.o_mnt_fd, &hai->hai_fid, 798 | &mdt_index); 799 | if (rc < 0) { 800 | CT_ERROR(rc, "cannot get mdt index "DFID"", 801 | PFID(&hai->hai_fid)); 802 | return rc; 803 | } 804 | 805 | rc = ct_begin_restore(&hcp, hai, mdt_index, open_flags); 806 | if (rc < 0) 807 | goto end_ct_restore; 808 | 809 | /* get the FID of the volatile file */ 810 | rc = llapi_hsm_action_get_dfid(hcp, &dfid); 811 | if (rc < 0) { 812 | CT_ERROR(rc, "restoring "DFID 813 | ", cannot get FID of created volatile file", 814 | PFID(&hai->hai_fid)); 815 | goto end_ct_restore; 816 | } 817 | 818 | /* build volatile "file name", for messages */ 819 | snprintf(dst, sizeof(dst), "{VOLATILE}="DFID, PFID(&dfid)); 820 | 821 | CT_TRACE("restoring data from '%s' to '%s'", src, dst); 822 | 823 | if 
(ct_opt.o_dry_run) { 824 | rc = 0; 825 | goto end_ct_restore; 826 | } 827 | 828 | dst_fd = llapi_hsm_action_get_fd(hcp); 829 | if (dst_fd < 0) { 830 | rc = dst_fd; 831 | CT_ERROR(rc, "cannot open '%s' for write", dst); 832 | goto end_ct_restore; 833 | } 834 | 835 | rc = ct_restore_data(hcp, src, dst, dst_fd, hai, hal_flags); 836 | if (rc < 0) { 837 | CT_ERROR(rc, "cannot copy data from '%s' to '%s'", 838 | src, dst); 839 | err_major++; 840 | if (ct_is_retryable(rc)) 841 | hp_flags |= HP_FLAG_RETRY; 842 | goto end_ct_restore; 843 | } 844 | 845 | CT_TRACE("data restore from '%s' to '%s' done", src, dst); 846 | 847 | end_ct_restore: 848 | rc = ct_action_done(&hcp, hai, hp_flags, rc); 849 | 850 | /* object swaping is done by cdt at copy end, so close of volatile file 851 | * cannot be done before */ 852 | 853 | if (!(dst_fd < 0)) 854 | close(dst_fd); 855 | 856 | return rc; 857 | } 858 | 859 | int ct_remove(const struct hsm_action_item *hai, const long hal_flags) 860 | { 861 | struct hsm_copyaction_private *hcp = NULL; 862 | char dst[PATH_MAX]; 863 | int rc; 864 | char dst_s[OID_LENGTH]; 865 | 866 | rc = ct_begin(&hcp, hai); 867 | if (rc < 0) 868 | goto end_ct_remove; 869 | 870 | ct_path_archive(dst, sizeof(dst), &hai->hai_fid); 871 | 872 | CT_TRACE("removing file '%s'", dst); 873 | 874 | if (ct_opt.o_dry_run) { 875 | rc = 0; 876 | goto end_ct_remove; 877 | } 878 | 879 | // Metadata from the first chunk 880 | snprintf(dst_s, sizeof(dst_s), "%s.0", dst); 881 | char* err; 882 | 883 | char totallength_s[TOTALLENGTH]; 884 | char chunksize_s[TOTALLENGTH]; 885 | 886 | int totallength_l = rados_getxattr(io, dst_s, "totallength", totallength_s, sizeof(totallength_s)); 887 | if(totallength_l < 0){ 888 | rc = -1; 889 | goto end_ct_remove; 890 | } 891 | totallength_s[totallength_l] = '\0'; 892 | __u64 length = strtoll(totallength_s, &err, 10); 893 | if(*err){ 894 | printf("Error while parsing totallength, non-covertible part: %s\r\n", err); 895 | rc = -1; 896 | goto end_ct_remove; 
897 | } 898 | 899 | int chunksize_l = rados_getxattr(io, dst_s, "chunksize", chunksize_s, sizeof(chunksize_s)); 900 | if(chunksize_l < 0){ 901 | rc = -1; 902 | goto end_ct_remove; 903 | } 904 | chunksize_s[chunksize_l] = '\0'; 905 | int object_chunk_size = strtoll(chunksize_s, &err, 10); 906 | if(*err){ 907 | printf("Error while parsing totallength, non-covertible part: %s\r\n", err); 908 | rc = -1; 909 | goto end_ct_remove; 910 | } 911 | 912 | int chunk; 913 | for (chunk = length / object_chunk_size; chunk >= 0; chunk--) { 914 | snprintf(dst_s, sizeof(dst_s), "%s.%i", dst, chunk); 915 | 916 | CT_TRACE("Deleting chunk '%s'", dst_s); 917 | rados_remove(io, dst_s); 918 | } 919 | rc = 0; 920 | 921 | end_ct_remove: 922 | rc = ct_action_done(&hcp, hai, 0, rc); 923 | 924 | return rc; 925 | } 926 | 927 | int ct_cancel(const struct hsm_action_item *hai, const long hal_flags) 928 | { 929 | CT_TRACE("cancel not implemented for file system '%s'", ct_opt.o_mnt); 930 | /* Don't report progress to coordinator for this cookie: 931 | * the copy function will get ECANCELED when reporting 932 | * progress. 
*/ 933 | return 0; 934 | } 935 | 936 | static int ct_rados_cleanup(void) 937 | { 938 | int rc = 0; 939 | 940 | rc = ct_cleanup(); 941 | if (rc == 0) { 942 | rados_shutdown(cluster); 943 | } 944 | 945 | return rc; 946 | } 947 | 948 | int main(int argc, char **argv) 949 | { 950 | int rc; 951 | 952 | strlcpy(cmd_name, basename(argv[0]), sizeof(cmd_name)); 953 | rc = ct_parseopts(argc, argv); 954 | if (rc < 0) { 955 | CT_WARN("try '%s --help' for more information", cmd_name); 956 | return -rc; 957 | } 958 | 959 | rc = ct_setup(); 960 | if (rc < 0) 961 | goto error_cleanup; 962 | 963 | rc = ct_run(); 964 | 965 | error_cleanup: 966 | ct_rados_cleanup(); 967 | 968 | return -rc; 969 | } 970 | 971 | -------------------------------------------------------------------------------- /src/lhsmtool_s3.c: -------------------------------------------------------------------------------- 1 | /* 2 | * GPL HEADER START 3 | * 4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 | * 6 | * This program is free software; you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License version 2 only, 8 | * as published by the Free Software Foundation. 9 | * 10 | * This program is distributed in the hope that it will be useful, but 11 | * WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | * General Public License version 2 for more details (a copy is included 14 | * in the LICENSE file that accompanied this code). 
15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * version 2 along with this program; If not, see 18 | * http://www.gnu.org/licenses/gpl-2.0.htm 19 | * 20 | * GPL HEADER END 21 | */ 22 | /* 23 | * Copyright (c) 2015, 2016, Universite Laval 24 | * Authors: Simon Guilbault, Frederick Lefebvre 25 | * 26 | * 27 | * Part of this file include code from file lhsmtool_posix.c (licensed under 28 | * a GPLv2 license) that can be found in Lustre's git repository here : 29 | * git://git.hpdd.intel.com/fs/lustre-release.git 30 | */ 31 | /* HSM copytool program for S3 object storage. 32 | * 33 | * An HSM copytool daemon acts on action requests from Lustre to copy files 34 | * to and from an HSM archive system. 35 | * 36 | */ 37 | 38 | #ifndef _GNU_SOURCE 39 | #define _GNU_SOURCE 40 | #endif 41 | 42 | #include "lhsmtool_s3.h" 43 | #include "ct_common.h" 44 | 45 | #include 46 | #include 47 | #include 48 | #include 49 | #include 50 | #include 51 | #include 52 | #include 53 | #include 54 | #include 55 | #include 56 | #include 57 | #include 58 | #include 59 | #include 60 | #include 61 | #include 62 | #include 63 | #include 64 | #include 65 | #include 66 | #include 67 | #include /* To get strlcpy */ 68 | 69 | /* everything else is zeroed */ 70 | //struct ct_s3_options ct_s3_opt = { 71 | // .o_verbose = LLAPI_MSG_INFO, 72 | // .o_report_int = REPORT_INTERVAL_DEFAULT, 73 | // .o_config = "config.cfg", 74 | //}; 75 | extern struct ct_options ct_opt; 76 | 77 | S3Status responsePropertiesCallback(const S3ResponseProperties *properties, 78 | void *callbackData) 79 | { 80 | uint i; 81 | get_object_callback_data *data = (get_object_callback_data *) callbackData; 82 | 83 | assert(data && properties); 84 | 85 | data->contentLength = properties->contentLength; 86 | 87 | if(properties->eTag){ 88 | strncpy(data->md5, properties->eTag+1, MD5_ASCII - 1); 89 | } 90 | 91 | for (i = 0; i < properties->metaDataCount; i++) { 92 | if 
(strcmp(properties->metaData[i].name, "chunksize") == 0) { 93 | char* err; 94 | data->chunk_size = strtoll(properties->metaData[i].value, &err, 10); 95 | if(*err){ 96 | printf("Error while parsing chunk_size, non-covertible part: %s\r\n", err); 97 | return S3StatusAbortedByCallback; 98 | } 99 | } 100 | else if (strcmp(properties->metaData[i].name, "totallength") == 0) { 101 | char* err; 102 | data->totalLength = strtoll(properties->metaData[i].value, &err, 10); 103 | if(*err){ 104 | printf("Error while parsing totalLength, non-covertible part: %s\r\n", err); 105 | return S3StatusAbortedByCallback; 106 | } 107 | } 108 | } 109 | return S3StatusOK; 110 | } 111 | 112 | static void getResponseCompleteCallback(S3Status status, 113 | const S3ErrorDetails *error, 114 | void *callbackData) 115 | { 116 | get_object_callback_data *data = (get_object_callback_data *) callbackData; 117 | assert(data); 118 | data->status = status; 119 | return; 120 | } 121 | 122 | static void putResponseCompleteCallback(S3Status status, 123 | const S3ErrorDetails *error, 124 | void *callbackData) 125 | { 126 | put_object_callback_data *data = (put_object_callback_data *) callbackData; 127 | assert(data); 128 | data->status = status; 129 | return; 130 | } 131 | 132 | static int putObjectDataCallback(int bufferSize, char *buffer, 133 | void *callbackData) 134 | { 135 | put_object_callback_data *data = (put_object_callback_data *) callbackData; 136 | int size = 0; 137 | 138 | assert(data && buffer); 139 | 140 | if (data->contentLength) { 141 | if (data->contentLength > bufferSize) { 142 | // Limited by bufferSize 143 | size = bufferSize; 144 | } 145 | else { 146 | // Last chunk 147 | size = data->contentLength; 148 | } 149 | memcpy(buffer, data->buffer + data->buffer_offset, size); 150 | data->buffer_offset += size; 151 | } 152 | 153 | data->contentLength -= size; 154 | return size; 155 | } 156 | 157 | static S3Status getObjectDataCallback(int bufferSize, const char *buffer, 158 | void 
*callbackData) 159 | { 160 | get_object_callback_data *data = (get_object_callback_data *) callbackData; 161 | int size = 0; 162 | 163 | assert(data && buffer); 164 | 165 | if (data->contentLength) { 166 | if (data->buffer == NULL) { 167 | // Allocate memory for the buffer 168 | data->buffer = malloc(data->contentLength); 169 | if (data->buffer == NULL) { 170 | // Allocation error 171 | return S3StatusAbortedByCallback; 172 | } 173 | } 174 | 175 | if (data->contentLength > bufferSize) { 176 | // Limited by bufferSize 177 | size = bufferSize; 178 | } 179 | else { 180 | // Last chunk 181 | size = data->contentLength; 182 | } 183 | memcpy(data->buffer + data->buffer_offset, buffer, size); 184 | data->buffer_offset += size; 185 | } 186 | return ((size < (size_t) bufferSize) ? S3StatusAbortedByCallback : S3StatusOK); 187 | } 188 | 189 | static void getBucketName(int bucketNameSize, 190 | char *bucketName, 191 | char *objectName) 192 | { 193 | /* This function will hash the object name and modify bucketName to 194 | * point the request to the correct bucket. 
The number of shard is defined 195 | * with bucket_count */ 196 | if (bucket_count > 1) { 197 | unsigned long bucket_id = hash(objectName) % bucket_count; 198 | snprintf(bucketName, bucketNameSize, "%s_%lu", bucket_prefix, bucket_id); 199 | } 200 | else { 201 | // do not add a numbered suffix to the bucket name 202 | snprintf(bucketName, bucketNameSize, "%s", bucket_prefix); 203 | } 204 | } 205 | 206 | static int get_s3_object(char *objectName, 207 | get_object_callback_data *data, 208 | S3GetObjectHandler *getObjectHandler){ 209 | 210 | assert(objectName && data && getObjectHandler); 211 | memset(data, 0, sizeof(get_object_callback_data)); 212 | 213 | char bucket_name[S3_MAX_BUCKET_NAME_SIZE]; 214 | getBucketName(sizeof(bucket_name), bucket_name, objectName); 215 | 216 | // Get a local copy of the general bucketContext than overwrite the 217 | // pointer to the bucket_name 218 | S3BucketContext localbucketContext; 219 | memcpy(&localbucketContext, &bucketContext, sizeof(S3BucketContext)); 220 | localbucketContext.bucketName = bucket_name; 221 | 222 | data->buffer_offset = 0; 223 | data->buffer = NULL; 224 | 225 | double before_s3_get = ct_now(); 226 | int retry_count = RETRYCOUNT; 227 | 228 | do { 229 | S3_get_object(&localbucketContext, objectName, NULL, 0, 0, NULL, getObjectHandler, data); 230 | } while (S3_status_is_retryable(data->status) && should_retry(&retry_count)); 231 | 232 | CT_TRACE("S3 get of %s took %fs", objectName, ct_now() - before_s3_get); 233 | 234 | if (data->buffer == NULL) { 235 | return -ENOMEM; 236 | } 237 | if (data->status != S3StatusOK) { 238 | CT_ERROR(-EIO, "S3Error %s", S3_get_status_name(data->status)); 239 | return -EIO; 240 | } 241 | 242 | double before_checksum = ct_now(); 243 | unsigned char md5[MD5_DIGEST_LENGTH]; 244 | char md5_s[MD5_ASCII]; 245 | MD5_CTX mdContext; 246 | MD5_Init (&mdContext); 247 | MD5_Update (&mdContext, data->buffer, data->contentLength); 248 | MD5_Final (md5, &mdContext); 249 | int i; 250 | 251 | for(i = 
0; i < MD5_DIGEST_LENGTH; i++){ 252 | sprintf(&md5_s[i*2], "%02x", md5[i]); 253 | } 254 | 255 | if(strcmp(md5_s, data->md5) != 0){ 256 | CT_ERROR(-EIO, "Bad MD5 checksum for %s, computed %s, expected %s", 257 | objectName, md5_s, data->md5); 258 | return -EIO; 259 | } 260 | CT_TRACE("Checksum of %s took %fs", objectName, ct_now() - before_checksum); 261 | return 0; 262 | } 263 | 264 | static void usage(const char *name, int rc) 265 | { 266 | //TODO correct the usage help for s3 267 | fprintf(stdout, 268 | " Usage: %s [options]... \n" 269 | "The Lustre HSM S3 copy tool can be used as a daemon or " 270 | "as a command line tool\n" 271 | "The Lustre HSM daemon acts on action requests from Lustre\n" 272 | "to copy files to and from an HSM archive system.\n" 273 | " --daemon Daemon mode, run in background\n" 274 | " Options:\n" 275 | "The Lustre HSM tool performs administrator-type actions\n" 276 | "on a Lustre HSM archive.\n" 277 | " --abort-on-error Abort operation on major error\n" 278 | " -A, --archive <#> Archive number (repeatable)\n" 279 | " -c, --config Path to the config file\n" 280 | " --dry-run Don't run, just show what would be done\n" 281 | " -f, --event-fifo Write events stream to fifo\n" 282 | " -q, --quiet Produce less verbose output\n" 283 | " -u, --update-interval Interval between progress reports sent\n" 284 | " to Coordinator\n" 285 | " -v, --verbose Produce more verbose output\n", 286 | cmd_name); 287 | 288 | exit(rc); 289 | } 290 | 291 | static int ct_parseopts(int argc, char * const *argv) 292 | { 293 | struct option long_opts[] = { 294 | {"abort-on-error", no_argument, &ct_opt.o_abort_on_error, 1}, 295 | {"abort_on_error", no_argument, &ct_opt.o_abort_on_error, 1}, 296 | {"archive", required_argument, NULL, 'A'}, 297 | {"config", required_argument, NULL, 'c'}, 298 | {"daemon", no_argument, &ct_opt.o_daemonize, 1}, 299 | {"event-fifo", required_argument, NULL, 'f'}, 300 | {"event_fifo", required_argument, NULL, 'f'}, 301 | {"dry-run", no_argument, 
&ct_opt.o_dry_run, 1}, 302 | {"help", no_argument, NULL, 'h'}, 303 | {"quiet", no_argument, NULL, 'q'}, 304 | {"rebind", no_argument, NULL, 'r'}, 305 | {"update-interval", required_argument, NULL, 'u'}, 306 | {"update_interval", required_argument, NULL, 'u'}, 307 | {"verbose", no_argument, NULL, 'v'}, 308 | {0, 0, 0, 0} 309 | }; 310 | int c, rc; 311 | config_t cfg; 312 | const char *config_str; 313 | 314 | optind = 0; 315 | while ((c = getopt_long(argc, argv, "A:b:c:f:hp:qu:v", 316 | long_opts, NULL)) != -1) { 317 | switch (c) { 318 | case 'A': 319 | if ((ct_opt.o_archive_cnt >= LL_HSM_MAX_ARCHIVE) || 320 | (atoi(optarg) >= LL_HSM_MAX_ARCHIVE)) { 321 | rc = -E2BIG; 322 | CT_ERROR(rc, "archive number must be less" 323 | "than %zu", LL_HSM_MAX_ARCHIVE); 324 | return rc; 325 | } 326 | ct_opt.o_archive_id[ct_opt.o_archive_cnt] = atoi(optarg); 327 | ct_opt.o_archive_cnt++; 328 | break; 329 | case 'b': /* -b and -c have both a number with unit as arg */ 330 | case 'c': 331 | ct_opt.o_config = optarg; 332 | break; 333 | case 'f': 334 | ct_opt.o_event_fifo = optarg; 335 | break; 336 | case 'h': 337 | usage(argv[0], 0); 338 | case 'q': 339 | ct_opt.o_verbose--; 340 | break; 341 | case 'u': 342 | ct_opt.o_report_int = atoi(optarg); 343 | if (ct_opt.o_report_int < 0) { 344 | rc = -EINVAL; 345 | CT_ERROR(rc, "bad value for -%c '%s'", c, optarg); 346 | return rc; 347 | } 348 | break; 349 | case 'v': 350 | ++ct_opt.o_verbose; 351 | break; 352 | case 0: 353 | break; 354 | default: 355 | return -EINVAL; 356 | } 357 | } 358 | 359 | if (argc != optind + 1) { 360 | rc = -EINVAL; 361 | CT_ERROR(rc, "no mount point specified"); 362 | return rc; 363 | } 364 | 365 | ct_opt.o_mnt = argv[optind]; 366 | ct_opt.o_mnt_fd = -1; 367 | 368 | CT_TRACE("mount_point=%s", ct_opt.o_mnt); 369 | 370 | config_init(&cfg); 371 | if (! 
config_read_file(&cfg, ct_opt.o_config)) { 372 | CT_ERROR(-EINVAL, "error while reading config file\r\n%s:%d - %s", 373 | config_error_file(&cfg), 374 | config_error_line(&cfg), 375 | config_error_text(&cfg)); 376 | return -EINVAL; 377 | } 378 | 379 | if (config_lookup_string(&cfg, "access_key", &config_str)) { 380 | strncpy(access_key, config_str, sizeof(access_key)); 381 | } 382 | else { 383 | CT_ERROR(-EINVAL, "could not find access_key"); 384 | return -EINVAL; 385 | } 386 | 387 | if (config_lookup_string(&cfg, "secret_key", &config_str)) { 388 | strncpy(secret_key, config_str, sizeof(secret_key)); 389 | } 390 | else { 391 | CT_ERROR(-EINVAL, "could not find secret_key"); 392 | return -EINVAL; 393 | } 394 | 395 | if (config_lookup_string(&cfg, "host", &config_str)) { 396 | strncpy(host, config_str, sizeof(host)); 397 | } 398 | else { 399 | CT_ERROR(-EINVAL, "could not find host"); 400 | return -EINVAL; 401 | } 402 | 403 | if (config_lookup_string(&cfg, "bucket_prefix", &config_str)) { 404 | strncpy(bucket_prefix, config_str, sizeof(host)); 405 | } 406 | else { 407 | CT_ERROR(-EINVAL, "could not find bucket_prefix"); 408 | return -EINVAL; 409 | } 410 | 411 | if (config_lookup_int(&cfg, "chunk_size", &chunk_size)) { 412 | if (chunk_size < 0) { 413 | CT_ERROR(-EINVAL, "chunk_size cannot be negative"); 414 | return -EINVAL; 415 | } 416 | } 417 | else { 418 | CT_ERROR(-EINVAL, "could not find chunk_size"); 419 | return -EINVAL; 420 | } 421 | 422 | if (config_lookup_int(&cfg, "bucket_count", &bucket_count)) { 423 | if (bucket_count < 0) { 424 | CT_ERROR(-EINVAL, "bucket_count cannot be negative"); 425 | return -EINVAL; 426 | } 427 | } 428 | else { 429 | CT_ERROR(-EINVAL, "could not find bucket_count"); 430 | return -EINVAL; 431 | } 432 | 433 | int ssl_enabled; 434 | if (config_lookup_bool(&cfg, "ssl", &ssl_enabled)){ 435 | if(ssl_enabled){ 436 | bucketContext.protocol = S3ProtocolHTTPS; 437 | } 438 | else{ 439 | bucketContext.protocol = S3ProtocolHTTP; 440 | } 441 | } 
442 | else{ 443 | CT_ERROR(-EINVAL, "could not find ssl"); 444 | return -EINVAL; 445 | } 446 | 447 | return 0; 448 | } 449 | 450 | static int ct_archive_data(struct hsm_copyaction_private *hcp, const char *src, 451 | const char *dst, int src_fd, 452 | const struct hsm_action_item *hai, long hal_flags) 453 | { 454 | struct hsm_extent he; 455 | __u64 file_offset = hai->hai_extent.offset; 456 | struct stat src_st; 457 | char *uncompress_buf = NULL; 458 | char *compress_buf = NULL; 459 | __u64 write_total = 0; 460 | __u64 length = hai->hai_extent.length; 461 | time_t last_report_time; 462 | int rc = 0; 463 | double start_ct_now = ct_now(); 464 | time_t now; 465 | int compression_bound = LZ4_compressBound(chunk_size); 466 | 467 | // Archiving a file from Lustre to the object store 468 | CT_TRACE("Archiving %s to %s", src, dst); 469 | if (fstat(src_fd, &src_st) < 0) { 470 | rc = -errno; 471 | CT_ERROR(rc, "cannot stat '%s'", src); 472 | return rc; 473 | } 474 | 475 | if (!S_ISREG(src_st.st_mode)) { 476 | rc = -EINVAL; 477 | CT_ERROR(rc, "'%s' is not a regular file", src); 478 | return rc; 479 | } 480 | 481 | if (hai->hai_extent.offset > (__u64)src_st.st_size) { 482 | rc = -EINVAL; 483 | CT_ERROR(rc, "Trying to start reading past end ("LPU64" > " 484 | "%jd) of '%s' source file", hai->hai_extent.offset, 485 | (intmax_t)src_st.st_size, src); 486 | return rc; 487 | } 488 | 489 | strippingInfo stripping_params; 490 | stripping_params.lmm_stripe_count = 1; 491 | stripping_params.lmm_stripe_size = ONE_MB; 492 | 493 | if (ct_save_stripe(src_fd, src, &stripping_params)) { 494 | return -1; 495 | } 496 | 497 | /* Don't read beyond a given extent */ 498 | if (length > src_st.st_size - hai->hai_extent.offset) 499 | length = src_st.st_size - hai->hai_extent.offset; 500 | 501 | last_report_time = time(NULL); 502 | 503 | he.offset = file_offset; 504 | he.length = 0; 505 | rc = llapi_hsm_action_progress(hcp, &he, length, 0); 506 | if (rc < 0) { 507 | /* Action has been canceled or 
something wrong 508 | * is happening. Stop copying data. */ 509 | CT_ERROR(rc, "progress ioctl for copy '%s'->'%s' failed", 510 | src, dst); 511 | goto out; 512 | } 513 | 514 | errno = 0; 515 | 516 | uncompress_buf = malloc(chunk_size); 517 | if (uncompress_buf == NULL) { 518 | rc = -ENOMEM; 519 | goto out; 520 | } 521 | 522 | compress_buf = malloc(compression_bound); 523 | if (compress_buf == NULL) { 524 | rc = -ENOMEM; 525 | goto out; 526 | } 527 | 528 | int chunk_id = -1; 529 | 530 | const char totalLength[] = "totallength"; 531 | const char chunksize[] = "chunksize"; 532 | const char stripe_size[] = "stripesize"; 533 | const char stripe_count[] = "stripecount"; 534 | const char path[] = "path"; 535 | const char uid[] = "uid"; 536 | const char gid[] = "gid"; 537 | 538 | char totalLength_s[TOTALLENGTH]; 539 | char chunksize_s[TOTALLENGTH]; 540 | char stripe_size_s[TOTALLENGTH]; 541 | char stripe_count_s[TOTALLENGTH]; 542 | char path_s[PATH_MAX]; 543 | char uid_s[TOTALLENGTH]; 544 | char gid_s[TOTALLENGTH]; 545 | 546 | snprintf(totalLength_s, sizeof(totalLength_s), "%llu", length); 547 | snprintf(chunksize_s, sizeof(chunksize_s), "%i", chunk_size); 548 | snprintf(stripe_size_s, sizeof(stripe_size_s), "%i", stripping_params.lmm_stripe_size); 549 | snprintf(stripe_count_s, sizeof(stripe_count_s), "%i", stripping_params.lmm_stripe_count); 550 | snprintf(path_s, sizeof(path_s), "%s", src); // FIXME should use fid2path to get the normal path 551 | snprintf(uid_s, sizeof(uid_s), "%i", src_st.st_uid); 552 | snprintf(gid_s, sizeof(gid_s), "%i", src_st.st_gid); 553 | 554 | // Saving some metadata for disaster recovery 555 | S3NameValue metadata[7] = 556 | { 557 | { 558 | totalLength, 559 | totalLength_s, 560 | }, 561 | { 562 | chunksize, 563 | chunksize_s, 564 | }, 565 | { 566 | stripe_size, 567 | stripe_size_s, 568 | }, 569 | { 570 | stripe_count, 571 | stripe_count_s, 572 | }, 573 | { 574 | path, 575 | path_s 576 | }, 577 | { 578 | uid, 579 | uid_s 580 | }, 581 | { 582 | 
gid, 583 | gid_s 584 | } 585 | }; 586 | 587 | S3PutProperties putProperties = 588 | { 589 | // application/x-lz4 does not officially exist 590 | "application/x-lz4", // contentType 591 | NULL, // md5 592 | NULL, // cacheControl 593 | NULL, // contentDispositionFilename 594 | NULL, // contentEncoding 595 | -1, // expires 596 | 0, // cannedAcl 597 | sizeof(metadata) / sizeof(S3NameValue), // metaDataCount 598 | metadata, // S3NameValue *metaData 599 | 0, // useServerSideEncryption 600 | }; 601 | 602 | do { 603 | // Uploading to object store 604 | 605 | if (chunk_id == -1) { 606 | CT_TRACE("start copy of "LPU64" bytes from '%s' to '%s'", 607 | length, src, dst); 608 | } 609 | 610 | // size of the current chunk, limited by chunk_size 611 | long long unsigned int chunk; 612 | 613 | if (length - write_total > chunk_size) { 614 | // upper bound is the chunk_size 615 | chunk = chunk_size; 616 | } 617 | else { 618 | // limited by the file 619 | chunk = length - write_total; 620 | } 621 | 622 | chunk_id = file_offset / chunk_size; 623 | 624 | put_object_callback_data data; 625 | 626 | data.buffer_offset = 0; 627 | double before_lustre_read = ct_now(); 628 | pread(src_fd, uncompress_buf, chunk, file_offset); 629 | CT_TRACE("Reading a chunk from %s of %llu bytes offset %llu from lustre took %fs", 630 | src, chunk, file_offset, ct_now() - before_lustre_read); 631 | 632 | double before_compression = ct_now(); 633 | int compressed_size = LZ4_compress_default(uncompress_buf, compress_buf, chunk, compression_bound); 634 | CT_TRACE("Compressing a chunk from %s took %fs and the compressed size is %i bytes", 635 | src, ct_now() - before_compression, compressed_size); 636 | 637 | if (compressed_size <= 0) { 638 | rc = -1; 639 | CT_ERROR(rc, "Compression error"); 640 | goto out; 641 | } 642 | data.contentLength = compressed_size; 643 | data.buffer = compress_buf; 644 | 645 | S3PutObjectHandler putObjectHandler = 646 | { 647 | putResponseHandler, 648 | &putObjectDataCallback 649 | }; 650 
| 651 | char dst_chunk_s[S3_MAX_KEY_SIZE]; 652 | snprintf(dst_chunk_s, sizeof(dst_chunk_s), "%s.%i", dst, chunk_id); 653 | 654 | char bucket_name[S3_MAX_BUCKET_NAME_SIZE]; 655 | getBucketName(sizeof(bucket_name), bucket_name, dst_chunk_s); 656 | 657 | // Get a local copy of the general bucketContext than overwrite the 658 | // pointer to the bucket_name 659 | S3BucketContext localbucketContext; 660 | memcpy(&localbucketContext, &bucketContext, sizeof(S3BucketContext)); 661 | localbucketContext.bucketName = bucket_name; 662 | 663 | double before_s3_put = ct_now(); 664 | int retry_count = RETRYCOUNT; 665 | do { 666 | S3_put_object(&localbucketContext, dst_chunk_s, compressed_size, &putProperties, NULL, &putObjectHandler, &data); 667 | } while (S3_status_is_retryable(data.status) && should_retry(&retry_count)); 668 | CT_TRACE("S3 put of %s took %fs", 669 | dst_chunk_s, ct_now() - before_s3_put); 670 | 671 | if (data.status != S3StatusOK) { 672 | rc = -EIO; 673 | CT_ERROR(rc, "S3Error %s", S3_get_status_name(data.status)); 674 | goto out; 675 | } 676 | 677 | he.offset = file_offset; 678 | he.length = chunk; 679 | 680 | now = time(NULL); 681 | if (now >= last_report_time + ct_opt.o_report_int) { 682 | last_report_time = now; 683 | CT_TRACE("sending progress report for archiving %s", src); 684 | rc = llapi_hsm_action_progress(hcp, &he, length, 0); 685 | if (rc < 0) { 686 | /* Action has been canceled or something wrong 687 | * is happening. Stop copying data. 
*/ 688 | CT_ERROR(rc, "progress ioctl for copy '%s'->'%s' failed", 689 | src, dst); 690 | goto out; 691 | } 692 | } 693 | 694 | write_total += chunk; 695 | file_offset += chunk; 696 | } while (file_offset < length); 697 | rc = 0; 698 | 699 | // We need to delete every chunk of higher chunk_id if they 700 | // exists, this can happen if the new file is smaller 701 | // TODO only delete objects if this is a dirty write 702 | 703 | chunk_id += 1; 704 | do { 705 | char dst_s[S3_MAX_KEY_SIZE]; 706 | int retry_count; 707 | 708 | snprintf(dst_s, sizeof(dst_s), "%s.%i", dst, chunk_id); 709 | get_object_callback_data head_data; 710 | get_object_callback_data delete_data; 711 | 712 | char bucket_name[S3_MAX_BUCKET_NAME_SIZE]; 713 | getBucketName(sizeof(bucket_name), bucket_name, dst_s); 714 | 715 | // Get a local copy of the general bucketContext than overwrite the 716 | // pointer to the bucket_name 717 | S3BucketContext localbucketContext; 718 | memcpy(&localbucketContext, &bucketContext, sizeof(S3BucketContext)); 719 | localbucketContext.bucketName = bucket_name; 720 | 721 | CT_TRACE("Checking if chunk %i exists", chunk_id); 722 | retry_count = RETRYCOUNT; 723 | do { 724 | S3_head_object(&localbucketContext, dst_s, NULL, &headResponseHandler, &head_data); 725 | } while (S3_status_is_retryable(head_data.status) && should_retry(&retry_count)); 726 | 727 | if (head_data.status == S3StatusHttpErrorNotFound) { 728 | // Object do not exist, this mean we stop deleting chunks 729 | CT_TRACE("Chunk %i do not exists", chunk_id); 730 | break; 731 | } 732 | 733 | if (head_data.status != S3StatusOK) { 734 | rc = -EIO; 735 | CT_ERROR(rc, "S3Error %s", S3_get_status_name(head_data.status)); 736 | goto out; 737 | } 738 | 739 | CT_TRACE("Deleting chunk %i", chunk_id); 740 | retry_count = RETRYCOUNT; 741 | do { 742 | S3_delete_object(&localbucketContext, dst_s, NULL, &deleteResponseHandler, &delete_data); 743 | } while (S3_status_is_retryable(delete_data.status) && 
should_retry(&retry_count)); 744 | 745 | if (delete_data.status != S3StatusOK) { 746 | rc = -EIO; 747 | CT_ERROR(rc, "S3Error %s", S3_get_status_name(delete_data.status)); 748 | goto out; 749 | } 750 | 751 | chunk_id++; 752 | } while (true); 753 | 754 | out: 755 | if (uncompress_buf != NULL) 756 | free(uncompress_buf); 757 | if (compress_buf != NULL) 758 | free(compress_buf); 759 | 760 | CT_TRACE("copied "LPU64" bytes in %f seconds", 761 | length, ct_now() - start_ct_now); 762 | 763 | return rc; 764 | } 765 | 766 | static int ct_restore_data(struct hsm_copyaction_private *hcp, const char *src, 767 | const char *dst, int dst_fd, 768 | const struct hsm_action_item *hai, long hal_flags) 769 | { 770 | struct hsm_extent he; 771 | __u64 file_offset = hai->hai_extent.offset; 772 | struct stat dst_st; 773 | __u64 write_total = 0; 774 | __u64 length = hai->hai_extent.length; 775 | time_t last_report_time; 776 | time_t now; 777 | int rc = 0; 778 | double start_ct_now = ct_now(); 779 | 780 | // Restore a file from the object store back to Lustre 781 | 782 | CT_TRACE("Restoring %s to %s", src, dst); 783 | if (fstat(dst_fd, &dst_st) < 0) { 784 | rc = -errno; 785 | CT_ERROR(rc, "cannot stat '%s'", dst); 786 | return rc; 787 | } 788 | 789 | if (!S_ISREG(dst_st.st_mode)) { 790 | rc = -EINVAL; 791 | CT_ERROR(rc, "'%s' is not a regular file", dst); 792 | return rc; 793 | } 794 | 795 | he.offset = file_offset; 796 | he.length = 0; 797 | rc = llapi_hsm_action_progress(hcp, &he, length, 0); 798 | if (rc < 0) { 799 | /* Action has been canceled or something wrong 800 | * is happening. Stop copying data. 
*/ 801 | CT_ERROR(rc, "progress ioctl for copy '%s'->'%s' failed", 802 | src, dst); 803 | goto out; 804 | } 805 | 806 | errno = 0; 807 | 808 | last_report_time = time(NULL); 809 | 810 | long long int object_chunk_size = chunk_size; // will be assigned the correct value based on the metadata 811 | 812 | do { 813 | // Downloading from the object store 814 | 815 | char src_chunk_s[S3_MAX_KEY_SIZE]; 816 | 817 | S3GetObjectHandler getObjectHandler = 818 | { 819 | getResponseHandler, 820 | &getObjectDataCallback 821 | }; 822 | 823 | if (length == -1) { 824 | // Discover length and chunk size from the first object's metadata 825 | snprintf(src_chunk_s, sizeof(src_chunk_s), "%s.0", src); 826 | if (file_offset == 0) { 827 | // Download data and metadata from the first chunk 828 | get_object_callback_data data; 829 | rc = get_s3_object(src_chunk_s, &data, &getObjectHandler); 830 | if(rc < 0){ 831 | goto out; 832 | } 833 | 834 | length = data.totalLength; 835 | object_chunk_size = data.chunk_size; 836 | 837 | char *uncompress_buf = NULL; 838 | uncompress_buf = malloc(object_chunk_size); 839 | if (uncompress_buf == NULL) { 840 | rc = -ENOMEM; 841 | goto out; 842 | } 843 | 844 | double before_decompression = ct_now(); 845 | int decompressed_size = LZ4_decompress_safe(data.buffer, uncompress_buf, data.contentLength, object_chunk_size); 846 | if (decompressed_size < 0) { 847 | rc = -1; 848 | CT_ERROR(rc, "Decompression error"); 849 | goto out; 850 | } 851 | CT_TRACE("Decompressing a chunk from %s of %llu bytes took %fs and the uncompressed size is %i bytes", 852 | src, data.contentLength, ct_now() - before_decompression, decompressed_size); 853 | 854 | double before_lustre_write = ct_now(); 855 | pwrite(dst_fd, uncompress_buf, decompressed_size, file_offset); 856 | CT_TRACE("Writing a chunk from %s of %llu bytes offset %llu to lustre took %fs", 857 | src_chunk_s, object_chunk_size, file_offset, ct_now() - before_lustre_write); 858 | 859 | if (uncompress_buf != NULL) 860 | 
free(uncompress_buf); 861 | if (data.buffer != NULL) 862 | free(data.buffer); 863 | 864 | write_total = decompressed_size; 865 | file_offset += decompressed_size; 866 | 867 | he.offset = file_offset; 868 | he.length = data.contentLength; 869 | rc = llapi_hsm_action_progress(hcp, &he, length, 0); 870 | if (rc < 0) { 871 | /* Action has been canceled or something wrong 872 | * is happening. Stop copying data. */ 873 | CT_ERROR(rc, "progress ioctl for copy '%s'->'%s' failed", 874 | src, dst); 875 | goto out; 876 | } 877 | 878 | if (write_total == length) { 879 | // Completed the full write with the first object 880 | rc = 0; 881 | break; 882 | } 883 | } 884 | else { 885 | // Only make a head request to get the metadata of the first object 886 | get_object_callback_data data; 887 | 888 | char bucket_name[S3_MAX_BUCKET_NAME_SIZE]; 889 | getBucketName(sizeof(bucket_name), bucket_name, src_chunk_s); 890 | 891 | // Get a local copy of the general bucketContext than overwrite the 892 | // pointer to the bucket_name 893 | S3BucketContext localbucketContext; 894 | memcpy(&localbucketContext, &bucketContext, sizeof(S3BucketContext)); 895 | localbucketContext.bucketName = bucket_name; 896 | 897 | int retry_count = RETRYCOUNT; 898 | do { 899 | S3_head_object(&localbucketContext, src_chunk_s, NULL, &headResponseHandler, &data); 900 | } while (S3_status_is_retryable(data.status) && should_retry(&retry_count)); 901 | 902 | if (data.status != S3StatusOK) { 903 | rc = -EIO; 904 | CT_ERROR(rc, "S3Error %s", S3_get_status_name(data.status)); 905 | goto out; 906 | } 907 | object_chunk_size = data.chunk_size; 908 | length = data.totalLength; 909 | } 910 | } 911 | else { 912 | snprintf(src_chunk_s, sizeof(src_chunk_s), "%s.%llu", src, file_offset / object_chunk_size); 913 | 914 | long long unsigned int chunk; 915 | if (length - write_total > object_chunk_size) { 916 | // upper bound is the chunk_size 917 | chunk = object_chunk_size; 918 | } 919 | else { 920 | // limited by the file 921 | 
chunk = length - write_total; 922 | } 923 | 924 | get_object_callback_data data; 925 | rc = get_s3_object(src_chunk_s, &data, &getObjectHandler); 926 | if(rc < 0){ 927 | goto out; 928 | } 929 | 930 | char *uncompress_buf = NULL; 931 | uncompress_buf = malloc(object_chunk_size); 932 | if (uncompress_buf == NULL) { 933 | rc = -ENOMEM; 934 | goto out; 935 | } 936 | 937 | double before_decompression = ct_now(); 938 | int decompressed_size = LZ4_decompress_safe(data.buffer, uncompress_buf, data.contentLength, object_chunk_size); 939 | if (decompressed_size < 0) { 940 | rc = -1; 941 | CT_ERROR(rc, "Decompression error"); 942 | goto out; 943 | } 944 | CT_TRACE("Decompressing a chunk from %s of %llu bytes took %fs and the uncompressed size is %i", 945 | src, data.contentLength, ct_now() - before_decompression, decompressed_size); 946 | 947 | double before_lustre_write = ct_now(); 948 | pwrite(dst_fd, uncompress_buf, decompressed_size, file_offset); 949 | CT_TRACE("Writing a chunk from %s of %llu bytes offset %llu to lustre took %fs", 950 | src_chunk_s, chunk, file_offset, ct_now() - before_lustre_write); 951 | 952 | if (uncompress_buf != NULL) 953 | free(uncompress_buf); 954 | if (data.buffer != NULL) 955 | free(data.buffer); 956 | 957 | now = time(NULL); 958 | if (now >= last_report_time + ct_opt.o_report_int) { 959 | last_report_time = now; 960 | CT_TRACE("sending progress report for restoring %s", src); 961 | rc = llapi_hsm_action_progress(hcp, &he, length, 0); 962 | if (rc < 0) { 963 | /* Action has been canceled or something wrong 964 | * is happening. Stop copying data. 
*/ 965 | CT_ERROR(rc, "progress ioctl for copy '%s'->'%s' failed", 966 | src, dst); 967 | goto out; 968 | } 969 | } 970 | 971 | write_total += decompressed_size; 972 | file_offset += decompressed_size; 973 | } 974 | rc = 0; 975 | } while (file_offset < length); 976 | 977 | if (hai->hai_action == HSMA_RESTORE) { 978 | /* 979 | * truncate restored file 980 | * size is taken from the archive this is done to support 981 | * restore after a force release which leaves the file with the 982 | * wrong size (can big bigger than the new size) 983 | * make sure the file is on disk before reporting success. 984 | */ 985 | rc = ftruncate(dst_fd, length); 986 | if (rc < 0) { 987 | rc = -errno; 988 | CT_ERROR(rc, "cannot truncate '%s' to size %llu", 989 | dst, length); 990 | err_major++; 991 | } 992 | } 993 | 994 | out: 995 | CT_TRACE("copied "LPU64" bytes in %f seconds", 996 | length, ct_now() - start_ct_now); 997 | 998 | return rc; 999 | } 1000 | 1001 | int ct_archive(const struct hsm_action_item *hai, const long hal_flags) 1002 | { 1003 | struct hsm_copyaction_private *hcp = NULL; 1004 | char src[PATH_MAX]; 1005 | char dst[PATH_MAX] = ""; 1006 | int rc; 1007 | int rcf = 0; 1008 | int hp_flags = 0; 1009 | int open_flags; 1010 | int src_fd = -1; 1011 | 1012 | rc = ct_begin(&hcp, hai); 1013 | if (rc < 0) 1014 | goto end_ct_archive; 1015 | 1016 | /* we fill archive so: 1017 | * source = data FID 1018 | * destination = lustre FID 1019 | */ 1020 | ct_path_lustre(src, sizeof(src), ct_opt.o_mnt, &hai->hai_dfid); 1021 | ct_path_archive(dst, sizeof(dst), &hai->hai_fid); 1022 | 1023 | CT_TRACE("archiving '%s' to '%s'", src, dst); 1024 | 1025 | if (ct_opt.o_dry_run) { 1026 | rc = 0; 1027 | goto end_ct_archive; 1028 | } 1029 | 1030 | src_fd = llapi_hsm_action_get_fd(hcp); 1031 | if (src_fd < 0) { 1032 | rc = src_fd; 1033 | CT_ERROR(rc, "cannot open '%s' for read", src); 1034 | goto end_ct_archive; 1035 | } 1036 | 1037 | open_flags = O_WRONLY | O_NOFOLLOW; 1038 | /* If extent is specified, 
don't truncate an old archived copy */ 1039 | open_flags |= ((hai->hai_extent.length == -1) ? O_TRUNC : 0) | O_CREAT; 1040 | 1041 | rc = ct_archive_data(hcp, src, dst, src_fd, hai, hal_flags); 1042 | if (rc < 0) { 1043 | CT_ERROR(rc, "data copy failed from '%s' to '%s'", src, dst); 1044 | goto end_ct_archive; 1045 | } 1046 | 1047 | CT_TRACE("data archiving for '%s' to '%s' done", src, dst); 1048 | 1049 | end_ct_archive: 1050 | err_major++; 1051 | 1052 | unlink(dst); 1053 | if (ct_is_retryable(rc)) 1054 | hp_flags |= HP_FLAG_RETRY; 1055 | 1056 | rcf = rc; 1057 | 1058 | if (!(src_fd < 0)) 1059 | close(src_fd); 1060 | 1061 | rc = ct_action_done(&hcp, hai, hp_flags, rcf); 1062 | 1063 | return rc; 1064 | } 1065 | 1066 | int ct_restore(const struct hsm_action_item *hai, const long hal_flags) 1067 | { 1068 | struct hsm_copyaction_private *hcp = NULL; 1069 | struct lu_fid dfid; 1070 | char src[PATH_MAX]; 1071 | char dst[PATH_MAX]; 1072 | int rc; 1073 | int hp_flags = 0; 1074 | int dst_fd = -1; 1075 | int mdt_index = -1; 1076 | int open_flags = 0; 1077 | /* we fill lustre so: 1078 | * source = lustre FID in the backend 1079 | * destination = data FID = volatile file 1080 | */ 1081 | 1082 | /* build backend file name from released file FID */ 1083 | ct_path_archive(src, sizeof(src), &hai->hai_fid); 1084 | 1085 | rc = llapi_get_mdt_index_by_fid(ct_opt.o_mnt_fd, &hai->hai_fid, 1086 | &mdt_index); 1087 | if (rc < 0) { 1088 | CT_ERROR(rc, "cannot get mdt index "DFID"", 1089 | PFID(&hai->hai_fid)); 1090 | return rc; 1091 | } 1092 | 1093 | rc = ct_begin_restore(&hcp, hai, mdt_index, open_flags); 1094 | if (rc < 0) 1095 | goto end_ct_restore; 1096 | 1097 | /* get the FID of the volatile file */ 1098 | rc = llapi_hsm_action_get_dfid(hcp, &dfid); 1099 | if (rc < 0) { 1100 | CT_ERROR(rc, "restoring "DFID 1101 | ", cannot get FID of created volatile file", 1102 | PFID(&hai->hai_fid)); 1103 | goto end_ct_restore; 1104 | } 1105 | 1106 | /* build volatile "file name", for messages */ 1107 
| snprintf(dst, sizeof(dst), "{VOLATILE}="DFID, PFID(&dfid)); 1108 | 1109 | CT_TRACE("restoring data from '%s' to '%s'", src, dst); 1110 | 1111 | if (ct_opt.o_dry_run) { 1112 | rc = 0; 1113 | goto end_ct_restore; 1114 | } 1115 | 1116 | dst_fd = llapi_hsm_action_get_fd(hcp); 1117 | if (dst_fd < 0) { 1118 | rc = dst_fd; 1119 | CT_ERROR(rc, "cannot open '%s' for write", dst); 1120 | goto end_ct_restore; 1121 | } 1122 | 1123 | rc = ct_restore_data(hcp, src, dst, dst_fd, hai, hal_flags); 1124 | if (rc < 0) { 1125 | CT_ERROR(rc, "cannot copy data from '%s' to '%s'", 1126 | src, dst); 1127 | err_major++; 1128 | if (ct_is_retryable(rc)) 1129 | hp_flags |= HP_FLAG_RETRY; 1130 | goto end_ct_restore; 1131 | } 1132 | 1133 | CT_TRACE("data restore from '%s' to '%s' done", src, dst); 1134 | 1135 | end_ct_restore: 1136 | rc = ct_action_done(&hcp, hai, hp_flags, rc); 1137 | 1138 | /* object swaping is done by cdt at copy end, so close of volatile file 1139 | * cannot be done before */ 1140 | 1141 | if (!(dst_fd < 0)) 1142 | close(dst_fd); 1143 | 1144 | return rc; 1145 | } 1146 | 1147 | int ct_remove(const struct hsm_action_item *hai, const long hal_flags) 1148 | { 1149 | struct hsm_copyaction_private *hcp = NULL; 1150 | char dst[PATH_MAX]; 1151 | int rc; 1152 | int retry_count; 1153 | char dst_s[S3_MAX_KEY_SIZE]; 1154 | 1155 | rc = ct_begin(&hcp, hai); 1156 | if (rc < 0) 1157 | goto end_ct_remove; 1158 | 1159 | ct_path_archive(dst, sizeof(dst), &hai->hai_fid); 1160 | 1161 | CT_TRACE("removing file '%s'", dst); 1162 | 1163 | if (ct_opt.o_dry_run) { 1164 | rc = 0; 1165 | goto end_ct_remove; 1166 | } 1167 | 1168 | // Get the metadata from the first object to get the number of chunks 1169 | get_object_callback_data data; 1170 | 1171 | snprintf(dst_s, sizeof(dst_s), "%s.0", dst); 1172 | 1173 | char bucket_name[S3_MAX_BUCKET_NAME_SIZE]; 1174 | getBucketName(sizeof(bucket_name), bucket_name, dst_s); 1175 | 1176 | // Get a local copy of the general bucketContext than overwrite the 1177 | 
// pointer to the bucket_name 1178 | S3BucketContext localbucketContext; 1179 | memcpy(&localbucketContext, &bucketContext, sizeof(S3BucketContext)); 1180 | localbucketContext.bucketName = bucket_name; 1181 | 1182 | retry_count = RETRYCOUNT; 1183 | do { 1184 | S3_head_object(&localbucketContext, dst_s, NULL, &headResponseHandler, &data); 1185 | } while (S3_status_is_retryable(data.status) && should_retry(&retry_count)); 1186 | 1187 | if (data.status != S3StatusOK) { 1188 | rc = -EIO; 1189 | CT_ERROR(rc, "S3Error %s", S3_get_status_name(data.status)); 1190 | goto end_ct_remove; 1191 | } 1192 | 1193 | int chunk; 1194 | for (chunk = data.totalLength / data.chunk_size; chunk >= 0; chunk--) { 1195 | snprintf(dst_s, sizeof(dst_s), "%s.%i", dst, chunk); 1196 | get_object_callback_data delete_data; 1197 | 1198 | CT_TRACE("Deleting chunk '%s'", dst_s); 1199 | 1200 | char bucket_name[S3_MAX_BUCKET_NAME_SIZE]; 1201 | getBucketName(sizeof(bucket_name), bucket_name, dst_s); 1202 | 1203 | // Get a local copy of the general bucketContext than overwrite the 1204 | // pointer to the bucket_name 1205 | S3BucketContext localbucketContext; 1206 | memcpy(&localbucketContext, &bucketContext, sizeof(S3BucketContext)); 1207 | localbucketContext.bucketName = bucket_name; 1208 | 1209 | retry_count = RETRYCOUNT; 1210 | do { 1211 | S3_delete_object(&localbucketContext, dst_s, NULL, &deleteResponseHandler, &delete_data); 1212 | } while (S3_status_is_retryable(delete_data.status) && should_retry(&retry_count)); 1213 | 1214 | if (delete_data.status != S3StatusOK) { 1215 | rc = -EIO; 1216 | CT_ERROR(rc, "S3Error %s", S3_get_status_name(delete_data.status)); 1217 | goto end_ct_remove; 1218 | } 1219 | } 1220 | rc = 0; 1221 | 1222 | end_ct_remove: 1223 | rc = ct_action_done(&hcp, hai, 0, rc); 1224 | 1225 | return rc; 1226 | } 1227 | 1228 | int ct_cancel(const struct hsm_action_item *hai, const long hal_flags) 1229 | { 1230 | CT_TRACE("cancel not implemented for file system '%s'", ct_opt.o_mnt); 1231 
| /* Don't report progress to coordinator for this cookie: 1232 | * the copy function will get ECANCELED when reporting 1233 | * progress. */ 1234 | return 0; 1235 | } 1236 | 1237 | static int ct_s3_cleanup(void) 1238 | { 1239 | int rc = 0; 1240 | 1241 | rc = ct_cleanup(); 1242 | if (rc == 0) { 1243 | S3_deinitialize(); 1244 | } 1245 | 1246 | return rc; 1247 | } 1248 | 1249 | int main(int argc, char **argv) 1250 | { 1251 | int rc; 1252 | 1253 | strlcpy(cmd_name, basename(argv[0]), sizeof(cmd_name)); 1254 | rc = ct_parseopts(argc, argv); 1255 | if (rc < 0) { 1256 | CT_WARN("try '%s --help' for more information", cmd_name); 1257 | return -rc; 1258 | } 1259 | 1260 | rc = ct_setup(); 1261 | if (rc < 0) 1262 | goto error_cleanup; 1263 | 1264 | rc = S3_initialize(NULL, S3_INIT_ALL, host); 1265 | if(rc != 0){ 1266 | CT_ERROR(rc, "Error in S3 init"); 1267 | goto error_cleanup; 1268 | } 1269 | 1270 | rc = ct_run(); 1271 | 1272 | error_cleanup: 1273 | ct_s3_cleanup(); 1274 | 1275 | return -rc; 1276 | } 1277 | 1278 | --------------------------------------------------------------------------------