├── memscrimper_cpp_implementation ├── performance_graphs │ ├── mscr_compr_with_cacheflush.png │ ├── mscr_compr_without_cacheflush.png │ ├── mscr_decompr_with_cacheflush.png │ ├── mscr_decompr_without_cacheflush.png │ ├── mscr_compr_cache_measurement_combined.png │ └── mscr_decompr_cache_measurement_combined.png ├── mscr_client.py ├── src │ ├── interdedup_decompress.h │ ├── interdedup_compress.h │ ├── memdump.h │ ├── socket_api.h │ ├── utils.h │ ├── request_handler.h │ ├── memdump.cpp │ ├── utils.cpp │ ├── request_handler.cpp │ ├── socket_api.cpp │ ├── memscrimper.cpp │ ├── interdedup_decompress.cpp │ └── interdedup_compress.cpp ├── LICENSE ├── CMakeLists.txt ├── readme.md ├── mscr_client_python2.py └── mscr_client_python3.py ├── LICENSE ├── README.md └── memscrimper_poc ├── compress_7zip.py ├── compress_gzip.py ├── compress_bzip2.py ├── compress_intradedup.py ├── util.py └── compress_interdedup.py /memscrimper_cpp_implementation/performance_graphs/mscr_compr_with_cacheflush.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbrengel/memscrimper/HEAD/memscrimper_cpp_implementation/performance_graphs/mscr_compr_with_cacheflush.png -------------------------------------------------------------------------------- /memscrimper_cpp_implementation/performance_graphs/mscr_compr_without_cacheflush.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbrengel/memscrimper/HEAD/memscrimper_cpp_implementation/performance_graphs/mscr_compr_without_cacheflush.png -------------------------------------------------------------------------------- /memscrimper_cpp_implementation/performance_graphs/mscr_decompr_with_cacheflush.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbrengel/memscrimper/HEAD/memscrimper_cpp_implementation/performance_graphs/mscr_decompr_with_cacheflush.png 
-------------------------------------------------------------------------------- /memscrimper_cpp_implementation/performance_graphs/mscr_decompr_without_cacheflush.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbrengel/memscrimper/HEAD/memscrimper_cpp_implementation/performance_graphs/mscr_decompr_without_cacheflush.png -------------------------------------------------------------------------------- /memscrimper_cpp_implementation/performance_graphs/mscr_compr_cache_measurement_combined.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbrengel/memscrimper/HEAD/memscrimper_cpp_implementation/performance_graphs/mscr_compr_cache_measurement_combined.png -------------------------------------------------------------------------------- /memscrimper_cpp_implementation/performance_graphs/mscr_decompr_cache_measurement_combined.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbrengel/memscrimper/HEAD/memscrimper_cpp_implementation/performance_graphs/mscr_decompr_cache_measurement_combined.png -------------------------------------------------------------------------------- /memscrimper_cpp_implementation/mscr_client.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # Copyright [2019] 3 | 4 | import sys 5 | 6 | # check running python version and load corresponding mscr_client version 7 | if sys.version_info.major == 3: 8 | from mscr_client_python3 import * 9 | else: 10 | from mscr_client_python2 import * 11 | 12 | -------------------------------------------------------------------------------- /memscrimper_cpp_implementation/src/interdedup_decompress.h: -------------------------------------------------------------------------------- 1 | // Copyright [2019] 2 | 3 | #ifndef INTERDEDUP_DECOMPRESS_H_ 4 | #define INTERDEDUP_DECOMPRESS_H_ 5 | 6 | #include "./request_handler.h" 7 | 8 | namespace mscr { 9 | 10 | void interdedup_decompress(request_handler *handler, const char *filename_in, 11 | const char *filename_out); 12 | 13 | } // namespace mscr 14 | 15 | #endif // INTERDEDUP_DECOMPRESS_H_ 16 | -------------------------------------------------------------------------------- /memscrimper_cpp_implementation/src/interdedup_compress.h: -------------------------------------------------------------------------------- 1 | // Copyright {2019] 2 | 3 | #ifndef INTERDEDUP_COMPRESS_H_ 4 | #define INTERDEDUP_COMPRESS_H_ 5 | 6 | #include 7 | 8 | #include "memdump.h" 9 | 10 | namespace mscr { 11 | 12 | enum class compression {ZIP7, GZIP, BZIP2, NOINNER}; 13 | 14 | void interdedup_compress(std::shared_ptr ref, const memdump &srcdump, 15 | const char *filename, compression inner, bool diffing, 16 | bool intra); 17 | 18 | } // namespace mscr 19 | 20 | #endif // INTERDEDUP_COMPRESS_H_ 21 | -------------------------------------------------------------------------------- /memscrimper_cpp_implementation/src/memdump.h: -------------------------------------------------------------------------------- 1 | // Copyright [2019] 2 | 3 | #ifndef MEMDUMP_H_ 4 | #define MEMDUMP_H_ 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | namespace mscr { 12 | 13 | class memdump { 14 | public: 15 | explicit memdump(const std::string 
&name); 16 | std::string getPath() const; 17 | const std::unordered_map> * getPages() const; 18 | std::unordered_map * getNumToPage(); 19 | int readDumpfile(uint32_t pagesize); 20 | 21 | private: 22 | std::mutex mu_; 23 | std::string path_; 24 | std::unordered_map> page_map_; 25 | std::unordered_map num_to_page_; 26 | }; 27 | 28 | } // namespace mscr 29 | 30 | #endif // MEMDUMP_H_ 31 | -------------------------------------------------------------------------------- /memscrimper_cpp_implementation/src/socket_api.h: -------------------------------------------------------------------------------- 1 | // Copyright 2018 2 | 3 | #ifndef SOCKET_API_H_ 4 | #define SOCKET_API_H_ 5 | 6 | #include 7 | #include 8 | 9 | #include "request_handler.h" 10 | 11 | namespace mscr { 12 | 13 | class command_socket { 14 | public: 15 | explicit command_socket(const std::string &sock_path, request_handler* handler); 16 | ~command_socket(); 17 | void start_listen(); 18 | 19 | private: 20 | static bool shutdown_; 21 | request_handler* handler_; 22 | std::string sock_path_; 23 | int srv_sock_; 24 | struct sockaddr_un srv_addr_; 25 | std::unordered_set open_socks_; 26 | std::unordered_map timeout_counters_; 27 | static void sig_handler(int _); 28 | void handle_client_connection(int cl_sock, int epfd, struct epoll_event* ev); 29 | }; 30 | 31 | } // namespace mscr 32 | 33 | #endif // SOCKET_API_H_ 34 | -------------------------------------------------------------------------------- /memscrimper_cpp_implementation/src/utils.h: -------------------------------------------------------------------------------- 1 | // Copyright [2019] 2 | 3 | #ifndef UTILS_H_ 4 | #define UTILS_H_ 5 | 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | namespace mscr { 14 | 15 | std::vector int_to_byte_BE(uint32_t number, int bytelen); 16 | std::string read_string(std::istream *file); 17 | uint64_t read_num_LE(std::istream *file, int length); 18 | uint64_t read_num_LE(const char 
*array_begin, int length); 19 | bool str_starts_with(const std::string &str, std::string prefix); 20 | uint64_t get_filesize(const char *filename); 21 | 22 | template 23 | std::vector int_to_byte_LE(T number, int bytelen) { 24 | std::vector result(bytelen); 25 | for (int i = 0; i < bytelen; ++i) { 26 | result[i] = number >> (i * 8); 27 | } 28 | return result; 29 | } 30 | 31 | } // namespace mscr 32 | 33 | #endif // UTILS_H 34 | -------------------------------------------------------------------------------- /memscrimper_cpp_implementation/src/request_handler.h: -------------------------------------------------------------------------------- 1 | // Copyright [2019] 2 | 3 | #ifndef REQUEST_HANDLER_H_ 4 | #define REQUEST_HANDLER_H_ 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | 14 | #include "memdump.h" 15 | 16 | namespace mscr { 17 | 18 | class request_handler { 19 | public: 20 | explicit request_handler(uint32_t thread_count); 21 | ~request_handler(); 22 | int handle_request(std::string msg); 23 | std::shared_ptr get_refdump(std::string path, uint32_t pagesize); 24 | 25 | private: 26 | std::vector> refdumps_; 27 | std::recursive_mutex mu_; 28 | boost::asio::thread_pool worker_threads_; 29 | int add_reference(std::string msg); 30 | int del_reference(std::string msg); 31 | int compress_dump(std::string msg); 32 | int decompress_dump(std::string msg); 33 | int find_refdump(const std::string &path); 34 | }; 35 | 36 | } // namespace mscr 37 | 38 | #endif // REQUEST_HANDLER_H 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2018, Michael Brengel 4 | All rights reserved. 
5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /memscrimper_cpp_implementation/LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2019, Michael Brengel, Daniel Weber 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 
11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | -------------------------------------------------------------------------------- /memscrimper_cpp_implementation/src/memdump.cpp: -------------------------------------------------------------------------------- 1 | // Copyright [2019] 2 | 3 | #include "memdump.h" 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | 11 | namespace mscr { 12 | 13 | memdump::memdump(const std::string &path) : path_(path) {} 14 | 15 | 16 | std::string memdump::getPath() const { 17 | return this->path_; 18 | } 19 | 20 | 21 | const std::unordered_map>* 22 | memdump::getPages() const { 23 | return &(this->page_map_); 24 | } 25 | 26 | 27 | std::unordered_map* memdump::getNumToPage() { 28 | // num_to_page map is evaluated lazily on demand to save memory 29 | 30 | std::lock_guard m_lock(this->mu_); 31 | if (this->num_to_page_.empty()) { 32 | for (auto key_value : this->page_map_) { 33 | for (uint32_t pagenum : key_value.second) { 34 | this->num_to_page_[pagenum] = key_value.first; 35 | } 36 | } 37 | } 38 | return &this->num_to_page_; 39 | } 40 | 41 | 42 | int memdump::readDumpfile(uint32_t pagesize) { 43 | // open file 44 | std::ifstream file(this->path_.c_str(), std::ios::binary); 45 | 46 | if (file.fail()) { 47 | BOOST_LOG_TRIVIAL(error) << "Could not open " << this->path_; 48 | return 1; 49 | } 50 | 51 | // read file page by page 52 | uint32_t pagenr = 0; 53 | auto page_content = new char[pagesize]; 54 | while (file.read(page_content, pagesize)) { 55 | std::string page_content_str(page_content, page_content + pagesize); 56 | (this->page_map_[std::move(page_content_str)]).insert(pagenr); 57 | pagenr++; 58 | } 59 | 60 | // check if we consumed the whole file 61 | if (!file.eof()) { 62 | BOOST_LOG_TRIVIAL(error) << "reading file failed (did not reach eof)"; 63 | delete[](page_content); 64 | return 1; 65 | } 66 | 67 | // clean up 68 | delete[](page_content); 69 | BOOST_LOG_TRIVIAL(debug) << "finished reading " << this->path_; 70 | 71 | return 0; 72 | } 73 | 74 | } // namespace mscr 75 
| -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MemScrimper 2 | This repository contains the code for the DIMVA 2018 paper: "MemScrimper: Time- and Space-Efficient Storage of Malware Sandbox Memory Dumps". Please note that in its current state, the code is a PoC and not a full-fledged production-ready application. 3 | 4 | # Abstract 5 | MemScrimper is a novel methodology to compress memory dumps of malware sandboxes. MemScrimper is built on the observation that sandboxes always start at the same system state (i.e., a sandbox snapshot) to analyze malware. Therefore, memory dumps taken after malware execution inside the same sandbox are substantially similar to each other, which we can use to only store the differences introduced by the malware itself. Technically, we compare the pages of those memory dumps against the pages of a reference memory dump taken from the same sandbox and then deduplicate identical or similar pages accordingly. MemScrimper increases data compression ratios by up to 3894.74% compared to standard compression utilities such as `7zip`, and reduces compression and decompression times by up to 72.48% and 41.44%, respectively. Furthermore, MemScrimper's internal storage allows analyses (e.g., signature matching) to be performed on compressed memory dumps more efficiently than on uncompressed dumps. MemScrimper thus significantly increases the retention time of memory dumps and makes longitudinal analysis more viable, while also improving efficiency. 6 | 7 | # Paper 8 | The paper is available [here](https://christian-rossow.de/publications/memscrimper-dimva2018.pdf). 
You can cite it with the following BibTeX entry: 9 | ``` 10 | @inproceedings{MemScrimper, 11 | author = {Michael Brengel and Christian Rossow}, 12 | title = {{\textsc{MemScrimper}: Time- and Space-Efficient Storage of Malware Sandbox Memory Dumps}}, 13 | booktitle = {Proceedings of the Conference on Detection of Intrusions and Malware, and Vulnerability Assessment~(DIMVA)}, 14 | year = {2018} 15 | } 16 | ``` 17 | 18 | # Interested in more of our research? 19 | [Come visit us](http://syssec.mmci.uni-saarland.de/). 20 | -------------------------------------------------------------------------------- /memscrimper_cpp_implementation/src/utils.cpp: -------------------------------------------------------------------------------- 1 | // Copyright [2019] 2 | 3 | #include "utils.h" 4 | 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include 14 | 15 | 16 | namespace mscr { 17 | 18 | std::vector int_to_byte_BE(uint32_t number, int bytelen) { 19 | std::vector result(bytelen); 20 | 21 | for (int i = 0; i < bytelen; ++i) { 22 | result[bytelen - i - 1] = number >> (i * 8); 23 | } 24 | return result; 25 | } 26 | 27 | 28 | uint64_t read_num_LE(std::istream *file, int length) { 29 | assert(length <= 8); 30 | char c; 31 | uint64_t number = 0; 32 | 33 | // read byte by byte 34 | for (int i = 0; i < length; i++) { 35 | file->read(&c, 1); 36 | // make sure to first cast to uint8_t because of sign-extension issues 37 | number += static_cast(static_cast(c)) << (8 * i); 38 | } 39 | return number; 40 | } 41 | 42 | 43 | uint64_t read_num_LE(const char *array_begin, int length) { 44 | assert(length <= 8); 45 | char c; 46 | uint64_t number = 0; 47 | 48 | // go through the array byte by byte 49 | for (int i = 0; i < length; i++) { 50 | c = array_begin[i]; 51 | 52 | // make sure to first cast to uint8_t because of sign-extension issues 53 | number += static_cast(static_cast(c)) << (8 * i); 54 | } 55 | return number; 56 | } 57 | 58 | 59 | std::string 
read_string(std::istream *file) { 60 | char c; 61 | std::string result; 62 | file->read(&c, 1); 63 | while (c != '\0') { 64 | result += c; 65 | file->read(&c, 1); 66 | } 67 | return result; 68 | } 69 | 70 | 71 | bool str_starts_with(const std::string &str, std::string prefix) { 72 | return !str.compare(0, prefix.size(), prefix); 73 | } 74 | 75 | 76 | uint64_t get_filesize(const char *filename) { 77 | std::ifstream in(filename, std::ifstream::binary | std::ifstream::ate); 78 | 79 | if (in.fail()) { 80 | return 0; 81 | } 82 | return static_cast(in.tellg()); 83 | } 84 | 85 | } // namespace mscr 86 | -------------------------------------------------------------------------------- /memscrimper_cpp_implementation/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | project(memscrimper) 3 | 4 | if(STATIC_BUILD) 5 | message("-- Building statically-linked binary") 6 | set(Boost_USE_STATIC_LIBS ON) 7 | endif() 8 | if(NOT STATIC_BUILD) 9 | message("-- Building dynamically-linked binary") 10 | endif() 11 | 12 | find_package(ZLIB REQUIRED) 13 | find_package(BZip2 REQUIRED) 14 | find_package(Threads REQUIRED) 15 | find_package(Boost COMPONENTS system log log_setup thread iostreams REQUIRED) 16 | 17 | set(default_build_type "Release") 18 | # check if build type was specified 19 | if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) 20 | message(STATUS "Setting build type to '${default_build_type}' as none was specified.") 21 | # overwrite empty build type 22 | set(CMAKE_BUILD_TYPE "${default_build_type}") 23 | endif() 24 | 25 | set(CMAKE_CXX_STANDARD 11) 26 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 27 | set(CMAKE_CXX_EXTENSIONS OFF) 28 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Werror -Wextra -pedantic") 29 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -DDEBUGMODE=0") 30 | set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -DDEBUGMODE=1") 31 | 
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DDEBUGMODE=1") 32 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=address -fsanitize=undefined") 33 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize-recover=address") 34 | 35 | add_executable(memscrimper 36 | src/memscrimper.cpp src/memdump.cpp src/memdump.h src/interdedup_compress.cpp 37 | src/interdedup_compress.h src/utils.cpp src/utils.h src/interdedup_decompress.cpp 38 | src/interdedup_decompress.h src/socket_api.cpp src/socket_api.h src/request_handler.cpp 39 | src/request_handler.h) 40 | 41 | if(STATIC_BUILD) 42 | find_library(ZLIB_STATIC libz.a) 43 | find_library(BZIP2_STATIC libbz2.a) 44 | target_link_libraries(memscrimper -static) 45 | target_link_libraries(memscrimper ${Boost_LIBRARIES} ${BZIP2_STATIC} ${ZLIB_STATIC} 46 | ${CMAKE_THREAD_LIBS_INIT}) 47 | endif() 48 | if(NOT STATIC_BUILD) 49 | add_definitions("-DBOOST_LOG_DYN_LINK") 50 | target_link_libraries(memscrimper ${Boost_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT}) 51 | endif() 52 | -------------------------------------------------------------------------------- /memscrimper_poc/compress_7zip.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | # -*- coding: utf-8 -*- 3 | 4 | import util 5 | import sys 6 | import os 7 | import logging 8 | import gzip 9 | import struct 10 | import subprocess 11 | 12 | 13 | def compress(source, target, pagesize=4096): 14 | logging.debug("Starting compression of %s to %s", repr(source), repr(target)) 15 | logging.debug("Page size: %d", pagesize) 16 | size = os.path.getsize(source) 17 | with open(target, "wb") as ftarget: 18 | ftarget.write(util.create_header("7zip", size)) 19 | ftarget.flush() 20 | p = subprocess.Popen(["7za", "a", "-an", "-txz", "-mx=9", "-so", source], stdout=ftarget, stderr=subprocess.PIPE) 21 | p.communicate() 22 | logging.debug("Done") 23 | 24 | 25 | def decompress(source, target): 26 | 
logging.debug("Starting decompression of %s to %s", repr(source), repr(target)) 27 | with open(source, "rb") as fsource: 28 | logging.debug("Parsing header") 29 | magic, method, majorversion, minorversion, pagesize, uncompressed_size = util.parse_header(fsource) 30 | logging.debug(" Magic number: %s", repr(magic)) 31 | logging.debug(" Method: %s", repr(method)) 32 | logging.debug(" Major version number: %d", majorversion) 33 | logging.debug(" Minor version number: %d", minorversion) 34 | logging.debug(" Page size: %d", pagesize) 35 | logging.debug(" Uncompressed size: %d", uncompressed_size) 36 | fsource.flush() 37 | with open(target, "wb") as ftarget: 38 | p = subprocess.Popen(["7za", "x", "-an", "-txz", "-si", "-so"], stdin=fsource, stdout=ftarget, stderr=subprocess.PIPE) 39 | p.communicate() 40 | logging.debug("Done") 41 | 42 | 43 | def main(argv): 44 | # set up logging 45 | util.create_dir("logs") 46 | util.configure_logging("7zip", "logs/7zip.log") 47 | 48 | # check args 49 | if len(argv) != 4: 50 | print "Usage: {} ".format(argv[0]) 51 | return -1 52 | 53 | # check if first argument is valid 54 | if argv[1] != "c" and argv[1] != "d": 55 | logging.error("First argument %s should be 'c' or 'd'", repr(argv[1])) 56 | return -1 57 | 58 | # check if files do (not) exist 59 | source = argv[2] 60 | target = argv[3] 61 | if not os.path.isfile(source): 62 | logging.error("Source %s does not exist", repr(source)) 63 | return -1 64 | if os.path.isfile(target) and os.path.getsize(target) > 0: 65 | logging.error("Target %s already exists and is non-empty", repr(target)) 66 | return -1 67 | 68 | # compress/decompress 69 | if argv[1] == "c": 70 | compress(source, target) 71 | else: 72 | decompress(source, target) 73 | 74 | return 0 75 | 76 | if __name__ == "__main__": 77 | sys.exit(main(sys.argv)) 78 | -------------------------------------------------------------------------------- /memscrimper_poc/compress_gzip.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | # -*- coding: utf-8 -*- 3 | 4 | import util 5 | import sys 6 | import os 7 | import logging 8 | import gzip 9 | import struct 10 | 11 | 12 | def compress(source, target, pagesize=4096): 13 | logging.debug("Starting compression of %s to %s", repr(source), repr(target)) 14 | logging.debug("Page size: %d", pagesize) 15 | size = os.path.getsize(source) 16 | with open(target, "wb") as ftarget: 17 | ftarget.write(util.create_header("gzip", size)) 18 | with gzip.GzipFile(fileobj=ftarget, mode="wb", compresslevel=9) as ftarget: 19 | for i, page in enumerate(util.get_pages(source, pagesize=pagesize)): 20 | if i % 100 == 0 or (i+1) * pagesize == size: 21 | sys.stdout.write("\rProgress: {:.2f}%".format(float(i * pagesize) / size * 100)) 22 | sys.stdout.flush() 23 | ftarget.write(page) 24 | sys.stdout.write("\n") 25 | logging.debug("Done") 26 | 27 | 28 | def decompress(source, target): 29 | logging.debug("Starting decompression of %s to %s", repr(source), repr(target)) 30 | with open(source, "rb") as fsource: 31 | logging.debug("Parsing header") 32 | magic, method, majorversion, minorversion, pagesize, uncompressed_size = util.parse_header(fsource) 33 | logging.debug(" Magic number: %s", repr(magic)) 34 | logging.debug(" Method: %s", repr(method)) 35 | logging.debug(" Major version number: %d", majorversion) 36 | logging.debug(" Minor version number: %d", minorversion) 37 | logging.debug(" Page size: %d", pagesize) 38 | logging.debug(" Uncompressed size: %d", uncompressed_size) 39 | with open(target, "wb") as ftarget: 40 | curr_size = 0.0 41 | pagecnt = 0 42 | with gzip.GzipFile(fileobj=fsource, mode="rb", compresslevel=9) as fsource: 43 | while True: 44 | if pagecnt % 100 == 0 or curr_size == uncompressed_size: 45 | sys.stdout.write("\rProgress: {:.2f}%".format(curr_size / uncompressed_size * 100)) 46 | sys.stdout.flush() 47 | page = fsource.read(pagesize) 48 | if 
not page: 49 | break 50 | ftarget.write(page) 51 | curr_size += len(page) 52 | pagecnt += 1 53 | sys.stdout.write("\n") 54 | logging.debug("Done") 55 | 56 | 57 | def main(argv): 58 | # set up logging 59 | util.create_dir("logs") 60 | util.configure_logging("gzip", "logs/gzip.log") 61 | 62 | # check args 63 | if len(argv) != 4: 64 | print "Usage: {} ".format(argv[0]) 65 | return -1 66 | 67 | # check if first argument is valid 68 | if argv[1] != "c" and argv[1] != "d": 69 | logging.error("First argument %s should be 'c' or 'd'", repr(argv[1])) 70 | return -1 71 | 72 | # check if files do (not) exist 73 | source = argv[2] 74 | target = argv[3] 75 | if not os.path.isfile(source): 76 | logging.error("Source %s does not exist", repr(source)) 77 | return -1 78 | if os.path.isfile(target) and os.path.getsize(target) > 0: 79 | logging.error("Target %s already exists and is non-empty", repr(target)) 80 | return -1 81 | 82 | # compress/decompress 83 | if argv[1] == "c": 84 | compress(source, target) 85 | else: 86 | decompress(source, target) 87 | 88 | return 0 89 | 90 | if __name__ == "__main__": 91 | sys.exit(main(sys.argv)) 92 | -------------------------------------------------------------------------------- /memscrimper_poc/compress_bzip2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | # -*- coding: utf-8 -*- 3 | 4 | import util 5 | import sys 6 | import os 7 | import logging 8 | import struct 9 | 10 | # bz2 does not allow fileobjects in BZ2File 11 | import bz2file 12 | 13 | 14 | def compress(source, target, pagesize=4096): 15 | logging.debug("Starting compression of %s to %s", repr(source), repr(target)) 16 | logging.debug("Page size: %d", pagesize) 17 | size = os.path.getsize(source) 18 | with open(target, "wb") as ftarget: 19 | ftarget.write(util.create_header("bzip2", size)) 20 | with bz2file.BZ2File(filename=ftarget, mode="wb", compresslevel=9) as ftarget: 21 | for i, page in 
enumerate(util.get_pages(source, pagesize=pagesize)): 22 | if i % 100 == 0 or (i+1) * pagesize == size: 23 | sys.stdout.write("\rProgress: {:.2f}%".format(float(i * pagesize) / size * 100)) 24 | sys.stdout.flush() 25 | ftarget.write(page) 26 | sys.stdout.write("\n") 27 | logging.debug("Done") 28 | 29 | 30 | def decompress(source, target): 31 | logging.debug("Starting decompression of %s to %s", repr(source), repr(target)) 32 | with open(source, "rb") as fsource: 33 | logging.debug("Parsing header") 34 | magic, method, majorversion, minorversion, pagesize, uncompressed_size = util.parse_header(fsource) 35 | logging.debug(" Magic number: %s", repr(magic)) 36 | logging.debug(" Method: %s", repr(method)) 37 | logging.debug(" Major version number: %d", majorversion) 38 | logging.debug(" Minor version number: %d", minorversion) 39 | logging.debug(" Page size: %d", pagesize) 40 | logging.debug(" Uncompressed size: %d", uncompressed_size) 41 | with open(target, "wb") as ftarget: 42 | curr_size = 0.0 43 | pagecnt = 0 44 | with bz2file.BZ2File(filename=fsource, mode="rb", compresslevel=9) as fsource: 45 | while True: 46 | if pagecnt % 100 == 0 or curr_size == uncompressed_size: 47 | sys.stdout.write("\rProgress: {:.2f}%".format(curr_size / uncompressed_size * 100)) 48 | sys.stdout.flush() 49 | page = fsource.read(pagesize) 50 | if not page: 51 | break 52 | ftarget.write(page) 53 | curr_size += len(page) 54 | pagecnt += 1 55 | sys.stdout.write("\n") 56 | logging.debug("Done") 57 | 58 | 59 | def main(argv): 60 | # set up logging 61 | util.create_dir("logs") 62 | util.configure_logging("bzip2", "logs/bzip2.log") 63 | 64 | # check args 65 | if len(argv) != 4: 66 | print "Usage: {} ".format(argv[0]) 67 | return -1 68 | 69 | # check if first argument is valid 70 | if argv[1] != "c" and argv[1] != "d": 71 | logging.error("First argument %s should be 'c' or 'd'", repr(argv[1])) 72 | return -1 73 | 74 | # check if files do (not) exist 75 | source = argv[2] 76 | target = argv[3] 77 | 
if not os.path.isfile(source): 78 | logging.error("Source %s does not exist", repr(source)) 79 | return -1 80 | if os.path.isfile(target) and os.path.getsize(target) > 0: 81 | logging.error("Target %s already exists and is non-empty", repr(target)) 82 | return -1 83 | 84 | # compress/decompress 85 | if argv[1] == "c": 86 | compress(source, target) 87 | else: 88 | decompress(source, target) 89 | 90 | return 0 91 | 92 | if __name__ == "__main__": 93 | sys.exit(main(sys.argv)) 94 | -------------------------------------------------------------------------------- /memscrimper_cpp_implementation/readme.md: -------------------------------------------------------------------------------- 1 | # Memscrimper C++ Implementation 2 | 3 | ## Dependencies 4 | - Boost Installation 5 | - libbz2.so (libbz2.a for static build) 6 | - libz.so (libz.a for static build) 7 | 8 | ## Building 9 | ### Dynamically-Linked Version 10 | ```bash 11 | mkdir build_folder && cd build_folder 12 | cmake .. && make 13 | ``` 14 | ### Statically-Linked Version 15 | ```bash 16 | mkdir build_folder && cd build_folder 17 | cmake .. 
-DSTATIC_BUILD=ON && make 18 | ``` 19 | 20 | ## Example Usage 21 | Start the MemScrimper server as follows: 22 | ```bash 23 | ./memscrimper s /tmp/mscr.socket 24 | ``` 25 | Sending requests via Python module: 26 | ```python 27 | import mscr_client 28 | 29 | pagesize = 4096 30 | # creating client 31 | mscr = mscr_client.MscrClient("/tmp/mscr.socket", True) 32 | 33 | # adding reference dump (for faster compression) 34 | mscr.add_referencedump("path/to/refdump", pagesize) 35 | 36 | # send compression request 37 | mscr.compress_dump("path/to/refdump", "path/to/sourcedump", "dump.compress", 38 | pagesize, True, True, Compression.ZIP7) 39 | 40 | # send decompression request 41 | mscr.decompress_dump("dump.compress", "dump.uncompress") 42 | 43 | # remove reference dump from server memory (optional) 44 | mscr.del_referencedump("path/to/refdump") 45 | ``` 46 | 47 | ## Performance Comparison 48 | In the following, we compare the runtimes of the Python and C++ implementations of MemScrimper. Note that "Ref. preloaded" refers to preloading the corresponding reference dump. This can be done either by calling the client member function `add_referencedump` or by processing multiple memory dumps for the same reference dump, which is usually the case in production. 49 | 50 | Every test consisted of three different memory dumps which were compressed using different options. 
51 | 52 | All benchmark tests were performed on a computer using the following hardware: 53 | - CPU: Intel Core i5-6200u @ 2.30Ghz 54 | - Memory: 16 GB DDR4 2133 Mhz 55 | - Storage: SanDisk SD8SNAT-256G-1006 56 | 57 | ### Performance without Cache Flushing after every De-/Compression 58 | ![test](performance_graphs/mscr_compr_without_cacheflush.png) 59 | ![test](performance_graphs/mscr_decompr_without_cacheflush.png) 60 | #### Parameter Encoding 61 | 62 | ##### General Format: *X-Y-Z* 63 | 64 | | Position | 0 | 1 | 2 | 3 | 65 | |---------------|-----------------------------|---------------------------|--------------------------|----------------| 66 | | X | Intradeduplication not used | Intradeduplication used | / | / | 67 | | Y | Diffing not used | Diffing used | / | / | 68 | | Z | Inner Compression not used | 7zip as inner compression |gzip as inner compression | bzip2 as inner compression| 69 | 70 | ### Performance with explicit Cache Flushing after every De-/Compression 71 | ![test](performance_graphs/mscr_compr_with_cacheflush.png) 72 | ![test](performance_graphs/mscr_decompr_with_cacheflush.png) 73 | 74 | #### Parameter Encoding 75 | 76 | ##### General Format: *X-Y-Z* 77 | 78 | | Position | 0 | 1 | 2 | 3 | 79 | |---------------|-----------------------------|---------------------------|--------------------------|----------------| 80 | | X | Intradeduplication not used | Intradeduplication used | / | / | 81 | | Y | Diffing not used | Diffing used | / | / | 82 | | Z | Inner Compression not used | 7zip as inner compression |gzip as inner compression | bzip2 as inner compression| 83 | 84 | ### Performance Comparison between two times the same De-/Compression 85 | This test measures the impact of caches on our runtime. 
86 | ![test](performance_graphs/mscr_compr_cache_measurement_combined.png) 87 | ![test](performance_graphs/mscr_decompr_cache_measurement_combined.png) 88 | 89 | #### Parameter Encoding 90 | 91 | ##### General Format: *X-Y-Z* 92 | 93 | | Position | 0 | 1 | 2 | 3 | 94 | |---------------|-----------------------------|---------------------------|--------------------------|----------------| 95 | | X | Intradeduplication not used | Intradeduplication used | / | / | 96 | | Y | Diffing not used | Intradeduplication used | / | / | 97 | | Z | Inner Compression not used | 7zip as inner compression |gzip as inner compression | bzip2 as inner compression| 98 | 99 | -------------------------------------------------------------------------------- /memscrimper_poc/compress_intradedup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | # -*- coding: utf-8 -*- 3 | 4 | from collections import defaultdict as dd 5 | 6 | import argparse 7 | import gzip 8 | import util 9 | import sys 10 | import os 11 | import logging 12 | import shutil 13 | import struct 14 | import subprocess 15 | import tempfile 16 | 17 | import bz2file 18 | 19 | 20 | def compress(source, target, inner, pagesize=4096): 21 | # some info 22 | logging.debug("Starting compression of %s to %s", repr(source), repr(target)) 23 | logging.debug("Page size: %d", pagesize) 24 | 25 | # pages + page numbers bookkeeping 26 | pagenrs = dd(list) 27 | pages = [] 28 | for i, page in enumerate(util.get_pages(source, pagesize=pagesize)): 29 | pagenrs[page].append(i) 30 | pages.append(page) 31 | pages_set = set(pages) 32 | 33 | # remove pages which just occurr once from dictionary and intervalize values 34 | for page in pagenrs.keys(): 35 | if len(pagenrs[page]) == 1: 36 | del pagenrs[page] 37 | else: 38 | pagenrs[page] = util.intervalize(pagenrs[page]) 39 | 40 | # write file 41 | util.create_dir(".tmp") 42 | tmphandle, tmpfile = tempfile.mkstemp(dir=".tmp") 43 | try: 44 | 
with open(tmpfile, "wb") as ftmp: 45 | ftmp.write(struct.pack(" 0: 189 | logging.error("Target %s already exists and is non-empty", repr(args.target)) 190 | return -1 191 | 192 | # compress/decompress 193 | if args.action == "c": 194 | return compress(args.source, args.target, args.inner) 195 | elif args.action == "d": 196 | return decompress(args.source, args.target) 197 | 198 | if __name__ == "__main__": 199 | sys.exit(main()) 200 | -------------------------------------------------------------------------------- /memscrimper_cpp_implementation/src/request_handler.cpp: -------------------------------------------------------------------------------- 1 | // Copyright [2019] 2 | 3 | #include "request_handler.h" 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include "interdedup_compress.h" 17 | #include "interdedup_decompress.h" 18 | #include "utils.h" 19 | 20 | 21 | namespace mscr { 22 | 23 | request_handler::request_handler(uint32_t thread_count) : worker_threads_(thread_count) { 24 | BOOST_LOG_TRIVIAL(info) << "starting workerpool with " << thread_count 25 | << " threads"; 26 | } 27 | 28 | 29 | request_handler::~request_handler() { 30 | this->worker_threads_.join(); 31 | } 32 | 33 | 34 | int request_handler::handle_request(std::string msg) { 35 | // parse opcode 36 | auto opcode = static_cast(msg[0]); 37 | // we cut away opcode byte, because this will not be needed anymore 38 | msg.erase(0, 1); 39 | 40 | // dispatch 41 | switch (opcode) { 42 | case 0x00: 43 | BOOST_LOG_TRIVIAL(info) << "got request: add reference"; 44 | boost::asio::post(this->worker_threads_, 45 | boost::bind(&request_handler::add_reference, 46 | this, msg)); 47 | break; 48 | 49 | case 0x01: 50 | BOOST_LOG_TRIVIAL(info) << "got request: compress"; 51 | boost::asio::post(this->worker_threads_, 52 | boost::bind(&request_handler::compress_dump, 53 | this, msg)); 54 | break; 55 | 56 | case 0x02: 57 | 
BOOST_LOG_TRIVIAL(info) << "got request: decompress"; 58 | boost::asio::post(this->worker_threads_, 59 | boost::bind(&request_handler::decompress_dump, 60 | this, msg)); 61 | break; 62 | 63 | case 0x04: 64 | BOOST_LOG_TRIVIAL(info) << "got request: delete reference"; 65 | boost::asio::post(this->worker_threads_, 66 | boost::bind(&request_handler::del_reference, 67 | this, msg)); 68 | break; 69 | 70 | default: 71 | BOOST_LOG_TRIVIAL(error) << "got request: unknown opcode - received: " 72 | << std::to_string(opcode); 73 | return 1; 74 | } 75 | 76 | return 0; 77 | } 78 | 79 | 80 | int request_handler::compress_dump(std::string msg_str) { 81 | // parse reference path 82 | const char *msg = msg_str.c_str(); 83 | int msg_offset = 0; // skip first (len) and second (opcode) byte 84 | std::string ref_path(msg + msg_offset); 85 | BOOST_LOG_TRIVIAL(debug) << "ref_path: " << ref_path; 86 | msg_offset += ref_path.size() + 1; 87 | 88 | // parse source dump path 89 | std::string srcdump_path(msg + msg_offset); 90 | BOOST_LOG_TRIVIAL(debug) << "srcdump_path: " << srcdump_path; 91 | msg_offset += srcdump_path.size() + 1; 92 | 93 | // parse out file path 94 | std::string filename_out(msg + msg_offset); 95 | BOOST_LOG_TRIVIAL(debug) << "file_out: " << filename_out; 96 | msg_offset += filename_out.size() + 1; 97 | 98 | // parse page size 99 | uint32_t pagesize = read_num_LE(msg + msg_offset, 4); 100 | BOOST_LOG_TRIVIAL(debug) << "pagesize: " << pagesize; 101 | msg_offset += 4; 102 | 103 | // parse intra 104 | bool intra = msg[msg_offset] == '\x01'; 105 | BOOST_LOG_TRIVIAL(debug) << "intra: " << intra; 106 | msg_offset += 1; 107 | 108 | // parse diffing 109 | bool diffing = msg[msg_offset] == '\x01'; 110 | BOOST_LOG_TRIVIAL(debug) << "diffing: " << diffing; 111 | msg_offset += 1; 112 | 113 | // parse inner compression 114 | compression inner; 115 | switch (msg[msg_offset]) { 116 | case '\x00': 117 | inner = compression::ZIP7; 118 | BOOST_LOG_TRIVIAL(debug) << "inner: zip7"; 119 | 
break; 120 | 121 | case '\x01': 122 | inner = compression::GZIP; 123 | BOOST_LOG_TRIVIAL(debug) << "inner: gzip"; 124 | break; 125 | 126 | case '\x02': 127 | inner = compression::BZIP2; 128 | BOOST_LOG_TRIVIAL(debug) << "inner: bzip2"; 129 | break; 130 | 131 | case '\x03': 132 | inner = compression ::NOINNER; 133 | BOOST_LOG_TRIVIAL(debug) << "inner: noinner"; 134 | break; 135 | 136 | default: 137 | BOOST_LOG_TRIVIAL(error) << "invalid inner compression method"; 138 | return 1; 139 | } 140 | 141 | // retrieve reference dump 142 | std::shared_ptr refdump = get_refdump(ref_path, pagesize); 143 | if (refdump == nullptr) { 144 | return 1; 145 | } 146 | 147 | // parse source dump 148 | memdump srcdump(srcdump_path); 149 | int ret = srcdump.readDumpfile(pagesize); 150 | if (ret != 0) { 151 | BOOST_LOG_TRIVIAL(error) << "error reading srcdump"; 152 | return 1; 153 | } 154 | 155 | // compress 156 | interdedup_compress(refdump, srcdump, filename_out.c_str(), 157 | inner, diffing, intra); 158 | 159 | return 0; 160 | } 161 | 162 | 163 | int request_handler::decompress_dump(std::string msg_str) { 164 | // parse source dump 165 | const char *msg = msg_str.c_str(); 166 | int msg_offset = 0; 167 | std::string dump_path(msg + msg_offset); 168 | msg_offset += dump_path.size() + 1; 169 | 170 | // parse out path 171 | std::string out_path(msg + msg_offset); 172 | 173 | // decompress 174 | interdedup_decompress(this, dump_path.c_str(), out_path.c_str()); 175 | 176 | return 0; 177 | } 178 | 179 | 180 | int request_handler::add_reference(std::string msg_str) { 181 | // parse reference dump path 182 | const char *msg = msg_str.c_str(); 183 | int msg_offset = 0; 184 | std::string ref_path(msg + msg_offset); 185 | BOOST_LOG_TRIVIAL(debug) << "ref_path: " << ref_path; 186 | msg_offset += ref_path.size() + 1; 187 | 188 | // parse page size 189 | auto pagesize = static_cast(read_num_LE(msg + msg_offset, 4)); 190 | BOOST_LOG_TRIVIAL(debug) << "pagesize: " << pagesize; 191 | 192 | // parse 
reference dump 193 | std::shared_ptr refdump = std::make_shared(ref_path); 194 | int ret = refdump->readDumpfile(pagesize); 195 | if (ret != 0) { 196 | BOOST_LOG_TRIVIAL(error) << "error reading dumpfile"; 197 | return 1; 198 | } 199 | 200 | // replace dump if necessary 201 | std::lock_guard m_lock(this->mu_); 202 | int pos = find_refdump(ref_path); 203 | if (pos != -1) { 204 | this->refdumps_.erase(this->refdumps_.begin() + pos); 205 | } 206 | 207 | // add dump 208 | this->refdumps_.push_back(std::move(refdump)); 209 | BOOST_LOG_TRIVIAL(debug) << "added refdump (number of saved refdumps: " 210 | << this->refdumps_.size() << ")"; 211 | 212 | return 0; 213 | } 214 | 215 | 216 | int request_handler::del_reference(std::string msg_str) { 217 | // parse reference path 218 | const char *msg = msg_str.c_str(); 219 | int msg_offset = 0; 220 | std::string ref_path(msg + msg_offset); 221 | 222 | // remove dump 223 | std::lock_guard m_lock(this->mu_); 224 | int pos = find_refdump(ref_path); 225 | if (pos != -1) { 226 | this->refdumps_.erase(this->refdumps_.begin() + pos); 227 | } 228 | BOOST_LOG_TRIVIAL(debug) << "removed refdump (number of saved refdumps: " 229 | << refdumps_.size() << ")"; 230 | 231 | return 0; 232 | } 233 | 234 | 235 | std::shared_ptr request_handler::get_refdump(std::string path, 236 | uint32_t pagesize) { 237 | { 238 | std::lock_guard m_lock(this->mu_); 239 | int pos = find_refdump(path); 240 | if (pos != -1) { 241 | // we already have the dump - just return it 242 | BOOST_LOG_TRIVIAL(debug) << "refdump already loaded"; 243 | return this->refdumps_[pos]; 244 | } 245 | } // destroy lock_guard again 246 | 247 | // parse the dump 248 | std::shared_ptr dump = std::make_shared(path); 249 | int ret = dump->readDumpfile(pagesize); 250 | if (ret != 0) { 251 | BOOST_LOG_TRIVIAL(error) << "error reading refdump"; 252 | return nullptr; 253 | } 254 | 255 | // double check if the dump is added meanwhile to prevent data races 256 | std::lock_guard m_lock(this->mu_); 
257 | if (find_refdump(path) == -1) { 258 | this->refdumps_.push_back(dump); 259 | } 260 | int num_dumps = this->refdumps_.size(); 261 | BOOST_LOG_TRIVIAL(debug) << "added refdump (number of saved refdumps: " 262 | << num_dumps << ")"; 263 | return dump; 264 | } 265 | 266 | 267 | int request_handler::find_refdump(const std::string &path) { 268 | std::lock_guard m_lock(this->mu_); 269 | for (uint32_t i = 0; i < this->refdumps_.size(); i++) { 270 | std::string curr_path = (this->refdumps_[i])->getPath(); 271 | if (curr_path == path) { 272 | return i; 273 | } 274 | } 275 | return -1; 276 | } 277 | 278 | } // namespace mscr 279 | -------------------------------------------------------------------------------- /memscrimper_poc/util.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | # -*- coding: utf-8 -*- 3 | 4 | from logging.handlers import RotatingFileHandler 5 | 6 | import cStringIO 7 | import errno 8 | import logging 9 | import os 10 | import struct 11 | import sys 12 | 13 | import colorama 14 | 15 | 16 | def create_dir(path): 17 | try: 18 | os.makedirs(path) 19 | except OSError as e: 20 | if e.errno != errno.EEXIST: 21 | raise 22 | 23 | 24 | def create_header(method, uncompressed_size, majorversion=1, minorversion=1, magic_number="MBCR", pagesize=4096): 25 | ret = "" 26 | ret += magic_number + "\x00" 27 | ret += method + "\x00" 28 | ret += struct.pack(" 0: 51 | ret += 1 52 | n >>= 1 53 | return ret 54 | 55 | 56 | def diff(page1, page2): 57 | pagesize = len(page1) 58 | ret = [] 59 | intermediate = bytearray() 60 | previ = None 61 | for i in xrange(pagesize): 62 | first, second = page1[i], page2[i] 63 | if first == second: 64 | intermediate.append(second) 65 | elif first != second: 66 | if len(intermediate) <= 2 and previ is not None: 67 | curr[1] += intermediate 68 | curr[1].append(second) 69 | else: 70 | if previ is None: 71 | curr = [i, bytearray()] 72 | else: 73 | curr = [i - previ - 
def apply_diff(page, d):
    """Apply a list of patches to a page and return the patched page.

    :param page: original page contents (byte string)
    :param d: iterable of ``(rel, bs)`` tuples; ``rel`` is the distance from
        the end of the previous patch (or from the page start for the first
        one) to the start of this patch, ``bs`` holds the replacement bytes
    :return: patched page as a byte string (``str`` on Python 2, ``bytes`` on
        Python 3)
    :raises IndexError: if a patch reaches beyond the end of the page
    """
    ret = bytearray(page)
    offset = 0
    for rel, bs in d:
        offset += rel
        # element-wise assignment (instead of slice assignment) keeps the
        # IndexError for patches that do not fit into the page
        for i, byte in enumerate(bs):
            ret[offset + i] = byte
        offset += len(bs)
    # bytes() is str on Python 2; on Python 3 str(bytearray) would yield the
    # repr text instead of the raw data
    return bytes(ret)
def intervalize(l):
    """Collapse a list of numbers into inclusive runs of consecutive values.

    A value extends the current run only if it is exactly one larger than the
    run's right end; otherwise it opens a new run.

    Example: [1, 2, 3, 5, 6, 7, 8, 9, 13] -> [[1, 3], [5, 9], [13, 13]]

    :param l: list of integers
    :return: list of [left, right] pairs; empty list for empty input
    """
    intervals = []
    for value in l:
        if intervals and intervals[-1][1] + 1 == value:
            intervals[-1][1] = value
        else:
            intervals.append([value, value])
    return intervals
def get_pages(path, pagesize=4096):
    """Lazily yield the contents of a file in chunks of ``pagesize`` bytes.

    The file is opened in binary mode. The last chunk may be shorter than
    ``pagesize`` if the file size is not a multiple of it.

    :param path: path of the file to read
    :param pagesize: number of bytes per yielded chunk
    :return: generator of byte strings
    """
    with open(path, "rb") as dump:
        # read() returns an empty byte string at end of file, which stops iter()
        for page in iter(lambda: dump.read(pagesize), b""):
            yield page
options"); 64 | } 65 | 66 | // bind socket 67 | ret = bind(this->srv_sock_, 68 | reinterpret_cast(&(this->srv_addr_)), 69 | SUN_LEN(&(this->srv_addr_))); 70 | if (ret < 0) { 71 | BOOST_LOG_TRIVIAL(error) << "failed to bind socket"; 72 | close(this->srv_sock_); 73 | throw std::runtime_error("error binding socket"); 74 | } 75 | 76 | // enable signal handlers 77 | signal(SIGINT, reinterpret_cast(sig_handler)); 78 | signal(SIGTERM, reinterpret_cast(sig_handler)); 79 | } 80 | 81 | 82 | command_socket::~command_socket() { 83 | // close main socket 84 | close(this->srv_sock_); 85 | 86 | // close all other open sockets 87 | for (int sock : this->open_socks_) { 88 | BOOST_LOG_TRIVIAL(info) << "closing fd " << sock; 89 | close(sock); 90 | } 91 | 92 | // destroy socket file 93 | BOOST_LOG_TRIVIAL(info) << "deleting socketfile"; 94 | unlink(this->sock_path_.c_str()); 95 | } 96 | 97 | 98 | void command_socket::sig_handler(int) { 99 | /* this will be triggered if SIGNALS are caught and shutdown = true will 100 | * cause actual closing logic to happen */ 101 | shutdown_ = true; 102 | } 103 | 104 | 105 | void command_socket::start_listen() { 106 | // start listening 107 | int ret = listen(this->srv_sock_, kBacklogSize); 108 | if (ret) { 109 | BOOST_LOG_TRIVIAL(error) << "error on listen call to main socket"; 110 | return; 111 | } 112 | 113 | struct epoll_event events[kEpollQueueSize]; 114 | int epfd = epoll_create(kEpollQueueSize); 115 | this->open_socks_.insert(epfd); 116 | static struct epoll_event ev; 117 | 118 | // add main socket 119 | ev.events = EPOLLIN | EPOLLPRI | EPOLLERR | EPOLLHUP; 120 | ev.data.fd = this->srv_sock_; 121 | int res = epoll_ctl(epfd, EPOLL_CTL_ADD, this->srv_sock_, &ev); 122 | if (res && !shutdown_) { 123 | BOOST_LOG_TRIVIAL(error) << "Epoll error when adding main socket"; 124 | return; 125 | } 126 | this->open_socks_.insert(this->srv_sock_); 127 | 128 | // dispatch loop 129 | while (!shutdown_) { 130 | // handle timeouts 131 | for (auto it = 
this->timeout_counters_.begin(); 132 | it != this->timeout_counters_.end();) { 133 | int sock = it->first; 134 | int curr_timeout = it->second; 135 | 136 | // check if client exceeded timeout 137 | if (curr_timeout >= kTimeoutForClients) { 138 | BOOST_LOG_TRIVIAL(info) << "Client " << sock << " exceeded timeout"; 139 | close(sock); 140 | this->open_socks_.erase(sock); 141 | this->timeout_counters_.erase(it++); 142 | } else { 143 | // increase timeout counter 144 | this->timeout_counters_[sock] += 1; 145 | it++; 146 | } 147 | } 148 | 149 | // wait for event 150 | int num_rdy = epoll_wait(epfd, events, 10, kTimeoutEpoll); 151 | if (num_rdy < 0 && !shutdown_) { 152 | if (errno == EINTR) { 153 | // we were killed by an interrupt (e.g., system() fork) 154 | BOOST_LOG_TRIVIAL(warning) << "caught interrupt on epoll_wait"; 155 | continue; 156 | } else { 157 | // we got an unexpected error 158 | BOOST_LOG_TRIVIAL(error) << "unexpected error on epoll_wait: " 159 | << strerror(errno); 160 | 161 | // note that this will close the listener 162 | return; 163 | } 164 | } 165 | 166 | // process all events 167 | for (int i = 0; i < num_rdy; i++) { 168 | // get corresponding socket file descriptor holding the event 169 | int sock = events[i].data.fd; 170 | BOOST_LOG_TRIVIAL(debug) << "handling socket " << std::to_string(sock); 171 | 172 | if (sock == this->srv_sock_) { 173 | // if we're on the main socket, we have to add a new connection 174 | struct sockaddr_un cl_addr; 175 | memset(&cl_addr, '\0', sizeof(cl_addr)); 176 | socklen_t cl_len; 177 | memset(&cl_len, '\0', sizeof(cl_len)); 178 | 179 | // accept new client connection 180 | int cl_sock = accept(this->srv_sock_, 181 | reinterpret_cast(&cl_addr), &cl_len); 182 | if (cl_sock < 0) { 183 | if (errno == EAGAIN || errno == EWOULDBLOCK) { 184 | } else { 185 | BOOST_LOG_TRIVIAL(error) << "error accepting client connection"; 186 | } 187 | continue; 188 | } 189 | 190 | // make socket non-blocking 191 | int saved_flags = 
fcntl(cl_sock, F_GETFL); 192 | if (saved_flags < 0) { 193 | BOOST_LOG_TRIVIAL(error) << "error getting socket flags"; 194 | close(cl_sock); 195 | continue; 196 | } 197 | saved_flags |= O_NONBLOCK; 198 | int ret = fcntl(cl_sock, F_SETFL, saved_flags); 199 | if (ret < 0) { 200 | BOOST_LOG_TRIVIAL(error) << "error setting socket to non-blocking"; 201 | close(cl_sock); 202 | continue; 203 | } 204 | 205 | // add socket to epoll set 206 | ev.data.fd = cl_sock; 207 | ret = epoll_ctl(epfd, EPOLL_CTL_ADD, cl_sock, &ev); 208 | if (ret < 0) { 209 | BOOST_LOG_TRIVIAL(error) << "error adding new client to epoll"; 210 | close(cl_sock); 211 | continue; 212 | } 213 | this->open_socks_.insert(cl_sock); 214 | 215 | // initialize timeout counter 216 | this->timeout_counters_[cl_sock] = 0; 217 | } else { 218 | // reset timeout counter 219 | this->timeout_counters_[sock] = 0; 220 | 221 | // handle client connection 222 | handle_client_connection(sock, epfd, &ev); 223 | } 224 | } 225 | } 226 | BOOST_LOG_TRIVIAL(info) << "shutting down"; 227 | } 228 | 229 | 230 | void command_socket::handle_client_connection(int cl_sock, int epfd, struct epoll_event* ev) { 231 | // store read bytes here -- we know that no message exceeds 2048B payload + 1 Byte size 232 | char read_buf[2049] = { 0 }; 233 | 234 | // read first byte 235 | int read_byte = read(cl_sock, read_buf, 1); 236 | 237 | if (read_byte <= 0) { 238 | if (errno == EAGAIN || errno == EWOULDBLOCK) { 239 | // socket would block -> ignore the request and try again later 240 | return; 241 | } 242 | 243 | /* peer closed socket or something else failed - bye bye 244 | * at first remove fd from epoll set (before closing - otherwise delete from epoll set won't 245 | * work) 246 | * this is important because forks from bp::child(utils.cpp) can still hold the fd */ 247 | epoll_ctl(epfd, EPOLL_CTL_DEL, cl_sock, nullptr); 248 | BOOST_LOG_TRIVIAL(debug) << "closing fd " << cl_sock; 249 | int ret = close(cl_sock); 250 | 251 | // check if we closed the 
socket successfully 252 | if (ret) { 253 | BOOST_LOG_TRIVIAL(error) << "error closing fd " << cl_sock << "err: " << strerror(errno); 254 | // we failed on closing so we add the fd back to our epoll set 255 | ev->data.fd = cl_sock; 256 | epoll_ctl(epfd, EPOLL_CTL_ADD, cl_sock, ev); 257 | } else { 258 | // remove closed socket from management lists 259 | this->timeout_counters_.erase(cl_sock); 260 | this->open_socks_.erase(cl_sock); 261 | } 262 | } else { 263 | // first byte encodes our message length 264 | uint32_t msglen = static_cast(read_buf[0]) * 8; 265 | 266 | // read rest of the message 267 | uint32_t read_bytes = read(cl_sock, read_buf + 1, msglen); 268 | std::string msg(read_buf, read_bytes + 1); 269 | auto msgid = static_cast(read_buf[1]); 270 | 271 | // sanity check 272 | if (read_bytes != msglen) { 273 | BOOST_LOG_TRIVIAL(warning) << "received broken or incomplete message. " 274 | << "expected " << msglen << "B - read: " << read_bytes << "B." 275 | << "Aborting Request."; 276 | // pack MSG_ID 277 | std::vector cl_ack = int_to_byte_LE(msgid, 1); // msg id 278 | // append failure byte (0x0) 279 | cl_ack.push_back((int_to_byte_LE(0, 1)).front()); 280 | // send failed ACK to client 281 | send(cl_sock, cl_ack.data(), 2, 0); 282 | } else { 283 | // pack MSG_ID 284 | std::vector cl_ack = int_to_byte_LE(msgid, 1); 285 | // append success byte (0x1) 286 | cl_ack.push_back((int_to_byte_LE(1, 1)).front()); 287 | // send ACK to client 288 | send(cl_sock, cl_ack.data(), 2, 0); 289 | /* cast to string so we can pass by value 290 | * read_buf +2 will cut of MSG_LEN and MSG_ID (only needed for networking) */ 291 | std::string msg(read_buf + 2 , read_bytes - 1); 292 | 293 | // request handler will process the request 294 | this->handler_->handle_request(msg); 295 | } 296 | } 297 | } 298 | 299 | } // namespace mscr 300 | -------------------------------------------------------------------------------- /memscrimper_cpp_implementation/mscr_client_python2.py: 
-------------------------------------------------------------------------------- 1 | #! /usr/bin/env python2 2 | # Copyright [2019] 3 | 4 | import socket 5 | import struct 6 | import enum 7 | import math 8 | import time 9 | import collections 10 | 11 | 12 | class Compression(enum.Enum): 13 | ZIP7 = 0 14 | GZIP = 1 15 | BZIP2 = 2 16 | NOINNER = 3 17 | 18 | 19 | class MscrClient: 20 | _sock_path = "" # holds socket path 21 | _cl_sock = "" # holds client socket 22 | _await_answer = True # bool to check if we should wait for answer from service 23 | _ack_buffer = collections.deque(maxlen=256) # used as a ring buffer 24 | _messageid = 0 25 | _sending_attempts = 0 26 | _verbose = False 27 | 28 | def __init__(self, sock_path, await_answer, sending_attempts=3, verbose=False): 29 | """ 30 | :param sock_path: string 31 | path to memscrimper socketfile 32 | :param await_answer: bool 33 | Block while waiting for command acknowledge 34 | :param sending_attempts: how often the client tries to resend a command on failure 35 | :param verbose: enable debug prints 36 | """ 37 | self._sock_path = sock_path 38 | self._cl_sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) 39 | self._cl_sock.connect(sock_path) 40 | self._cl_sock.settimeout(15) # set timeout for all operations 41 | self._await_answer = await_answer 42 | self._sending_attempts = sending_attempts 43 | self._verbose = verbose 44 | 45 | def __del__(self): 46 | self._cl_sock.close() 47 | 48 | # Message-format: LEN(1B) | MSGID(1B) | OPCODE(1B) | OPTIONS(XB) 49 | # LEN will be denoted as x8. 
and we use nullbytes as padding after the last option 50 | # the LEN-byte itself will not be part of the amount in len 51 | 52 | def add_referencedump(self, reference_path, pagesize): 53 | """ 54 | OPCODE: 0x00 55 | :param reference_path: string 56 | path to reference dump 57 | :param pagesize: int 58 | pagesize of reference dump 59 | :return: True : success - False: failure (only if used in blocking mode) 60 | """ 61 | 62 | msg_len_nopad = 6 # +1 for msgid / +1 for opcode / +4 for ps 63 | msg_len_nopad += len(reference_path) + 1 # +1 for nullbyte 64 | pad_len = self._calc_padding(msg_len_nopad) 65 | msg = struct.pack("B", math.floor((msg_len_nopad + pad_len) / 8)) 66 | self._debug_print("LEN-byte: %d" % math.floor((msg_len_nopad + pad_len) / 8)) 67 | 68 | msgid = self._get_message_id() 69 | msg += struct.pack("B", msgid) # MSGID 70 | 71 | msg += '\x00' # OPCODE 72 | 73 | msg += str.encode(reference_path) 74 | msg += '\0' 75 | 76 | msg += struct.pack(" 3 | 4 | import socket 5 | import struct 6 | import enum 7 | import math 8 | import time 9 | import collections 10 | 11 | 12 | class Compression(enum.Enum): 13 | ZIP7 = 0 14 | GZIP = 1 15 | BZIP2 = 2 16 | NOINNER = 3 17 | 18 | 19 | class MscrClient: 20 | _sock_path = "" # holds socket path 21 | _cl_sock = "" # holds client socket 22 | _await_answer = True # bool to check if we should wait for answer from service 23 | _ack_buffer = collections.deque(maxlen=256) # used as a ring buffer 24 | _messageid = 0 25 | _sending_attempts = 0 26 | _verbose = False 27 | 28 | def __init__(self, sock_path, await_answer, sending_attempts=3, verbose=False): 29 | """ 30 | :param sock_path: string 31 | path to memscrimper socketfile 32 | :param await_answer: bool 33 | Block while waiting for command acknowledge 34 | :param sending_attempts: how often the client tries to resend a command on failure 35 | :param verbose: enable debug prints 36 | """ 37 | self._sock_path = sock_path 38 | self._cl_sock = socket.socket(socket.AF_UNIX, 
socket.SOCK_STREAM) 39 | self._cl_sock.connect(sock_path) 40 | self._cl_sock.settimeout(15) # set timeout for all operations 41 | self._await_answer = await_answer 42 | self._sending_attempts = sending_attempts 43 | self._verbose = verbose 44 | 45 | def __del__(self): 46 | self._cl_sock.close() 47 | 48 | # Message-format: LEN(1B) | MSGID(1B) | OPCODE(1B) | OPTIONS(XB) 49 | # LEN will be denoted as x8. and we use nullbytes as padding after the last option 50 | # the LEN-byte itself will not be part of the amount in len 51 | 52 | def add_referencedump(self, reference_path, pagesize): 53 | """ 54 | OPCODE: 0x00 55 | :param reference_path: string 56 | path to reference dump 57 | :param pagesize: int 58 | pagesize of reference dump 59 | :return: True : success - False: failure (only if used in blocking mode) 60 | """ 61 | 62 | msg_len_nopad = 6 # +1 for msgid / +1 for opcode / +4 for ps 63 | msg_len_nopad += len(reference_path) + 1 # +1 for nullbyte 64 | pad_len = self._calc_padding(msg_len_nopad) 65 | msg = struct.pack("B", math.floor((msg_len_nopad + pad_len) / 8)) 66 | self._debug_print("LEN-byte: %d" % math.floor((msg_len_nopad + pad_len) / 8)) 67 | 68 | msgid = self._get_message_id() 69 | msg += struct.pack("B", (msgid)) # MSGID 70 | 71 | msg += b'\x00' # OPCODE 72 | 73 | msg += str.encode(reference_path) 74 | msg += b'\0' 75 | 76 | msg += struct.pack(" 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #include "interdedup_compress.h" 20 | #include "interdedup_decompress.h" 21 | #include "request_handler.h" 22 | #include "socket_api.h" 23 | #include "memdump.h" 24 | 25 | 26 | #ifndef DEBUGMODE 27 | #define DEBUGMODE 0 28 | #endif 29 | 30 | typedef boost::log::sinks::synchronous_sink FileSink; 31 | 32 | template 33 | void log_fmt(boost::log::record_view const& rec, 34 | boost::log::formatting_ostream& stream) { 35 | 
namespace log = boost::log; 36 | namespace ptime = boost::posix_time; 37 | 38 | // add color on terminal 39 | auto severity = rec[log::trivial::severity]; 40 | if (isconsole && severity) { 41 | switch (severity.get()) { 42 | case log::trivial::debug: 43 | stream << "\033[36m"; 44 | break; 45 | case log::trivial::info: 46 | stream << "\033[32m"; 47 | break; 48 | case log::trivial::warning: 49 | stream << "\033[33m"; 50 | break; 51 | case log::trivial::error: 52 | stream << "\033[31m"; 53 | break; 54 | default: 55 | break; 56 | } 57 | } 58 | 59 | // extract timestamp + thread id 60 | auto timestamp = log::extract("TimeStamp", rec); 61 | auto tid = log::extract("ThreadID", rec); 62 | 63 | // format timestamp 64 | ptime::time_facet *df = new ptime::time_facet("%Y-%m-%d %H:%M:%S%F"); 65 | stream.imbue(std::locale(std::locale::classic(), df)); 66 | 67 | // format actual message 68 | stream << "[" << timestamp << "|"; 69 | const char *str = log::trivial::to_string(severity.get()); 70 | for (int i = 0; i < 3; i++) { 71 | stream << static_cast(std::toupper(str[i])); 72 | } 73 | stream << "][" << tid << "] "; 74 | 75 | // change color back to normal if necessary 76 | if (isconsole && severity) { 77 | stream << "\033[0m"; 78 | } 79 | 80 | // print actual message 81 | stream << rec[log::expressions::smessage]; 82 | } 83 | 84 | 85 | void init_file_collecting(boost::shared_ptr sink, const uint32_t logsize, 86 | const uint32_t number_diff_logfiles) { 87 | // configure logfile rotation 88 | namespace keywords = boost::log::keywords; 89 | namespace sinks = boost::log::sinks; 90 | sink->locked_backend()->set_file_collector(sinks::file::make_collector( 91 | keywords::target = "logs", // store logfiles in folder named "logs" 92 | keywords::max_size = logsize * number_diff_logfiles, 93 | keywords::max_files = number_diff_logfiles)); 94 | } 95 | 96 | 97 | void init_logging() { 98 | namespace log = boost::log; 99 | namespace sinks = log::sinks; 100 | namespace trivial = log::trivial; 101 
| namespace keywords = boost::log::keywords; 102 | 103 | typedef sinks::synchronous_sink text_sink; 104 | const uint32_t logsize = 1 * 1024 * 1024; 105 | const uint32_t number_diff_logfiles = 10; 106 | 107 | // add things like timestamp, threadid, etc... + severity 108 | log::add_common_attributes(); 109 | log::register_simple_formatter_factory( 110 | "Severity"); 111 | 112 | // create file sink 113 | auto fsink = boost::make_shared( 114 | keywords::file_name = "memscrimper_%Y-%m-%d_%H-%M-%S.log", 115 | keywords::target = "logs", 116 | keywords::rotation_size = logsize, 117 | keywords::auto_flush = true, 118 | keywords::enable_final_rotation = true, // move the last log file to log-folder 119 | keywords::open_mode = std::ios_base::out | std::ios_base::app); 120 | 121 | init_file_collecting(fsink, logsize, number_diff_logfiles); 122 | fsink->locked_backend()->scan_for_files(); 123 | fsink->set_formatter(&log_fmt); 124 | log::core::get()->add_sink(fsink); 125 | 126 | // create stderr sink 127 | auto sink = boost::make_shared(); 128 | boost::shared_ptr stream(&std::cerr, boost::null_deleter()); 129 | sink->locked_backend()->add_stream(stream); 130 | sink->set_filter(trivial::severity >= trivial::warning); 131 | sink->set_formatter(&log_fmt); 132 | log::core::get()->add_sink(sink); 133 | 134 | // create stdout sink 135 | sink = boost::make_shared(); 136 | stream = boost::shared_ptr(&std::cout, boost::null_deleter()); 137 | sink->locked_backend()->add_stream(stream); 138 | if (DEBUGMODE) { 139 | sink->set_filter(trivial::severity < trivial::warning); 140 | } else { 141 | sink->set_filter(trivial::severity == trivial::info); 142 | } 143 | sink->set_formatter(&log_fmt); 144 | log::core::get()->add_sink(sink); 145 | } 146 | 147 | 148 | void print_help(const std::string &program_name) { 149 | std::cout << "GENERAL USAGE: \t\t" << program_name 150 | << " [-h| ]\n" 151 | << "-------------------------------------------------------------\n" 152 | << "COMPRESS: \t\t" << 
program_name 153 | << " c \n" 154 | << "\t\t\t \n" 155 | << "DECOMPRESS: \t\t" << program_name 156 | << " d \n" 157 | << "START AS A SERVICE: \t" << program_name << " s " 158 | << "\n" 159 | << "-------------------------------------------------------------\n" 160 | << "Valid inner compression methods: \n" 161 | << "'gzip': \tGZIP compression (requires utility gzip/gunzip)\n" 162 | << "'bzip2': \tBZIP2 compression (requires utility bzip2/bunzip2)\n" 163 | << "'7zip': \t7ZIP compression (requires utility 7za)\n" 164 | << "'0': \t\tdisables inner compression\n" 165 | << "\nValid values for intra/diffing:\n" 166 | << "'0': \tdisabled intra/diffing\n" 167 | << "'1': \tenables intra/diffing" << std::endl; 168 | } 169 | 170 | 171 | int main(int argc, char *argv[]) { 172 | init_logging(); 173 | 174 | // print usage if necessary 175 | if (argc < 2 || !std::strcmp(argv[1], "-h")) { 176 | print_help(argv[0]); 177 | exit(EXIT_SUCCESS); 178 | } 179 | 180 | // parse compression/decompression/service args 181 | if (!std::strcmp(argv[1], "c") || !std::strcmp(argv[1], "C")) { 182 | // check number of arguments 183 | if (argc != 9) { 184 | BOOST_LOG_TRIVIAL(error) << "invalid number of arguments"; 185 | exit(EXIT_FAILURE); 186 | } 187 | 188 | // parse paths + pagesize 189 | char *ref_path = argv[2]; 190 | char *src_path = argv[3]; 191 | char *out_path = argv[4]; 192 | uint32_t pagesize = atoi(argv[5]); 193 | 194 | // parse inner compression 195 | mscr::compression inner; 196 | if (!std::strcmp(argv[6], "bzip2")) { 197 | inner = mscr::compression::BZIP2; 198 | } else if (!std::strcmp(argv[6], "gzip")) { 199 | inner = mscr::compression::GZIP; 200 | } else if (!std::strcmp(argv[6], "7zip")) { 201 | inner = mscr::compression::ZIP7; 202 | } else if (!std::strcmp(argv[6], "0")) { 203 | inner = mscr::compression::NOINNER; 204 | } else { 205 | BOOST_LOG_TRIVIAL(error) << "invalid compression method chosen (" 206 | << "valid ones are: bzip2, gzip, 7zip, 0)"; 207 | exit(EXIT_FAILURE); 208 | } 
209 | 210 | // parse diffing/intra 211 | bool diffing = atoi(argv[7]) == 1; 212 | bool intra = atoi(argv[8]) == 1; 213 | 214 | // show some info 215 | BOOST_LOG_TRIVIAL(info) << "compressing\n" 216 | << "refpath: " << ref_path << "\n" 217 | << "srcpath: " << src_path << "\n" 218 | << "outpath: " << out_path << "\n" 219 | << "pagesize: " << pagesize << "\n" 220 | << "compressing: " << argv[6] << "\n" 221 | << "diffing: " << diffing << "\n" 222 | << "intra: " << intra; 223 | 224 | // create and read refdump file 225 | std::shared_ptr ref = std::make_shared( 226 | ref_path); 227 | int ret = ref->readDumpfile(pagesize); 228 | if (ret) { 229 | BOOST_LOG_TRIVIAL(error) << "error when reading refdump"; 230 | exit(EXIT_FAILURE); 231 | } 232 | 233 | // create and read srcdump file 234 | mscr::memdump srcdump(src_path); 235 | ret = srcdump.readDumpfile(pagesize); 236 | if (ret) { 237 | BOOST_LOG_TRIVIAL(error) << "error when reading srcdump"; 238 | exit(EXIT_FAILURE); 239 | } 240 | 241 | // compress 242 | mscr::interdedup_compress(ref, srcdump, out_path, inner, diffing, intra); 243 | } else if (!std::strcmp(argv[1], "d") || !std::strcmp(argv[1], "D")) { 244 | // check number of arguments 245 | if (argc != 4) { 246 | BOOST_LOG_TRIVIAL(error) << "invalid number of arguments"; 247 | exit(EXIT_FAILURE); 248 | } 249 | 250 | // parse paths 251 | char *dump_path = argv[2]; 252 | char *out_path = argv[3]; 253 | 254 | // show some info 255 | BOOST_LOG_TRIVIAL(info) << "compressed dumpfile: " << dump_path << "\n" 256 | << "outfile: " << out_path << "\n" 257 | << "decompressing"; 258 | 259 | // decompress 260 | mscr::interdedup_decompress(nullptr, dump_path, out_path); 261 | } else if (!std::strcmp(argv[1], "s") || !std::strcmp(argv[1], "S")) { 262 | 263 | if (argc != 4) { 264 | BOOST_LOG_TRIVIAL(error) << "invalid number of arguments"; 265 | exit(EXIT_FAILURE); 266 | } 267 | // parse workerthread count 268 | uint32_t thread_count = atoi(argv[2]); 269 | // parse socket file path 270 | 
std::string socket_path(argv[3]); 271 | 272 | BOOST_LOG_TRIVIAL(info) << "starting service"; 273 | mscr::request_handler handler(thread_count); 274 | try { 275 | mscr::command_socket mscr_sock(socket_path, &handler); 276 | mscr_sock.start_listen(); 277 | } catch (std::runtime_error &excp) { 278 | BOOST_LOG_TRIVIAL(error) << "error occurred: " << excp.what(); 279 | exit(EXIT_FAILURE); 280 | } 281 | } else { 282 | BOOST_LOG_TRIVIAL(error) << "invalid first argument"; 283 | exit(EXIT_FAILURE); 284 | } 285 | 286 | BOOST_LOG_TRIVIAL(debug) << "removing log sinks"; 287 | /* this prevents crashes caused by boost final file rotation -- more details: 288 | * https://www.boost.org/doc/libs/1_68_0/libs/log/doc/html/log/rationale/why_crash_on_term.html */ 289 | boost::log::core::get()->remove_all_sinks(); 290 | 291 | return EXIT_SUCCESS; 292 | } 293 | 294 | -------------------------------------------------------------------------------- /memscrimper_poc/compress_interdedup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | # -*- coding: utf-8 -*- 3 | 4 | from collections import defaultdict as dd 5 | 6 | import argparse 7 | import gzip 8 | import util 9 | import sys 10 | import os 11 | import logging 12 | import shutil 13 | import struct 14 | import subprocess 15 | import tempfile 16 | 17 | import bz2file 18 | 19 | 20 | def compress(source, target, reference, nointra, delta, inner, pagesize=4096): 21 | # some info 22 | logging.debug("Starting compression of %s to %s", repr(source), repr(target)) 23 | logging.debug("Page size: %d", pagesize) 24 | logging.debug("Reference dump: %s", reference) 25 | 26 | # pages + page numbers bookkeeping 27 | reference_pages, reference_pagenrs = [], {} 28 | for i, page in enumerate(util.get_pages(reference)): 29 | reference_pages.append(page) 30 | if page not in reference_pagenrs: 31 | reference_pagenrs[page] = i 32 | reference_pages_set = set(reference_pages) 33 | 34 | # find new 
+ duplicatable pages 35 | dedups = dd(list) 36 | diffs = dd() 37 | diff_seen = set() 38 | if nointra: 39 | new_pagenrs = [] 40 | else: 41 | new_pagenrs = dd(list) 42 | new_pages = [] 43 | same_distinct, same_total = set(), 0 44 | source_pages = [] 45 | for i, page in enumerate(util.get_pages(source)): 46 | source_pages.append(page) 47 | if reference_pages[i] != page: 48 | if page not in reference_pages_set: 49 | if delta is not None: 50 | d = util.create_diff(reference_pages[i], page) 51 | if d is not None: 52 | diff_seen.add(page) 53 | diffs[i] = d 54 | continue 55 | if nointra: 56 | new_pagenrs.append(i) 57 | else: 58 | new_pagenrs[page].append(i) 59 | new_pages.append(page) 60 | else: 61 | dedups[page].append(i) 62 | else: 63 | same_total += 1 64 | same_distinct.add(page) 65 | source_pages_set = set(source_pages) 66 | newpagescnt = len(new_pages), len(set(new_pages)) 67 | 68 | # intervalize 69 | if nointra: 70 | new_pagenrs = util.intervalize(new_pagenrs) 71 | else: 72 | new_pagenrs = {page: util.intervalize(new_pagenrs[page]) for page in new_pagenrs} 73 | dedups = {page: util.intervalize(dedups[page]) for page in dedups} 74 | 75 | # write file 76 | util.create_dir(".tmp") 77 | tmphandle, tmpfile = tempfile.mkstemp(dir=".tmp") 78 | try: 79 | with open(tmpfile, "wb") as ftmp: 80 | ftmp.write(reference + "\x00") 81 | inorder = [] 82 | seen = set() 83 | for page in reference_pages: 84 | if page in dedups and page not in seen: 85 | inorder.append(page) 86 | seen.add(page) 87 | util.create_pagenr_list([reference_pagenrs[page] for page in inorder], ftmp) 88 | for page in inorder: 89 | ftmp.write(util.create_interval_list(dedups[page])) 90 | if delta is not None: 91 | util.create_pagenr_list(sorted(diffs), ftmp) 92 | for pagenr in sorted(diffs): 93 | ftmp.write(diffs[pagenr]) 94 | if nointra: 95 | ftmp.write(util.create_interval_list(new_pagenrs)) 96 | for page in new_pages: 97 | ftmp.write(page) 98 | else: 99 | ftmp.write(struct.pack(" 1 else None 278 | 279 | return 
nointra, delta, inner 280 | 281 | def main(): 282 | # cli args checking 283 | parser = argparse.ArgumentParser() 284 | subparsers = parser.add_subparsers(dest="action") 285 | parser_c = subparsers.add_parser("c") 286 | parser_c.add_argument("source", type=str) 287 | parser_c.add_argument("target", type=str) 288 | parser_c.add_argument("reference", type=str) 289 | parser_c.add_argument("--inner", type=str) 290 | parser_c.add_argument("--delta", type=str) 291 | parser_c.add_argument("--nointra", action="store_true") 292 | parser_d = subparsers.add_parser("d") 293 | parser_d.add_argument("source", type=str) 294 | parser_d.add_argument("target", type=str) 295 | args = parser.parse_args() 296 | 297 | # create method name 298 | if args.action == "c": 299 | method = create_method_name(args.nointra, args.delta, args.inner) 300 | elif args.action == "d": 301 | with open(args.source, "rb") as f: 302 | method = util.parse_header(f)[1] 303 | 304 | # set up logging 305 | util.create_dir("logs") 306 | util.configure_logging(method, "logs/{}.log".format(method)) 307 | 308 | # check if files do (not) exist 309 | if not os.path.isfile(args.source): 310 | logging.error("Source %s does not exist", repr(args.source)) 311 | return -1 312 | if os.path.isfile(args.target) and os.path.getsize(args.target) > 0: 313 | logging.error("Target %s already exists and is non-empty", repr(args.target)) 314 | return -1 315 | if args.action == "c" and not os.path.isfile(args.reference): 316 | logging.error("Reference %s does not exist", repr(args.reference)) 317 | return -1 318 | 319 | # compress/decompress 320 | if args.action == "c": 321 | return compress(args.source, args.target, args.reference, args.nointra, args.delta, args.inner) 322 | elif args.action == "d": 323 | return decompress(args.source, args.target) 324 | 325 | if __name__ == "__main__": 326 | sys.exit(main()) 327 | -------------------------------------------------------------------------------- 
/memscrimper_cpp_implementation/src/interdedup_decompress.cpp: -------------------------------------------------------------------------------- 1 | // Copyright [2019] 2 | 3 | #include "interdedup_decompress.h" 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #include "memdump.h" 24 | #include "request_handler.h" 25 | #include "utils.h" 26 | 27 | namespace mscr { 28 | 29 | enum class compression { 30 | ZIP7, GZIP, BZIP2, NOINNER 31 | }; 32 | 33 | 34 | static int decompress_file(const std::string &file_in, const compression &inner, 35 | std::string *file_out) { 36 | /* returns the decompressed body from the 37 | * chosen compression */ 38 | BOOST_LOG_TRIVIAL(debug) << "starting inner decompression"; 39 | std::stringstream input_stream(file_in); 40 | boost::iostreams::filtering_istream in; 41 | switch (inner) { 42 | case compression::ZIP7: 43 | in.push(boost::iostreams::lzma_decompressor()); 44 | break; 45 | case compression::GZIP: 46 | in.push(boost::iostreams::gzip_decompressor()); 47 | break; 48 | case compression::BZIP2: 49 | in.push(boost::iostreams::bzip2_decompressor()); 50 | break; 51 | case compression::NOINNER: 52 | // nothing to do here 53 | *file_out = file_in; 54 | BOOST_LOG_TRIVIAL(debug) << "finished inner decompression"; 55 | return 0; 56 | } 57 | in.push(input_stream); 58 | std::stringstream readbuf; 59 | boost::iostreams::copy(in, readbuf); 60 | *file_out = readbuf.str(); 61 | BOOST_LOG_TRIVIAL(debug) << "finished inner decompression"; 62 | return 0; 63 | } 64 | 65 | 66 | static int read_header(std::istream *file, std::string *method, 67 | uint32_t *pagesize, uint64_t *uncompressed_size) { 68 | // parse magic number 69 | std::string magicnum = read_string(file); 70 | BOOST_LOG_TRIVIAL(debug) << "reading header"; 71 | BOOST_LOG_TRIVIAL(debug) << "\t magicnum: " << magicnum; 
72 | if (magicnum != "MBCR") { 73 | BOOST_LOG_TRIVIAL(error) << "magic number mismatch"; 74 | return 1; 75 | } 76 | 77 | // parse method 78 | *method = read_string(file); 79 | BOOST_LOG_TRIVIAL(debug) << "\t method: " << *method; 80 | 81 | // parse major version 82 | auto major_version = read_num_LE(file, 2); 83 | BOOST_LOG_TRIVIAL(debug) << "\t major version: " << major_version; 84 | 85 | // parse minor version 86 | auto minor_version = read_num_LE(file, 2); 87 | BOOST_LOG_TRIVIAL(debug) << "\t minor version: " << minor_version; 88 | 89 | // parse page size 90 | *pagesize = read_num_LE(file, 4); 91 | BOOST_LOG_TRIVIAL(debug) << "\t pagesize: " << *pagesize; 92 | 93 | // parse uncompressed size 94 | *uncompressed_size = read_num_LE(file, 8); 95 | BOOST_LOG_TRIVIAL(debug) << "\t uncompressed size: " << *uncompressed_size; 96 | BOOST_LOG_TRIVIAL(debug) << "finished reading header"; 97 | 98 | return 0; 99 | } 100 | 101 | 102 | static int parse_method(std::string method, compression *inner, bool *intra, 103 | bool *diffing) { 104 | // sanity check for interdedup 105 | if (method.find("interdedup") == std::string::npos) { 106 | return 1; 107 | } 108 | method.erase(0, ((std::string) "interdedup").size()); 109 | 110 | // check intra 111 | if (str_starts_with(method, "nointra")) { 112 | *intra = false; 113 | method.erase(0, ((std::string) "nointra").size()); 114 | } else { 115 | *intra = true; 116 | } 117 | 118 | // check diffing 119 | if (str_starts_with(method, "delta")) { 120 | *diffing = true; 121 | method.erase(0, ((std::string) "delta").size()); 122 | } else { 123 | *diffing = false; 124 | } 125 | 126 | // check inner compression 127 | if (str_starts_with(method, "7zip")) { 128 | *inner = compression::ZIP7; 129 | } else if (str_starts_with(method, "gzip")) { 130 | *inner = compression::GZIP; 131 | } else if (str_starts_with(method, "bzip2")) { 132 | *inner = compression::BZIP2; 133 | } else { 134 | *inner = compression::NOINNER; 135 | 136 | // we should have consumed 
the whole method string 137 | if (!method.empty()) { 138 | BOOST_LOG_TRIVIAL(error) << "Method is not empty after parsing."; 139 | return 1; 140 | } 141 | } 142 | return 0; 143 | } 144 | 145 | 146 | static std::vector parse_pagenr_list(std::istream *fp) { 147 | // will hold the final list of page numbers 148 | std::vector pagenr_list; 149 | uint32_t sz = read_num_LE(fp, 4); 150 | pagenr_list.reserve(sz); 151 | 152 | // keeps track of the previous number for delta encoding 153 | uint32_t prev = 0; 154 | 155 | // go through the list number by number 156 | for (uint32_t i = 0; i < sz; ++i) { 157 | // parse number 158 | uint32_t num; 159 | uint32_t curr1B = read_num_LE(fp, 1); 160 | 161 | if ((curr1B & 128) == 128) { 162 | // msb = 1 --> right 7 bits used for number 163 | num = curr1B & 127; 164 | } else { 165 | // msb = 0 --> read 3 more bytes 166 | uint32_t secByte = read_num_LE(fp, 1); 167 | uint32_t thirdByte = read_num_LE(fp, 1); 168 | uint32_t fourthByte = read_num_LE(fp, 1); 169 | num = (curr1B << 24) | (secByte << 16) | (thirdByte << 8) | fourthByte; 170 | } 171 | 172 | // decode delta if it's not the first number 173 | if (i == 0) { 174 | pagenr_list.push_back(num); 175 | prev = num; 176 | } else { 177 | num = prev + num + 1; 178 | pagenr_list.push_back(num); 179 | prev = num; 180 | } 181 | } 182 | 183 | return pagenr_list; 184 | } 185 | 186 | 187 | static void parse_interval(std::istream *fp, bool *last, uint32_t *left, 188 | uint32_t *right) { 189 | /* interval [l, r] 190 | * format: t[1b], d[2b], l[29b], r-l[{0,1,2,4}B] 191 | * t = termination bit 192 | * d = size of r -l 193 | * l = page number after left side of interval 194 | * r = offset to the right side 195 | */ 196 | 197 | // parse t, d and l 198 | uint32_t left4B = read_num_LE(fp, 4); 199 | uint32_t upper3b = (left4B & (7 << 29)) >> 29; 200 | uint32_t size = upper3b & 3; 201 | *last = (upper3b >> 2) == 1; 202 | *left = left4B & ((1 << 29) - 1); 203 | 204 | // d = 0b11 encodes 4 205 | if (size == 3) 
{ 206 | size = 4; 207 | } 208 | 209 | // compute r 210 | if (size == 0) { 211 | *right = *left; 212 | } else if (size == 1 || size == 2 || size == 4) { 213 | uint32_t delta = read_num_LE(fp, size); 214 | *right = *left + delta; 215 | } else { 216 | BOOST_LOG_TRIVIAL(error) << "invalid interval size: " << size; 217 | } 218 | } 219 | 220 | 221 | static std::vector> parse_interval_list( 222 | std::istream *fp) { 223 | // interval list = list of (left, right) pairs 224 | std::vector> intervals; 225 | 226 | // parse until we see a set termination bit 227 | bool last = false; 228 | while (!last) { 229 | uint32_t left, right; 230 | parse_interval(fp, &last, &left, &right); 231 | std::pair iv = std::make_pair(left, right); 232 | intervals.push_back(iv); 233 | } 234 | return intervals; 235 | } 236 | 237 | 238 | static std::pair decode_patch(std::istream *fp) { 239 | // parse first two bytes 240 | uint32_t firstB = read_num_LE(fp, 1); 241 | uint32_t secB = read_num_LE(fp, 1); 242 | 243 | // decode size + offset 244 | if ((firstB & 128) == 128) { 245 | // first bit is set --> encoded in 3B 246 | uint32_t thirdB = read_num_LE(fp, 1); 247 | 248 | // decode patch 249 | firstB &= 127; 250 | uint32_t rebuildB = (firstB << 16) | (secB << 8) | thirdB; 251 | uint32_t size = 1 + ((rebuildB & 0xFFF000) >> 12); 252 | uint32_t offset = rebuildB & 0xFFF; 253 | 254 | return std::make_pair(size, offset); 255 | } else { 256 | // msb is not set --> simply take the two bytes 257 | firstB += 1; // we encoded length - 1 258 | 259 | return std::make_pair(firstB, secB); 260 | } 261 | } 262 | 263 | 264 | static std::vector> parse_diff( 265 | std::istream *fp) { 266 | // diff = list of patches 267 | std::vector> diff; 268 | 269 | // parse number of patches 270 | uint32_t patch_count = read_num_LE(fp, 2); 271 | 272 | // parse patch by patch 273 | for (uint32_t i = 0; i < patch_count; i++) { 274 | std::pair size_offset = decode_patch(fp); 275 | 276 | // a patch should never be larger than 2KiB 277 | 
assert(size_offset.first <= 2048); 278 | char readbuf[2048]; 279 | 280 | // read complete patch and put it into the diff 281 | fp->read(readbuf, size_offset.first); 282 | std::string patchbytes(readbuf, readbuf + size_offset.first); 283 | diff.emplace_back(size_offset.second, patchbytes); 284 | } 285 | return diff; 286 | } 287 | 288 | 289 | static std::string apply_diff(const std::string &refpage, 290 | const std::vector> &diff) { 291 | // will hold the final page after the patches have been applied 292 | std::string rebuild_page = refpage; 293 | 294 | // apply patch by patch 295 | uint32_t offset = 0; 296 | for (auto patch : diff) { 297 | // "seek" to the right offset 298 | offset += patch.first; 299 | 300 | // apply the patch by replacing bytes accordingly 301 | std::string patchbytes = patch.second; 302 | for (uint32_t i = 0; i < patchbytes.size(); i++) { 303 | rebuild_page[offset + i] = patchbytes[i]; 304 | } 305 | 306 | // continue "seeking" 307 | offset += patchbytes.size(); 308 | } 309 | return rebuild_page; 310 | } 311 | 312 | 313 | void interdedup_decompress(request_handler *handler, const char *filename_in, 314 | const char *out_filename) { 315 | 316 | // open compressed file for reading 317 | std::ifstream f_compressed(filename_in, std::ios::binary); 318 | if (f_compressed.fail()) { 319 | BOOST_LOG_TRIVIAL(error) << "error opening dumpfile " << filename_in; 320 | return; 321 | } 322 | 323 | // set pagesize + uncompressed_size 324 | uint32_t pagesize; 325 | std::string method; 326 | uint64_t uncompressed_size; 327 | int ret = read_header(&f_compressed, &method, &pagesize, &uncompressed_size); 328 | if (ret) { 329 | BOOST_LOG_TRIVIAL(error) << "error in header parsing"; 330 | return; 331 | } 332 | 333 | // set inner + intra + diffing 334 | compression compression_used; 335 | bool intra_used; 336 | bool diffing; 337 | ret = parse_method(method, &compression_used, &intra_used, &diffing); 338 | if (ret) { 339 | BOOST_LOG_TRIVIAL(error) << "error in method 
parsing"; 340 | return; 341 | } 342 | 343 | // extract body 344 | std::noskipws(f_compressed); // prevents iterator from skipping whitespace 345 | std::string compressed_body(std::istream_iterator(f_compressed), {}); 346 | 347 | // decompress body 348 | std::string file_body; 349 | int res = decompress_file(compressed_body, compression_used, &file_body); 350 | if (res != 0 || file_body.empty()) { 351 | BOOST_LOG_TRIVIAL(error) << "inner decompression failure"; 352 | return; 353 | } 354 | // convert body to stream so we can easily operate on it 355 | std::stringstream f_body(file_body); 356 | 357 | BOOST_LOG_TRIVIAL(debug) << "got uncompressed file body"; 358 | 359 | std::string ref_dump_path = read_string(&f_body); 360 | if (ref_dump_path.empty()) { 361 | BOOST_LOG_TRIVIAL(error) << "invalid reference dump path in header"; 362 | return; 363 | } 364 | BOOST_LOG_TRIVIAL(debug) << "reference dump: " << ref_dump_path; 365 | 366 | // parse reference pagenrs + intervals 367 | std::unordered_map fills; 368 | std::vector ref_pagenrs = parse_pagenr_list(&f_body); 369 | for (uint32_t ref_pagenr : ref_pagenrs) { 370 | auto iv_list = parse_interval_list(&f_body); 371 | for (std::pair interval : iv_list) { 372 | uint32_t left = interval.first; 373 | uint32_t right = interval.second; 374 | // sanity check 375 | if (left > right) { 376 | BOOST_LOG_TRIVIAL(error) << "invalid interval"; 377 | return; 378 | } 379 | 380 | // map deduplicated page numbers to page numbers in reference dump 381 | for (uint32_t pagenr = left; pagenr <= right; ++pagenr) { 382 | fills[pagenr] = ref_pagenr; 383 | } 384 | } 385 | } 386 | 387 | // parse diffs 388 | std::unordered_map>> 389 | diffs; 390 | if (diffing) { 391 | std::vector diffpages = parse_pagenr_list(&f_body); 392 | for (uint32_t pagenum : diffpages) { 393 | diffs[pagenum] = parse_diff(&f_body); 394 | } 395 | } 396 | 397 | // parse new pages 398 | std::unordered_map newpages; 399 | std::set newdistinct; 400 | if (!intra_used) { 401 | auto 
iv_newpages = parse_interval_list(&f_body); 402 | auto page_content = new char[pagesize]; 403 | 404 | // parse intervals and pages accordingly 405 | for (std::pair interval : iv_newpages) { 406 | uint32_t left = interval.first; 407 | uint32_t right = interval.second; 408 | for (uint32_t pagenr = left; pagenr <= right; ++pagenr) { 409 | f_body.read(page_content, pagesize); 410 | 411 | // explicit cast because of null-byte issues 412 | std::string page_content_str(page_content, page_content + pagesize); 413 | 414 | newpages[pagenr] = page_content_str; 415 | newdistinct.insert(page_content_str); 416 | } 417 | } 418 | delete[](page_content); 419 | } else { 420 | // number of intradeduplicate new pages 421 | uint32_t page_count = read_num_LE(&f_body, 4); 422 | 423 | // parse intervals (one for each page) 424 | std::vector>> intervals; 425 | intervals.reserve(page_count); 426 | for (uint32_t i = 0; i < page_count; i++) { 427 | auto iv = parse_interval_list(&f_body); 428 | intervals.push_back(std::move(iv)); 429 | } 430 | 431 | // unfold intradeduplication 432 | auto page_content = new char[pagesize]; 433 | for (uint32_t i = 0; i < page_count; i++) { 434 | f_body.read(page_content, pagesize); 435 | std::string page(page_content, page_content + pagesize); 436 | for (auto iv : (intervals[i])) { 437 | uint32_t left = iv.first; 438 | uint32_t right = iv.second; 439 | for (uint32_t pnum = left; pnum <= right; pnum++) { 440 | newpages[pnum] = page; 441 | } 442 | } 443 | } 444 | delete[](page_content); 445 | } 446 | 447 | // load reference dump 448 | BOOST_LOG_TRIVIAL(debug) << "loading refdump"; 449 | std::shared_ptr refdump; 450 | if (handler != nullptr) { 451 | refdump = handler->get_refdump(ref_dump_path, pagesize); 452 | } else { 453 | refdump = std::make_shared(ref_dump_path); 454 | refdump->readDumpfile(pagesize); 455 | } 456 | 457 | // parse reference dump (if not done already) 458 | auto ref_pages = refdump->getNumToPage(); 459 | 460 | // open final file (append 
.processing because we are not finished yet) 461 | std::string out_filename_processing(out_filename); 462 | out_filename_processing += ".processing"; 463 | auto f_out = std::ofstream(out_filename_processing.c_str(), std::ios::binary); 464 | 465 | // reconstruct page by page 466 | for (uint32_t pagenr = 0; pagenr < uncompressed_size / pagesize; pagenr++) { 467 | if (fills.count(pagenr) > 0) { 468 | // we got a deduplicated page with a different page number 469 | uint32_t refnum = fills[pagenr]; 470 | std::string page_content = (*ref_pages)[refnum]; 471 | f_out.write(page_content.c_str(), pagesize); 472 | } else if (diffing && (diffs.count(pagenr) > 0)) { 473 | // we got a diffed page 474 | auto page_content = apply_diff((*ref_pages)[pagenr], diffs[pagenr]); 475 | if (page_content.empty()) { 476 | BOOST_LOG_TRIVIAL(error) << "aborting due to error when applying diffs"; 477 | return; 478 | } 479 | f_out.write(page_content.c_str(), pagesize); 480 | } else if (newpages.count(pagenr) > 0) { 481 | // we got a completely new page 482 | std::string page_content = newpages[pagenr]; 483 | f_out.write(page_content.c_str(), pagesize); 484 | } else { 485 | // we got a deduplicated page at the same page number 486 | std::string page_content = (*ref_pages)[pagenr]; 487 | f_out.write(page_content.c_str(), pagesize); 488 | } 489 | } 490 | 491 | // flush stream before renaming file 492 | f_out.close(); 493 | 494 | // remove the file if it already exists. 
This is faster than overwriting 495 | std::remove(out_filename); 496 | 497 | // remove temporary file 498 | std::rename(out_filename_processing.c_str(), out_filename); 499 | BOOST_LOG_TRIVIAL(info) << "decompressed file was saved as " << out_filename; 500 | } 501 | 502 | } // namespace mscr 503 | -------------------------------------------------------------------------------- /memscrimper_cpp_implementation/src/interdedup_compress.cpp: -------------------------------------------------------------------------------- 1 | // Copyright [2019] 2 | 3 | #include "interdedup_compress.h" 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include "memdump.h" 25 | #include "utils.h" 26 | 27 | 28 | namespace mscr { 29 | 30 | static void create_pagenr_list(const std::set &nums, 31 | std::string *s_out) { 32 | // write number of pages 33 | auto sz = static_cast(nums.size()); 34 | std::vector len = int_to_byte_LE(sz, 4); 35 | s_out->append(&len[0], 4); 36 | 37 | // write pagenr by pagenr 38 | bool first = true; 39 | uint32_t prev = 0; 40 | uint32_t curr; 41 | for (const uint32_t &page_num : nums) { 42 | // delta-encode page number 43 | if (first) { 44 | curr = page_num; 45 | first = false; 46 | } else { 47 | curr = page_num - prev - 1; 48 | } 49 | 50 | // write page number 51 | if (curr < 128) { 52 | // we fit in one byte -> we set the highest bit to 1 53 | std::vector page_numb = int_to_byte_BE(curr | 128, 1); 54 | s_out->append(&page_numb[0], 1); 55 | } else { 56 | // we need 4 bytes to store the pagenum 57 | std::vector page_numb = int_to_byte_BE(curr, 4); 58 | s_out->append(&page_numb[0], 4); 59 | } 60 | prev = page_num; 61 | } 62 | } 63 | 64 | 65 | static std::vector> intervalize( 66 | const std::set &numbers) { 67 | // check if empty 68 | std::vector> result; 69 | if 
(numbers.empty()) { 70 | return result; 71 | } 72 | 73 | // holds the current interval 74 | std::pair curr = std::make_pair(*(numbers.begin()), 75 | *(numbers.begin())); 76 | 77 | // build the intervals starting from the second element on 78 | for (auto it = ++std::begin(numbers); it != numbers.end(); it++) { 79 | uint32_t x = *it; 80 | if (curr.second + 1 == x) { 81 | curr.second = x; 82 | } else { 83 | result.push_back(curr); 84 | curr.first = x; 85 | curr.second = x; 86 | } 87 | } 88 | result.push_back(curr); 89 | 90 | return result; 91 | } 92 | 93 | 94 | static std::string create_interval(uint32_t left, uint32_t right, bool islast) { 95 | /* interval [l, r] 96 | * format: t[1b], d[2b], l[29b], r-l[{0,1,2,4}B] 97 | * t = termination bit 98 | * d = size of r -l 99 | * l = page number after left side of interval 100 | * r = offset to the right side */ 101 | 102 | // check bounds 103 | if (left >= (1 << 29)) { 104 | BOOST_LOG_TRIVIAL(error) << "left interval is too big"; 105 | return ""; 106 | } 107 | 108 | // determine termination bit 109 | uint32_t last; 110 | if (islast) { 111 | last = 4; 112 | } else { 113 | last = 0; 114 | } 115 | 116 | /* we do not append r-l when we encode only 1 page this will shift the 117 | * termination bit to the left and write the pagenum */ 118 | uint32_t bytelen = 0; 119 | uint32_t data = 0; 120 | if (left == right) { 121 | data = ((last << 29) | left); 122 | std::vector ch_vec = int_to_byte_LE(data, 4); 123 | return std::string(ch_vec.begin(), ch_vec.end()); 124 | } 125 | 126 | // encode delta 127 | uint32_t delta = right - left; 128 | if (delta < (1 << 8)) { 129 | bytelen = 1; 130 | data = ((last | 1) << 29) | left; 131 | } else if (delta < (1 << 16)) { 132 | bytelen = 2; 133 | data = ((last | 2) << 29) | left; 134 | } else { 135 | bytelen = 4; 136 | data = ((last | 3) << 29) | left; 137 | } 138 | 139 | // encode everything 140 | std::vector ch_vec = int_to_byte_LE(data, 4); 141 | std::vector ch_vec_RF = int_to_byte_LE(delta, 
bytelen); 142 | ch_vec.insert(ch_vec.end(), ch_vec_RF.begin(), ch_vec_RF.end()); 143 | 144 | return std::string(ch_vec.begin(), ch_vec.end()); 145 | } 146 | 147 | 148 | static std::vector create_interval_list( 149 | const std::vector> &intervals) { 150 | std::vector result; 151 | uint32_t len = intervals.size(); 152 | result.reserve(len); 153 | 154 | for (uint32_t i = 0; i < len; ++i) { 155 | auto interval = intervals[i]; 156 | uint32_t left = interval.first; 157 | uint32_t right = interval.second; 158 | result.push_back(create_interval(left, right, i + 1 == len)); 159 | } 160 | 161 | return result; 162 | } 163 | 164 | 165 | static std::string create_method(bool intra, bool diffing, 166 | const compression &inner) { 167 | std::string method = "interdedup"; 168 | 169 | // encode intra 170 | if (!intra) { 171 | method.append("nointra"); 172 | } 173 | 174 | // encode diffing 175 | if (diffing) { 176 | method.append("delta"); 177 | } 178 | 179 | // encode inner compression 180 | switch (inner) { 181 | case compression::GZIP: 182 | method.append("gzip"); 183 | break; 184 | case compression::ZIP7: 185 | method.append("7zip"); 186 | break; 187 | case compression::BZIP2: 188 | method.append("bzip2"); 189 | break; 190 | case compression::NOINNER: 191 | break; 192 | } 193 | 194 | return method; 195 | } 196 | 197 | 198 | static std::string create_header(const std::string &method, 199 | uint64_t uncompressed_size, uint32_t majorversion, uint32_t minorversion, 200 | const std::string &magicnum, uint32_t pagesize) { 201 | std::string head; 202 | 203 | /* encode magic number + method (note: pushing instead of appending because 204 | * of null-bytes) */ 205 | head += magicnum; 206 | head.push_back('\0'); // += operator will only copy till it hits nullbyte 207 | head += method; 208 | head.push_back('\0'); 209 | 210 | // encode major version 211 | for (const char c : int_to_byte_LE(majorversion, 2)) { 212 | head.push_back(c); 213 | } 214 | 215 | // encode minor version 216 | for 
(const char c : int_to_byte_LE(minorversion, 2)) { 217 | head.push_back(c); 218 | } 219 | 220 | // encode page size 221 | for (const char c : int_to_byte_LE(pagesize, 4)) { 222 | head.push_back(c); 223 | } 224 | 225 | // encode uncompressed size 226 | for (const char c : int_to_byte_LE(uncompressed_size, 8)) { 227 | head.push_back(c); 228 | } 229 | 230 | return head; 231 | } 232 | 233 | 234 | static int compress_file(const std::string &file_in, const compression &inner, 235 | std::string *file_out) { 236 | /* returns the compressed body from the 237 | * chosen compression */ 238 | BOOST_LOG_TRIVIAL(debug) << "starting inner compression"; 239 | boost::iostreams::filtering_ostream out; 240 | boost::iostreams::lzma_params lzma_par; 241 | switch (inner) { 242 | case compression::ZIP7: 243 | lzma_par.level = boost::iostreams::lzma::default_compression; 244 | out.push(boost::iostreams::lzma_compressor(lzma_par)); 245 | break; 246 | case compression::BZIP2: 247 | out.push(boost::iostreams::bzip2_compressor()); 248 | break; 249 | case compression::GZIP: 250 | out.push(boost::iostreams::gzip_compressor()); 251 | break; 252 | case compression::NOINNER: 253 | // nothing to do here 254 | *file_out = file_in; 255 | BOOST_LOG_TRIVIAL(debug) << "finished inner compression"; 256 | return 0; 257 | } 258 | std::stringstream writebuf; 259 | out.push(writebuf); 260 | std::stringstream input_stream(file_in); 261 | boost::iostreams::copy(input_stream, out); 262 | *file_out = writebuf.str(); 263 | BOOST_LOG_TRIVIAL(debug) << "finished inner compression"; 264 | return 0; 265 | } 266 | 267 | 268 | static int generate_patches(const std::string &ref_page, 269 | const std::string &delta_page, 270 | std::vector> *out) { 271 | /* note that both pages have to be the same size in order for this to make 272 | * sense */ 273 | uint32_t pagesize = ref_page.size(); 274 | 275 | // keeps track of the previous indices 276 | uint32_t previ = 0; 277 | bool first = true; 278 | 279 | // keeps track of 
streaks of identical bytes 280 | std::string samebytes; 281 | 282 | // encodes a list of patches, i.e., a list of (offset, bytes) pairs 283 | std::vector> ret; 284 | 285 | for (uint32_t i = 0; i < pagesize; i++) { 286 | // compare byte by byte 287 | char ref_byte = ref_page[i]; 288 | char delta_byte = delta_page[i]; 289 | 290 | if (ref_byte == delta_byte) { 291 | // keep track of streaks of identical bytes 292 | samebytes += delta_byte; 293 | } else { 294 | // build the current patch 295 | std::pair curr_patch; 296 | 297 | /* if the streak of identical bytes is not larger than 2, we would need 298 | * two patches whose overhead would increase 2 bytes - hence, we just 299 | * include the identical bytes in a single patch */ 300 | if (samebytes.size() <= 2 && !first) { 301 | ret.back().second += samebytes; 302 | ret.back().second += delta_byte; 303 | } else { 304 | if (first) { 305 | first = false; 306 | curr_patch.first = i; 307 | curr_patch.second.clear(); 308 | } else { 309 | curr_patch.first = i - previ - ret.back().second.size(); 310 | curr_patch.second.clear(); 311 | } 312 | previ = i; 313 | curr_patch.second += (delta_byte); 314 | ret.push_back(curr_patch); 315 | } 316 | samebytes.clear(); 317 | } 318 | } 319 | 320 | /* make sure individual patches are smaller than 2048 bytes to comply with 321 | * our file format specification, which is achieved by partitioning longer 322 | * patches into chunks of 2048 bytes */ 323 | for (auto patch : ret) { 324 | uint32_t offset = patch.first; 325 | std::string bytes = patch.second; 326 | uint32_t bytelen = patch.second.size(); 327 | if (bytelen > 2048) { 328 | std::string first_data = bytes.substr(0, 2048); 329 | std::string second_data = bytes.substr(2048, 2048); 330 | out->emplace_back(offset, first_data); 331 | out->emplace_back(0, second_data); 332 | } else { 333 | out->emplace_back(offset, patch.second); 334 | } 335 | } 336 | 337 | return 0; 338 | } 339 | 340 | 341 | static std::string patch_encode(uint32_t offset, 
uint32_t len) { 342 | // will hold the encoding of l - 1 and o 343 | std::vector ret_vec; 344 | 345 | // we encode l - 1 346 | len -= 1; 347 | 348 | if (offset < 256 && len < 128) { 349 | /* encode o and l with two bytes such that the msb of the first byte is set 350 | * to 0 */ 351 | std::vector len_vec = int_to_byte_BE(len, 1); 352 | ret_vec.insert(ret_vec.end(), len_vec.begin(), len_vec.end()); 353 | std::vector off_vec = int_to_byte_BE(offset, 1); 354 | ret_vec.insert(ret_vec.end(), off_vec.begin(), off_vec.end()); 355 | } else { 356 | /* encode o and l with three bytes such that the msb of the first byte is 357 | * set to 1 */ 358 | uint32_t len_off = (len << 12) | offset; 359 | uint32_t a = (len_off & 0xFF0000) >> 16; 360 | a |= 128; 361 | std::vector a_vec = int_to_byte_BE(a, 1); 362 | ret_vec.insert(ret_vec.end(), a_vec.begin(), a_vec.end()); 363 | std::vector last2 = int_to_byte_BE(len_off & 0xFFFF, 2); 364 | ret_vec.insert(ret_vec.end(), last2.begin(), last2.end()); 365 | } 366 | 367 | return std::string(ret_vec.begin(), ret_vec.end()); 368 | } 369 | 370 | 371 | static std::string create_diff(const std::string &ref_page, 372 | const std::string &delta_page) { 373 | // generate patches 374 | std::vector> patches; 375 | generate_patches(ref_page, delta_page, &patches); 376 | 377 | // create diff patch by patch 378 | std::string diff; 379 | uint32_t patchnum = 0; 380 | for (auto patch : patches) { 381 | patchnum++; 382 | uint32_t offset = patch.first; 383 | std::string bytes = patch.second; 384 | diff += patch_encode(offset, bytes.size()); 385 | diff += bytes; 386 | } 387 | 388 | // build final diff 389 | std::vector num_bin = int_to_byte_LE(patchnum, 2); 390 | std::string res = std::string(num_bin.begin(), num_bin.end()); 391 | res += diff; 392 | 393 | return res; 394 | } 395 | 396 | 397 | void interdedup_compress(std::shared_ptr ref, const memdump &srcdump, 398 | const char *out_filename, compression inner, bool diffing, bool intra) { 399 | // keep track 
of pages of both the reference and the source dump 400 | auto src_pages = srcdump.getPages(); 401 | auto ref_pages = ref->getPages(); 402 | uint32_t pagesize = ref_pages->begin()->first.size(); 403 | 404 | // print some debug output 405 | if (diffing) { 406 | BOOST_LOG_TRIVIAL(debug) << "DIFFING enabled"; 407 | } 408 | if (intra) { 409 | BOOST_LOG_TRIVIAL(debug) << "INTRA enabled"; 410 | } 411 | 412 | // maps each diffable page num to a diff 413 | std::unordered_map diffs; 414 | 415 | // collects all new (undiffable) page numbers we cannot deduplicate 416 | std::set new_pagenrs; 417 | 418 | // maps page numbers of new/unique pages to their content 419 | std::unordered_map new_pages; 420 | 421 | // maps refnumbs to page numbers that will be deduped by it 422 | std::unordered_map> dedups; 423 | 424 | // holds all page numbers of deduplicated pages 425 | std::set dedup_pagenrs; 426 | 427 | // maps page numbers to pages of the reference memory dump 428 | auto ref_num_to_page = ref->getNumToPage(); 429 | 430 | // holds page numbers of diffable pages 431 | std::set diff_pagenrs; 432 | 433 | /* maps each page to the page numbers where it occurs (will be used for 434 | * intradeduplicating the new pages */ 435 | std::unordered_map> same_newpages; 436 | 437 | // go through each page of the source dump 438 | for (const auto &key_value : *src_pages) { 439 | std::string srcpage = key_value.first; 440 | 441 | if (ref_pages->find(srcpage) != ref_pages->end()) { 442 | /* this page occurs in the reference dump 443 | * get all the page numbers where the page occurrs in the source dump, 444 | * but not in the reference dump */ 445 | std::set dedup_pages; 446 | std::set_difference((*src_pages).at(srcpage).begin(), 447 | (*src_pages).at(srcpage).end(), 448 | (*ref_pages).at(srcpage).begin(), 449 | (*ref_pages).at(srcpage).end(), 450 | std::inserter(dedup_pages, dedup_pages.begin())); 451 | 452 | // if thare are >0 such page numbers, then add them to our dedups 453 | if 
(!dedup_pages.empty()) { 454 | uint32_t pagenr = *((*ref_pages).at(srcpage).begin()); 455 | dedups[pagenr] = dedup_pages; 456 | dedup_pagenrs.insert(pagenr); 457 | } 458 | } else { 459 | /* the page is not in the reference dump, but maybe a similar page sits 460 | * somewhere else */ 461 | for (const uint32_t &pagenum : (*src_pages).at(srcpage)) { 462 | // go through all page numbers of the page 463 | if (diffing) { 464 | // diff the page with its counterpart in the reference dump 465 | std::string diff = create_diff((*ref_num_to_page)[pagenum], srcpage); 466 | 467 | /* remmber the diff if if storing it is cheaper than storing the page 468 | * itself */ 469 | if (diff.size() < pagesize) { 470 | diffs[pagenum] = diff; 471 | diff_pagenrs.insert(pagenum); 472 | continue; 473 | } 474 | } 475 | 476 | /* if requested, keep track of intradeduplicate occurrences if diffing 477 | * was not successful, otherwise just consider it a new unique page */ 478 | if (intra) { 479 | (same_newpages[srcpage]).insert(pagenum); 480 | } else { 481 | new_pagenrs.insert(pagenum); 482 | new_pages[pagenum] = srcpage; 483 | } 484 | } 485 | } 486 | } 487 | 488 | std::string tmpf; 489 | // write reference dump path 490 | tmpf.append(ref->getPath().c_str(), ref->getPath().size()); 491 | tmpf.append("\0", 1); 492 | 493 | // write page number list containing all deduplicatable page numbers 494 | create_pagenr_list(dedup_pagenrs, &tmpf); 495 | 496 | // write interval lists for each deduplicated page 497 | for (const auto &dedup_pnum : dedup_pagenrs) { 498 | auto dedupset = dedups[dedup_pnum]; 499 | auto iv_list = create_interval_list(intervalize(dedupset)); 500 | for (const std::string &iv : iv_list) { 501 | tmpf.append(iv.c_str(), iv.size()); 502 | } 503 | } 504 | 505 | // write diffs if requested 506 | if (diffing) { 507 | create_pagenr_list(diff_pagenrs, &tmpf); 508 | for (auto pagenum : diff_pagenrs) { 509 | std::string diff = diffs[pagenum]; 510 | tmpf.append(diff.c_str(), diff.size()); 511 | } 
512 | } 513 | 514 | BOOST_LOG_TRIVIAL(debug) << "wrote diffs + interval-lists to file"; 515 | if (intra) { 516 | // intervalize new pages 517 | std::vector intrapages; 518 | intrapages.reserve(same_newpages.size()); 519 | std::unordered_map>> new_pagenrs_iv; 521 | for (auto key_value : same_newpages) { 522 | std::string page = key_value.first; 523 | intrapages.push_back(page); 524 | new_pagenrs_iv[page] = (intervalize(same_newpages[page])); 525 | } 526 | 527 | // write number of distinct new pages 528 | std::vector num_vec = int_to_byte_LE(new_pagenrs_iv.size(), 4); 529 | tmpf.append(&num_vec[0], num_vec.size()); 530 | 531 | // write intervals of page numbers 532 | for (const std::string &page : intrapages) { 533 | auto ivlist = create_interval_list(new_pagenrs_iv[page]); 534 | for (auto iv : ivlist) { 535 | tmpf.append(&iv[0], iv.size()); 536 | } 537 | } 538 | 539 | // write the actual pages covering all intervals 540 | for (const std::string &page : intrapages) { 541 | tmpf.append(page.c_str(), page.size()); 542 | } 543 | } else { 544 | // write intervalized new page numbers 545 | auto new_pagenrs_iv = intervalize(new_pagenrs); 546 | for (const std::string &iv : create_interval_list(new_pagenrs_iv)) { 547 | tmpf.append(iv.c_str(), iv.size()); 548 | } 549 | 550 | // write actual new unique pages 551 | for (const uint32_t &pagenr : new_pagenrs) { 552 | std::string page = new_pages[pagenr]; 553 | tmpf.append(page.c_str(), page.size()); 554 | } 555 | } 556 | 557 | // apply inner compression 558 | std::string tmpfile2; 559 | int ret = compress_file(tmpf, inner, &tmpfile2); 560 | if (ret != 0) { 561 | BOOST_LOG_TRIVIAL(error) << "inner compression failed"; 562 | return; 563 | } 564 | 565 | // write header 566 | std::string method = create_method(intra, diffing, inner); 567 | uint64_t filesize = get_filesize(srcdump.getPath().c_str()); 568 | if (filesize <= 0) { 569 | BOOST_LOG_TRIVIAL(error) << "filesize of " << srcdump.getPath() 570 | << " is invalid."; 571 | return; 
572 | } 573 | BOOST_LOG_TRIVIAL(debug) << "Original filesize: " << filesize; 574 | const uint32_t major_version = 2; 575 | const uint32_t minor_version = 1; 576 | const std::string magicbyte = "MBCR"; 577 | std::string header = create_header(method, filesize, major_version, 578 | minor_version, magicbyte, pagesize); 579 | std::string out_filename_processing(out_filename); 580 | out_filename_processing += ".processing"; 581 | std::ofstream final_file = std::ofstream(out_filename_processing.c_str(), 582 | std::ios::binary); 583 | final_file.write(header.c_str(), header.size()); 584 | 585 | // write body 586 | final_file << tmpfile2; 587 | // flush stream before renaming file 588 | final_file.close(); 589 | 590 | // remove the file if it already exists. This is faster than overwriting 591 | std::remove(out_filename); 592 | 593 | // remove the ".processing"-ending 594 | std::rename(out_filename_processing.c_str(), out_filename); 595 | BOOST_LOG_TRIVIAL(info) << "finished compressing file to " << out_filename; 596 | } 597 | 598 | } // namespace mscr 599 | --------------------------------------------------------------------------------