├── README.md ├── lib ├── server.rb ├── transmitter.rb └── updater.rb ├── rdfs.rb └── rdfsctl.rb /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Ruby Distributed File Sync (RDFS) 3 | 4 | Copyright (C) 2018 Sourcerer, All Rights Reserved 5 | Written by Robert W. Oliver II - 6 | 7 | ## OVERVIEW 8 | 9 | RDFS monitors for changes within a folder. Once these are detected, the files are SHA256 hashed and that hash, along with the last-modified time, is stored in an SQLite3 database. Upon changes, these hashes are updated. 10 | 11 | Other machines running RDFS can connect to one another and receive these updates, therefore keeping multiple directories across different machines in sync. 12 | 13 | Since the SHA256 hash is calculated, the system avoids saving the same block of data twice. This provides a basic data de-duplication scheme. 14 | 15 | While RDFS is functional, it is not an ideal construction of a high-performance, production-ready distributed file system. Its primary focus is to demonstrate the concepts involved in such a system and serve as a teaching tool for these techniques. 16 | 17 | ## INSTALL 18 | 19 | To install requirements on a Debian based system, run: 20 | apt install ruby-sqlite3 ruby-daemons 21 | 22 | ## USE 23 | 24 | ruby rdfsctl.rb start 25 | 26 | ## LICENSE 27 | 28 | This software is licensed under the GPLv3 or later. 
# (README.md, continued)
#
# ## BUGS
#
# There are several known bugs in this release:
#
# * Adding more than 2 nodes may produce unpredictable results
# * Compression for transfer was disabled due to Zlib issues
# * If database is out of sync with filesystem, unpredictable results will occur
#
# --------------------------------------------------------------------------------
# /lib/server.rb:
# --------------------------------------------------------------------------------

# The Files servlet below uses FileUtils; require it here instead of relying on
# some other library having loaded it first.
require 'fileutils'

module RDFS

  # Boots the embedded WEBrick HTTP server and mounts the two API servlets
  # (Nodes on "/nodes", Files on "/files").
  #
  # Server.new does not return while the HTTP server is running, because
  # WEBrick::HTTPServer#start blocks; callers run it in its own thread.
  class Server < WEBrick::HTTPServlet::AbstractServlet

    attr_accessor :webrick
    attr_accessor :logger

    def initialize
      # Setup logging inside the server.
      # (The level must be *assigned*; the previous `==` only compared it.)
      @logger = Logger.new(STDOUT)
      @logger.level = RDFS_DEBUG ? Logger::DEBUG : Logger::WARN

      @webrick = WEBrick::HTTPServer.new :Port => RDFS_PORT
      @webrick.mount "/nodes", Nodes
      @webrick.mount "/files", Files
      @webrick.start
    end

  end

  # Servlet behind "/files". A POST carries an 'api_call' parameter:
  #
  #   add        - store the Base64 payload in 'content' as 'filename'
  #   add_dup    - create 'filename' by copying a local file whose hash is 'sha256sum'
  #   delete     - remove 'filename' (file or empty directory)
  #   add_query  - report whether any known file has hash 'sha256sum'
  #
  # Response bodies are plain text: "OK", "EXISTS", "NOT_FOUND" or "BAD_REQUEST".
  class Files < WEBrick::HTTPServlet::AbstractServlet

    attr_accessor :logger

    # Process a POST request
    def do_POST(request, response)
      status, content_type, body = api_handler(request)
      response.status = status
      response['Content-Type'] = content_type
      response.body = body
    end

    private

    # Dispatches on 'api_call' and returns [status, content_type, body].
    # Unknown calls answer "OK", as before. Rejected input answers 400;
    # everything else answers 200.
    def api_handler(request)
      params = request.query

      response_text =
        case params['api_call']
        when "add"       then add_file(params)
        when "add_dup"   then add_duplicate(params)
        when "delete"    then delete_file(params)
        when "add_query" then query_hash(params)
        else "OK"
        end

      status = response_text == "BAD_REQUEST" ? 400 : 200
      return status, "text/plain", response_text
    end

    # Turns a client-supplied filename into a clean path relative to RDFS_PATH.
    # Returns nil when the name is empty, malformed, or would resolve outside
    # RDFS_PATH (e.g. "../../etc/passwd"). The name comes straight from an HTTP
    # request, so it must not be trusted.
    def normalize_name(filename)
      raw = filename.to_s
      return nil if raw.empty?

      root = File.expand_path(RDFS_PATH)
      prefix = root + "/"
      # The "./" keeps a leading "~" or "/" from being treated specially.
      absolute = File.expand_path("./" + raw, root)
      return nil unless absolute.start_with?(prefix)

      absolute[prefix.length..-1]
    rescue ArgumentError
      # e.g. an embedded NUL byte
      nil
    end

    # Inserts or refreshes the DB row for +name+ with updated and deleted set
    # to 0, so that the local transmitter won't try to send the file back out.
    # Updating an existing row (instead of always inserting) avoids duplicate
    # rows when the same file is pushed more than once.
    def record_file(name, sha256sum)
      # Store the file's real mtime so the updater does not see it as changed.
      mtime = File.mtime(File.join(RDFS_PATH, name)).to_i
      known = RDFS_DB.execute("SELECT 1 FROM files WHERE name = ? LIMIT 1", [name])
      if known.empty?
        RDFS_DB.execute("INSERT INTO files (name, sha256, last_modified, updated, deleted) VALUES (?, ?, ?, 0, 0)",
                        [name, sha256sum, mtime])
      else
        RDFS_DB.execute("UPDATE files SET sha256 = ?, last_modified = ?, updated = 0, deleted = 0 WHERE name = ?",
                        [sha256sum, mtime, name])
      end
    end

    # "add": decode the payload, write it to disk and register it.
    def add_file(params)
      name = normalize_name(params['filename'])
      content = params['content']
      return "BAD_REQUEST" if name.nil? || content.nil?

      target = File.join(RDFS_PATH, name)

      # Does the path exist? If not, create it.
      FileUtils.mkdir_p(File.dirname(target))

      # Decode, then save the file (binary-safe).
      # We could use compression, but for now this will work.
      File.binwrite(target, Base64.decode64(content))

      record_file(name, Digest::SHA256.file(target).hexdigest)
      "OK"
    end

    # "add_dup": the sender believes we already hold this content under
    # another name, so copy it locally instead of transferring it again.
    def add_duplicate(params)
      name = normalize_name(params['filename'])
      return "BAD_REQUEST" if name.nil?

      sha256sum = params['sha256sum'].to_s
      target = File.join(RDFS_PATH, name)

      # Find a file on disk that is registered with this hash.
      rows = RDFS_DB.execute("SELECT name FROM files WHERE sha256 = ?", [sha256sum])
      source = rows.map { |row| File.join(RDFS_PATH, row[0].to_s) }
                   .find { |path| File.file?(path) }

      # SHA256 not found (or file deleted after add_query but before add_dup).
      return "NOT_FOUND" unless source

      # FileUtils.cp refuses to copy a file onto itself.
      unless File.expand_path(source) == File.expand_path(target)
        FileUtils.mkdir_p(File.dirname(target))
        FileUtils.cp(source, target)
      end

      # Register the copy; otherwise the local updater would treat it as a new
      # local file and push it back to the sender.
      record_file(name, sha256sum)
      "OK"
    end

    # "delete": remove a file (or an empty directory) and forget it.
    def delete_file(params)
      name = normalize_name(params['filename'])
      return "BAD_REQUEST" if name.nil?

      target = File.join(RDFS_PATH, name)
      existed = File.exist?(target)

      if existed
        # Is it a directory? If so, handle it separately.
        if File.directory?(target)
          FileUtils.rmdir(target)
        else
          # Force deletion of a file.
          FileUtils.rm_f(target)
        end
      end

      # Drop the row as well; a leftover row would make the local updater flag
      # the file as "deleted" and echo the deletion back to the other nodes.
      RDFS_DB.execute("DELETE FROM files WHERE name = ?", [name])

      existed ? "OK" : "NOT_FOUND"
    end

    # "add_query": check if a file with this hash is already known.
    def query_hash(params)
      rows = RDFS_DB.execute("SELECT sha256 FROM files WHERE sha256 = ? LIMIT 1",
                             [params['sha256sum'].to_s])
      rows.empty? ? "NOT_FOUND" : "EXISTS"
    end

  end

  # Servlet behind "/nodes". A POST with api_call "add_node" or "delete_node"
  # registers or unregisters the *requesting* IP address in the nodes table.
  class Nodes < WEBrick::HTTPServlet::AbstractServlet

    attr_accessor :logger

    # Process a POST request
    def do_POST(request, response)
      status, content_type, body = api_handler(request)
      response.status = status
      response['Content-Type'] = content_type
      response.body = body
    end

    private

    # Returns [status, content_type, body]; the status is always 200.
    def api_handler(request)

      # We assume this by default, but can change it as the function progresses
      response_text = "OK"

      # Grab the IP of the requester
      ip = request.remote_ip

      case request.query['api_call']
      # Add a node
      when "add_node"
        known = RDFS_DB.execute("SELECT ip FROM nodes WHERE ip = ?", [ip])
        if known.empty?
          RDFS_DB.execute("INSERT INTO nodes (ip) VALUES (?)", [ip])
          response_text = "Node with IP " + ip + " added.\n"
        else
          response_text = "Node with IP " + ip + " was already registered.\n"
        end
      # Remove a node
      when "delete_node"
        RDFS_DB.execute("DELETE FROM nodes WHERE ip = ?", [ip])
        response_text = "Node with IP " + ip + " removed.\n"
      end

      return 200, "text/plain", response_text
    end

    # Create SHA256 of a file
    def sha256file(file)
      Digest::SHA256.file(file).hexdigest
    end

  end

end

# --------------------------------------------------------------------------------
# /lib/transmitter.rb:
# --------------------------------------------------------------------------------

module RDFS

  # Periodically pushes locally changed or deleted files to every registered node.
  class Transmitter

    attr_accessor :main_thread

    # Called upon Transmitter.new
    #
    # transmit_frequency - seconds to sleep between transmit passes
    # blocking           - when true (the default) the constructor waits for the
    #                      worker thread, i.e. it does not return until #stop is
    #                      called from elsewhere or the worker dies. This is what
    #                      callers observed before, when the loop accidentally ran
    #                      in the calling thread. Pass false to get the object
    #                      back immediately.
    def initialize(transmit_frequency, blocking: true)
      @transmit_frequency = transmit_frequency
      @running = true

      # Setup logging inside the transmitter
      @logger = Logger.new(STDOUT)
      @logger.level = RDFS_DEBUG ? Logger::DEBUG : Logger::WARN

      # Create the thread. The loop must be passed as a block: `Thread.new kernel`
      # evaluated `kernel` first and therefore never created a thread.
      @main_thread = Thread.new { kernel }
      @logger.debug("Transmitter thread started.")

      @main_thread.join if blocking
    end

    # Stop the transmitter (takes effect after the current pass and sleep)
    def stop
      @running = nil
    end

    private

    attr_writer :running
    attr_accessor :logger

    # Worker loop: transmit, then sleep, until #stop is called.
    def kernel
      while @running
        transmit
        Thread.pass
        sleep @transmit_frequency
      end
    end

    # Reads a binary file and returns its contents in a string.
    # File.binread closes the handle; the old File.open(...).read leaked it.
    def read_file(file)
      File.binread(file)
    end

    # Create SHA256 of a file
    def sha256file(file)
      Digest::SHA256.file(file).hexdigest
    end

    # One transmit pass: send every pending update/delete to every node.
    def transmit
      # First, check to see if there are any active nodes. If not, there's no
      # point in wasting DB time in checking for updated files.
      nodes = RDFS_DB.execute("SELECT * FROM nodes")
      return if nodes.empty?

      sql = "SELECT * FROM files WHERE updated != 0 OR deleted != 0"
      @logger.debug("transmitter: " + sql)
      pending = RDFS_DB.execute(sql)
      return if pending.empty?

      nodes.each do |node|
        ip = node[0]
        uri = URI.parse('http://' + ip + ':' + RDFS_PORT.to_s + '/files')

        pending.each do |file|
          # Column order follows the schema: sha256, name, last_modified, updated, deleted
          sha256sum, filename, _last_modified, updated, deleted = file

          begin
            if deleted != 0
              push_delete(uri, filename)
            elsif updated != 0
              push_update(uri, filename, sha256sum)
            end
          rescue StandardError => e
            @logger.debug("transmitter: Unable to connect to node at IP " + ip + ". (" + e.class.to_s + ": " + e.message + ")")
          end
        end
      end
    end

    # True when the node answered with a 2xx status and an "OK" body.
    def acknowledged?(response)
      response.is_a?(Net::HTTPSuccess) && response.body.to_s.include?("OK")
    end

    # Sends a new or changed file to the node behind +uri+.
    def push_update(uri, filename, sha256sum)
      # Check to see if the file exists on the node under some other filename.
      # If it does, ask the node to copy it instead of sending the content.
      response = Net::HTTP.post_form(uri,
        'api_call' => 'add_query',
        'filename' => filename,
        'sha256sum' => sha256sum)

      if response.body.to_s.include?("EXISTS")
        # File exists but with a different filename, so call the add_dup
        # function to avoid using needless bandwidth
        response = Net::HTTP.post_form(uri,
          'api_call' => 'add_dup',
          'filename' => filename,
          'sha256sum' => sha256sum)
        if acknowledged?(response)
          clear_update_flag(filename)
          return
        end
        # The node could not produce the duplicate (e.g. its copy vanished);
        # fall through and send the content so the file does not stay pending.
      end

      # Read the file into a string (this will have to be improved at some point)
      file_contents = Base64.encode64(read_file(RDFS_PATH + "/" + filename))

      # Then push it in a POST call
      response = Net::HTTP.post_form(uri,
        'api_call' => 'add',
        'filename' => filename,
        'sha256sum' => sha256sum,
        'content' => file_contents)
      clear_update_flag(filename) if acknowledged?(response)
    end

    # Asks the node behind +uri+ to delete +filename+.
    def push_delete(uri, filename)
      response = Net::HTTP.post_form(uri,
        'api_call' => 'delete',
        'filename' => filename)
      return unless response.is_a?(Net::HTTPSuccess)

      # "NOT_FOUND" means the node no longer has the file, which is the goal.
      body = response.body.to_s
      forget_file(filename) if body.include?("OK") || body.include?("NOT_FOUND")
    end

    # Clears the updated/deleted flags
    def clear_update_flag(filename)
      sql = "UPDATE files SET updated = 0, deleted = 0 WHERE name = ?"
      @logger.debug("transmitter: " + sql + " " + [filename].inspect)
      RDFS_DB.execute(sql, [filename])
    end

    # Removes the row of a file whose deletion has been propagated. Merely
    # clearing the flags would leave a row without a file on disk, which the
    # updater would flag as deleted again on its next pass.
    def forget_file(filename)
      sql = "DELETE FROM files WHERE name = ? AND deleted != 0"
      @logger.debug("transmitter: " + sql + " " + [filename].inspect)
      RDFS_DB.execute(sql, [filename])
    end

  end

end

# --------------------------------------------------------------------------------
# /lib/updater.rb:
# --------------------------------------------------------------------------------

module RDFS

  # Periodically scans RDFS_PATH and mirrors what it finds into the files
  # table: new files, changed files (by mtime) and files that disappeared.
  class Updater

    attr_accessor :update_frequency
    attr_accessor :main_thread

    # Called upon Updater.new
    #
    # update_frequency - seconds to sleep between scans
    # blocking         - when true (the default) the constructor waits for the
    #                    worker thread, i.e. it does not return until #stop is
    #                    called from elsewhere or the worker dies. This is what
    #                    callers observed before, when the loop accidentally ran
    #                    in the calling thread. Pass false to get the object back
    #                    immediately.
    def initialize(update_frequency, blocking: true)
      @update_frequency = update_frequency
      @running = true

      # Setup logging inside the updater
      @logger = Logger.new(STDOUT)
      @logger.level = RDFS_DEBUG ? Logger::DEBUG : Logger::WARN

      # Create the main thread. The loop must be passed as a block:
      # `Thread.new kernel` evaluated `kernel` first and never created a thread.
      @main_thread = Thread.new { kernel }
      @logger.debug("Updater thread started.")

      @main_thread.join if blocking
    end

    # Stop the updater (takes effect after the current scan and sleep)
    def stop
      @running = nil
    end

    private

    attr_writer :running
    attr_accessor :logger

    # Worker loop: scan, then sleep, until #stop is called.
    def kernel
      while @running
        update_database
        Thread.pass
        sleep @update_frequency
      end
    end

    # Create SHA256 of a file
    def sha256file(file)
      Digest::SHA256.file(file).hexdigest
    end

    # Return every non-directory entry below +path+, with the leading
    # RDFS_PATH + "/" removed.
    def fetch_tree(path)
      prefix = RDFS_PATH + "/"
      Find.find(path)
          .reject { |entry| entry == RDFS_PATH || File.directory?(entry) }
          .map { |entry| entry.sub(prefix, "") }
    end

    # Logs and runs a parameterized statement. File names are bound as
    # parameters rather than spliced into the SQL text, so a name containing a
    # quote can neither break nor alter the statement.
    def run_sql(sql, params = [])
      @logger.debug("updater: " + sql + " " + params.inspect)
      RDFS_DB.execute(sql, params)
    end

    # Update database with files
    def update_database

      check_for_deleted_files

      # Fetch a list of all files
      files = fetch_tree(RDFS_PATH)
      @logger.debug("updater: There are currently " + files.size.to_s + " entries in " + RDFS_PATH)

      # Iterate through each entry and check to see if it is in the database
      files.each do |f|
        begin
          # Reconstruct full path and get last modified time
          full_filename = RDFS_PATH + "/" + f
          last_modified = File.mtime(full_filename).to_i

          rows = RDFS_DB.execute("SELECT * FROM files WHERE name = ?", [f])
          if rows.empty?
            # It wasn't in the database, so hash it and add it
            run_sql("INSERT INTO files (sha256, name, last_modified, updated, deleted) VALUES (?, ?, ?, 1, 0)",
                    [sha256file(full_filename), f, last_modified])
          elsif last_modified > rows[0][2].to_i
            # File has changed. Rehash it and update the database.
            run_sql("UPDATE files SET sha256 = ?, last_modified = ?, updated = 1, deleted = 0 WHERE name = ?",
                    [sha256file(full_filename), last_modified, f])
          end
        rescue Errno::ENOENT
          # The file vanished between the directory walk and now; the next
          # pass of check_for_deleted_files deals with it.
          next
        end
      end
    end

    # Marks rows whose file no longer exists on disk as deleted.
    def check_for_deleted_files
      sql = "SELECT name FROM files WHERE updated = 0 AND deleted = 0"
      @logger.debug("updater: " + sql)

      RDFS_DB.execute(sql).each do |row|
        filename = row[0]
        next if File.exist?(RDFS_PATH + "/" + filename)

        # File doesn't exist, so mark it deleted
        run_sql("UPDATE files SET deleted = 1 WHERE name = ?", [filename])
      end
    end

  end

end

# --------------------------------------------------------------------------------
# /rdfs.rb:
# --------------------------------------------------------------------------------
#!/usr/bin/env ruby

# RDFS - Ruby Distributed File Sync
# Copyright (C) 2018 Sourcerer, All Rights Reserved
# Written by Robert W. Oliver II -
#
# -- BASIC OVERVIEW --
#
# RDFS monitors for changes within a folder. Once these are detected,
# the files are SHA256 hashed and that hash, along with last-modified
# time is stored in an SQLite3 database. Upon changes, these hashes
# are updated.
#
# Other machines running RDFS can connect to one another and receive
# these updates, therefore keeping multiple directories across different
# machines in sync.
#
# Since the SHA256 hash is calculated, the system avoids saving the same
# block of data twice. This provides a basic data de-duplication scheme.
#
# While RDFS is functional, it is not an ideal construction of a high
# performance, production-ready distributed file system. Its primary
# focus is to demonstrate the concepts involved in such a system and
# serve as a teaching tool for these techniques.
#
# -- LICENSE --
#
# This software is licensed under the GPLv3 or later.
#

# To install requirements on a Debian based system, run:
# apt install ruby-sqlite3 ruby-daemons

require 'digest'
require 'sqlite3'
require 'find'
require 'fileutils' # used by lib/server.rb; previously never required explicitly
require 'logger'
require 'webrick'
require 'uri'
require 'net/http'
require 'zlib'
require 'base64'

require_relative 'lib/updater'
require_relative 'lib/transmitter'
require_relative 'lib/server'

module RDFS

  # CTRL+C Handler
  trap("SIGINT") do
    puts "\nRDFS Shutdown via CTRL+C."
    exit 130
  end

  # If debug is enabled, output will be quite verbose.
  # Note: only false/nil disable it; 0 is truthy in Ruby.
  RDFS_DEBUG = 1

  # Default RDFS path
  RDFS_PATH = Dir.home + "/rdfs"

  # SQLite3 database file
  RDFS_DB_FILE = Dir.home + "/.rdfs.sqlite3"

  # SQLite3 schema
  RDFS_SCHEMA_FILES = "
  CREATE TABLE files (
  sha256 VARCHAR(64),
  name VARCHAR(255),
  last_modified INT,
  updated INT,
  deleted INT);"
  RDFS_SCHEMA_NODES = "
  CREATE TABLE nodes (
  ip VARCHAR(15));"

  # RDFS path update frequency (in seconds)
  RDFS_UPDATE_FREQ = 10

  # RDFS transmit frequency (in seconds)
  RDFS_TRANSMIT_FREQ = 5

  # RDFS listen port
  RDFS_PORT = 47656

  # Setup logging.
  # (The level must be *assigned*; the previous `==` only compared it.)
  logger = Logger.new(STDOUT)
  logger.level = RDFS_DEBUG ? Logger::DEBUG : Logger::WARN

  # Output startup message
  puts "RDFS - Ruby Distributed File Sync\n"\
       "Copyright (C) 2018 Sourcerer, All Rights Reserved.\n"\
       "Written by Robert W. Oliver II. Licensed under the GPLv3.\n\n"

  # If the database doesn't exist, create it.
  # (File.exists? / Dir.exists? were removed in Ruby 3.2; use the exist? forms.)
  unless File.exist?(RDFS_DB_FILE)
    db = SQLite3::Database.new RDFS_DB_FILE
    db.execute RDFS_SCHEMA_FILES
    db.execute RDFS_SCHEMA_NODES
    db.close
    logger.info("RDFS database was not found, so it was created.")
  end

  # Does file storage area exist? If not, create it.
  unless Dir.exist?(RDFS_PATH)
    Dir.mkdir(RDFS_PATH)
    logger.info("RDFS directory " + RDFS_PATH + " not found, so it was created.")
  end

  # Open the database
  RDFS_DB = SQLite3::Database.open RDFS_DB_FILE

  # Even in production, it's better for RDFS to crash than to have threads die
  # and never run again. Makes it easier to track down issues.
  Thread.abort_on_exception = true

  # Start the server (Server.new blocks while WEBrick runs, hence the thread)
  Thread.new do
    @server = Server.new
  end
  sleep 1

  # Start the updater (Updater.new does not return while the updater runs)
  Thread.new do
    @updater = Updater.new(RDFS_UPDATE_FREQ)
  end
  sleep 1

  # Start the transmitter. This call keeps the main thread busy for the life
  # of the process; the line below is only reached once it returns.
  @transmitter = Transmitter.new(RDFS_TRANSMIT_FREQ)

  puts "RDFS Shutdown."

end

# --------------------------------------------------------------------------------
# /rdfsctl.rb:
# --------------------------------------------------------------------------------
#!/usr/bin/env ruby

require 'daemons'

# Resolve rdfs.rb next to this control script so that `rdfsctl.rb start`
# works regardless of the current working directory.
Daemons.run(File.expand_path('rdfs.rb', __dir__))

# --------------------------------------------------------------------------------