├── .gitignore ├── README.md ├── LICENSE ├── test └── DaemonTest.scala └── src └── Daemon.scala /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | *.iml 3 | out/ 4 | lib/ 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | file-pusher 2 | =========== 3 | 4 | A simple NIO based file synchronisation daemon. 5 | 6 | This thing simply copies files from a directory to a target directory, attempting to achieve the following goals: 7 | * Files are copied strictly in order of increasing mtime 8 | * Changes to files are detected via inotify (actually, via the Java NIO WatchService API that sits on top of it) 9 | * The daemon is robust to arbitrary concurrent modification to the "from" directory. However, we assume 10 | that the "to" directory is strictly under the control of the daemon. 11 | * Files and directories are copied atomically insofar as that is possible. So for example when copying 12 | a new file into an existing dir we copy to a temp location first and then atomically rename to replace 13 | any existing version, and likewise if we are copying a new directory into an existing dir. 14 | 15 | The code is a bit messy but the tests pass! 16 | 17 | The intended use case for this is if you have a large number of processes writing files somewhere (perhaps 18 | a local drive) and you want to automatically make these files available in some other location (perhaps a 19 | remote NFS shared drive), preserving the order in which files are created and not letting the remote location 20 | ever see a half-written file. 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Maximilian Bolingbroke All rights reserved. 
2 | 3 | Redistribution and use in source and binary forms, with or without modification, 4 | are permitted provided that the following conditions are met: 5 | 6 | 1. Redistributions of source code must retain the above copyright notice, this 7 | list of conditions and the following disclaimer. 8 | 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation and/or 11 | other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 20 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
23 | -------------------------------------------------------------------------------- /test/DaemonTest.scala: -------------------------------------------------------------------------------- 1 | import java.lang.Thread.UncaughtExceptionHandler 2 | import java.nio.file.{Paths, Path, Files} 3 | import java.time.temporal.ChronoUnit 4 | import java.time.{Clock, Instant} 5 | 6 | import org.scalatest.FunSuite 7 | 8 | import scala.collection.mutable 9 | import scala.util.Random 10 | 11 | class DaemonTest extends FunSuite { 12 | trait Helper { 13 | def start() 14 | def quiesce() 15 | def assertEventually(pred : => Boolean) 16 | def remove(path : String) 17 | def touch(path : String, cookie : Int = 0) 18 | def fromPath(path : String) : Path 19 | def toPath(path : String) : Path 20 | } 21 | 22 | def withDaemon[A](act : Helper => A) = { 23 | val root = Files.createTempDirectory("daemon-test") 24 | val fromRoot = root.resolve("from") 25 | val toRoot = root.resolve("to") 26 | Files.createDirectory(fromRoot) 27 | @volatile var hasStarted = false 28 | val daemonThread = new Thread { 29 | override def run() = new Daemon(fromRoot, toRoot) 30 | } 31 | daemonThread.setUncaughtExceptionHandler(new UncaughtExceptionHandler { 32 | override def uncaughtException(t : Thread, e : Throwable) : Unit = e match { 33 | case (_ : InterruptedException) => () // Sure, whatever - this just means we are killing the thread 34 | case _ => { 35 | e.printStackTrace() 36 | System.exit(1) 37 | } 38 | } 39 | }) 40 | 41 | val helper = new Helper { 42 | override def start() : Unit = if (!hasStarted) { 43 | hasStarted = true 44 | daemonThread.start() 45 | } 46 | override def quiesce() = Thread.sleep(20 * 1000) // Very cheap and cheerful... 
47 | override def assertEventually(pred : => Boolean) = { 48 | val limit = Instant.now().plus(20, ChronoUnit.SECONDS) 49 | while (Instant.now().isBefore(limit) && !pred) { 50 | Thread.sleep(100) 51 | } 52 | assert(pred) 53 | } 54 | override def remove(path: String) : Unit = Files.delete(fromRoot.resolve(path)) 55 | override def touch(path: String, cookie : Int) : Unit = { 56 | val where = fromRoot.resolve(path) 57 | Files.createDirectories(where.getParent) 58 | Files.write(where, Array.ofDim[Byte](cookie)) 59 | } 60 | override def fromPath(path : String) = fromRoot.resolve(path) 61 | override def toPath(path : String) = toRoot.resolve(path) 62 | } 63 | try { 64 | act(helper) 65 | } finally { 66 | if (hasStarted) { 67 | daemonThread.interrupt() 68 | daemonThread.join(10 * 1000) 69 | if (daemonThread.isAlive) throw new IllegalStateException("The daemon should have shut down..") 70 | } 71 | } 72 | } 73 | 74 | test("Can copy directory") { withDaemon { d => 75 | d.start() 76 | d.quiesce() 77 | d.touch("foo/bar") 78 | d.assertEventually { Files.isRegularFile(d.toPath("foo/bar")) } 79 | }} 80 | 81 | test("Can copy file") { withDaemon { d => 82 | d.start() 83 | d.quiesce() 84 | d.touch("foo") 85 | d.assertEventually { Files.isRegularFile(d.toPath("foo")) } 86 | }} 87 | 88 | test("Can replace file with file") { withDaemon { d => 89 | d.touch("foo") 90 | d.start() 91 | d.assertEventually { Files.isRegularFile(d.toPath("foo")) } 92 | d.touch("foo", cookie=1) 93 | d.assertEventually { Files.size(d.toPath("foo")) == 1 } 94 | }} 95 | 96 | test("Can replace file with directory") { withDaemon { d => 97 | d.touch("foo") 98 | d.start() 99 | d.remove("foo") 100 | d.touch("foo/bar") 101 | d.assertEventually { Files.isRegularFile(d.toPath("foo/bar")) } 102 | }} 103 | 104 | test("Can replace directory with file") { withDaemon { d => 105 | d.touch("foo/bar") 106 | d.start() 107 | d.remove("foo/bar") 108 | d.remove("foo") 109 | d.touch("foo") 110 | d.assertEventually { 
Files.isRegularFile(d.toPath("foo")) } 111 | }} 112 | 113 | test("Can cope with random series of file ops") { withDaemon { d => 114 | val random = new Random() 115 | 116 | def choose[A](from : TraversableOnce[A], satisfying : A => Boolean) = random.shuffle(from).find(satisfying) 117 | //.getOrElse(throw new IllegalStateException("No possibilty satisfied the predicate")) 118 | 119 | d.start() 120 | d.quiesce() 121 | 122 | // Mutable model of what we believe the FS should contain 123 | val directories = new mutable.ArrayBuffer[String]() 124 | val files = new mutable.HashMap[String, Int]() 125 | directories += "." 126 | 127 | def isEmptyDirectory(dir : String) = !directories.exists(_.startsWith(dir)) && !files.keys.exists(_.startsWith(dir)) 128 | 129 | // We deliberately choose a child path that is likely to collide with one we already created, because 130 | // that results in maximally mind-bending situations for the daemon to cope with 131 | def chooseChild() = choose(directories, (_ : String) => true).map { x => 132 | var i = 0 133 | while (Files.exists(d.fromPath(x).resolve("child" + i))) { 134 | i += 1 135 | } 136 | d.fromPath(x).resolve("child" + i) 137 | } 138 | 139 | // Let's say we want do to 30s of random ops. We wait 50ms between each op, so we need to do 600 ops on average. 
140 | var cookie = 0 141 | while (random.nextFloat() < 1f - (1 / 1200f)) { 142 | Thread.sleep(50) 143 | random.nextInt(4) match { 144 | case 0 => choose(directories, isEmptyDirectory).foreach { x => 145 | directories -= x 146 | Files.delete(d.fromPath(x)) 147 | } 148 | case 1 => choose(files, (_ : (String, Int)) => true).foreach { case (x, _) => 149 | files.remove(x) 150 | Files.delete(d.fromPath(x)) 151 | } 152 | case 2 => chooseChild().foreach { p => 153 | directories += p.toString 154 | Files.createDirectory(p) 155 | } 156 | case 3 => chooseChild().foreach { p => 157 | files.put(p.toString, cookie) 158 | Files.write(p, Array.ofDim[Byte](cookie)) 159 | cookie += 1 160 | } 161 | } 162 | } 163 | 164 | d.assertEventually { 165 | directories.forall(x => Files.isDirectory(d.toPath(x))) 166 | files.forall { case (x, sz) => Files.size(d.toPath(x)) == sz } 167 | } 168 | }} 169 | } 170 | -------------------------------------------------------------------------------- /src/Daemon.scala: -------------------------------------------------------------------------------- 1 | import java.io.{FileNotFoundException, IOException} 2 | import java.lang.Thread.UncaughtExceptionHandler 3 | import java.nio.file._ 4 | import java.time.temporal.{ChronoUnit, TemporalUnit} 5 | import java.util.concurrent.TimeUnit 6 | import java.time.Instant 7 | 8 | import com.typesafe.scalalogging.StrictLogging 9 | 10 | import scala.collection.JavaConverters._ 11 | import scala.collection.mutable 12 | 13 | case class Copy(fromMTime : Instant, from : Path, to : Path) extends Comparable[Copy] { 14 | def compareTo(that : Copy) = this.fromMTime.compareTo(that.fromMTime) 15 | } 16 | 17 | object Daemon extends App { 18 | val daemon = new Daemon(Paths.get("/Users/mbolingbroke/Junk/temp1"), Paths.get("/Users/mbolingbroke/Junk/temp2")) 19 | } 20 | 21 | class Daemon(fromRoot : Path, toRoot : Path) extends StrictLogging { 22 | val watchService = fromRoot.getFileSystem.newWatchService() 23 | // Ubuntu/Windows 
allegedly reports file changes almost instantly, but OS X at least seems to have a substantial delay 24 | // (around 5s?). For safety, say that we don't get notified until quite a while after the change has occurred. 25 | val WATCH_SERVICE_DELAY_SECONDS = 15 26 | 27 | val workQueue = new java.util.concurrent.PriorityBlockingQueue[Copy]() 28 | 29 | val isRegistered = new mutable.HashMap[Path, WatchKey]() 30 | def tryRegister(from : Path) = try { 31 | val watchKey = from.register(watchService, StandardWatchEventKinds.ENTRY_CREATE, StandardWatchEventKinds.ENTRY_MODIFY, StandardWatchEventKinds.OVERFLOW) 32 | isRegistered.put(from, watchKey) 33 | logger.info(s"Registered $from") 34 | Some(watchKey) 35 | } catch { 36 | case (e : NotDirectoryException) => None 37 | case (e : IOException) => { 38 | logger.info(s"Failed to register $from", e) 39 | None 40 | } 41 | } 42 | 43 | def getMTime(from : Path) = try { Files.getLastModifiedTime(from).toInstant } catch { case (_ : IOException) => Instant.ofEpochMilli(0) } 44 | 45 | def registerTree(from : Path) : Instant = { 46 | val watchKey = tryRegister(from) 47 | val mTime = getMTime(from) 48 | watchKey match { 49 | case None => mTime // Either a file or a directory that just went AWOL 50 | case Some(_) => registerTreeChildren(from, mTime) 51 | } 52 | } 53 | 54 | def registerTreeChildren(from : Path, mTime : Instant) : Instant 55 | = (Files.list(from).iterator().asScala.map(registerTree).toSeq :+ mTime).max 56 | 57 | // NB: this might enqueue work items that duplicate existing ones in the queue, the copy thread just has to deal with it 58 | def registerAndEnqueueWorkFromTree(from : Path, copyTo : Path) : Unit = { 59 | val watchKey = tryRegister(from) 60 | 61 | val enqueueCopyMTime = watchKey match { 62 | // Might be a file, might have gone away. 
Either way, let's try a copy 63 | case None => Some(getMTime(from)) 64 | case Some(_) if Files.isDirectory(copyTo) => { 65 | // Target path already exists: don't worry about making the directory + its contents appear atomically 66 | for (child <- Files.list(from).iterator().asScala) { 67 | registerAndEnqueueWorkFromTree(child, copyTo.resolve(from.relativize(child))) 68 | } 69 | None 70 | } 71 | // This seems to be a new directory: arrange for it to appear atomically. No need to copy children because that will be done by the dir copy. 72 | case Some(_) => Some(registerTreeChildren(from, getMTime(from))) 73 | } 74 | 75 | enqueueCopyMTime.foreach { mTime => 76 | workQueue.add(Copy(mTime, from, copyTo)) 77 | } 78 | } 79 | 80 | def registerAndEnqueueWorkFromTree(from : Path) : Unit = { 81 | registerAndEnqueueWorkFromTree(from, toRoot.resolve(fromRoot.relativize(from))) 82 | } 83 | 84 | def filesMayBeDifferent(from : Path, to : Path) = try { 85 | Files.size(to) != Files.size(from) || Files.isDirectory(to) != Files.isDirectory(from) 86 | } catch { 87 | case (_ : NoSuchFileException) => true 88 | } 89 | 90 | @volatile var seenAllFilesUpToMTime = Instant.now() 91 | registerAndEnqueueWorkFromTree(fromRoot, toRoot) 92 | 93 | val copyThread = new Thread { 94 | def copyChildren(from : Path, to : Path) : Unit = { 95 | for (fromChild <- Files.list(from).iterator().asScala) { 96 | val toChild = to.resolve(from.relativize(fromChild)) 97 | if (Files.isDirectory(fromChild)) { 98 | logger.info(s"Recursively copying child directory $fromChild to $toChild") 99 | Files.createDirectory(toChild) 100 | copyChildren(fromChild, toChild) 101 | } else if (filesMayBeDifferent(fromChild, toChild)) { 102 | logger.info(s"Recursively copying child file $fromChild to $toChild") 103 | try { 104 | Files.copy(fromChild, toChild) 105 | } catch { 106 | case (e : NoSuchFileException) => logger.info(s"It appears that $fromChild has been deleted before we got a chance to copy it", e) 107 | } 108 | } else { 
109 | logger.info(s"Skipping recursive copy of child file $fromChild to $toChild because we don't seem to have to do anything") 110 | } 111 | } 112 | } 113 | 114 | def deleteChildren(to : Path) : Unit = { 115 | for (toChild <- Files.list(to).iterator().asScala) { 116 | if (Files.isDirectory(toChild)) { 117 | logger.info(s"Recursively deleting child directory $toChild") 118 | deleteChildren(toChild) 119 | } 120 | Files.delete(toChild) 121 | } 122 | } 123 | 124 | override def run() = { 125 | while (true) { 126 | val workItem = workQueue.poll(5, TimeUnit.SECONDS) 127 | if (workItem != null) { 128 | val Copy(mTime, from, to) = workItem 129 | 130 | if (!mTime.isBefore(seenAllFilesUpToMTime)) { 131 | // There might be as-yet-undiscovered files with earlier MTimes on the file system, so wait for 132 | // more info -- we don't want to push files out-of-order! 133 | Thread.sleep(1000) 134 | workQueue.add(workItem) 135 | } else { 136 | if (!Files.isDirectory(to.getParent)) { 137 | // Parent directory does not exist: this happens *exactly* in the case where we've enqueued some work 138 | // via registerAndEnqueueWorkFromTree(Path) in a directory that is registered with the service but 139 | // where we have not yet processed the initial work item that copies the contents of that directory. 140 | // 141 | // In this situation we just ignore the work item because it will get picked up eventually be that recursive copy. 142 | } else { 143 | try { 144 | val toTemp = if (Files.isDirectory(from)) { 145 | val toTemp = Files.createTempDirectory(to.getParent, ".filepusher." + from.getFileName.toString) 146 | logger.info(s"Copying directory $from to $to via temporary $toTemp") 147 | copyChildren(from, toTemp) 148 | Some(toTemp) 149 | } else if (filesMayBeDifferent(from, to)) { 150 | val toTemp = Files.createTempFile(to.getParent, ".filepusher." 
+ from.getFileName.toString, "") 151 | logger.info(s"Copying file $from to $to via temporary $toTemp") 152 | try { 153 | Files.copy(from, toTemp, StandardCopyOption.REPLACE_EXISTING) 154 | Some(toTemp) 155 | } catch { 156 | case (e : NoSuchFileException) => { 157 | logger.info(s"It appears that $from has been deleted before we got a chance to copy it", e) 158 | None 159 | } 160 | } 161 | } else { 162 | logger.info(s"Skipping copy from $from to $to because we don't seem to have to do anything") 163 | None 164 | } 165 | toTemp.foreach { toTemp => 166 | if (Files.isDirectory(to)) { 167 | logger.info(s"Non-atomically replacing directory $to with $toTemp (alas, it is not possible to atomically replace directories)") 168 | deleteChildren(to) 169 | Files.delete(to) 170 | Files.move(toTemp, to) 171 | } else { 172 | logger.info(s"Atomically replacing $to with $toTemp") 173 | Files.move(toTemp, to, StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING) 174 | } 175 | } 176 | } catch { 177 | case (e : FileNotFoundException) => logger.info("File went missing during recursive copy: never mind", e) 178 | } 179 | } 180 | } 181 | } 182 | } 183 | } 184 | } 185 | 186 | copyThread.setName("copy") 187 | copyThread.setUncaughtExceptionHandler(new UncaughtExceptionHandler { 188 | override def uncaughtException(t : Thread, e : Throwable) = { 189 | logger.error(s"Thread ${t.getName} died, terminating VM!", e) 190 | System.exit(1) 191 | } 192 | }) 193 | copyThread.start() 194 | 195 | while (true) { 196 | val key = { 197 | val key = watchService.poll() 198 | if (key != null) key else { 199 | // No other watch events are enqueued ==> we should have seen all files with MTimes up to now 200 | seenAllFilesUpToMTime = Instant.now().minus(WATCH_SERVICE_DELAY_SECONDS, ChronoUnit.SECONDS) 201 | watchService.poll(5, TimeUnit.SECONDS) 202 | } 203 | } 204 | if (key != null) { 205 | val path = key.watchable().asInstanceOf[Path] 206 | 207 | key.pollEvents().asScala.foreach { event => 208 | 
logger.info(s"Got ${event.kind()} event for $path (context ${event.context()})") 209 | if (event.kind() == StandardWatchEventKinds.OVERFLOW) { 210 | registerAndEnqueueWorkFromTree(path) 211 | } else { 212 | val childFilename = event.context().asInstanceOf[Path] 213 | val childPath = path.resolve(childFilename) 214 | // We ignore ENTRY_MODIFY events reported against directories that we are already watching because they just 215 | // mean that the contents of that directory were changed. Because we recursively subscribe to all directories 216 | // anyway, we will be able to spot that fact from the ENTRY_MODIFY/ENTRY_CREATE for the actual nested file or 217 | // unmonitored subdirectory that has just changed. 218 | // 219 | // We have to be careful that we don't ignore the event in the case where childPath has just been deleted and 220 | // replaced with a file, this is done by checking the isValid flag of the registered key. 221 | val isDirectoryWeAreAlreadyWatching = isRegistered.contains(childPath) && isRegistered(childPath).isValid 222 | if (!isDirectoryWeAreAlreadyWatching || event.kind() != StandardWatchEventKinds.ENTRY_MODIFY) { 223 | registerAndEnqueueWorkFromTree(childPath) 224 | } 225 | } 226 | } 227 | 228 | // The key may be re-queued in the WatchService only if we reset it 229 | if (!key.reset()) { 230 | // The key has been auto-cancelled (probably because the file is now dead) 231 | isRegistered.remove(path) 232 | } 233 | } 234 | } 235 | 236 | // Work must be drained in an order, <:, compatible with the following axioms: 237 | // 1. If A and B are files where mtime(A) < mtime(B) then A <: B 238 | // 2. If A is strictly contained within B, and generation(A) <= generation(B), then A <: B 239 | // 240 | // Generation is a number assigned to each work item: 241 | // 1. All the work added as part of a single trawl is assigned to the same generation 242 | // 2. 
Generation number increments after each trawl 243 | // 244 | // NB: work is defined as: 245 | // a) File work: atomically replace the "to"-space file with the "from"-space one 246 | // b) Directory work: atomically create a "from"-space directory containing the initial contents of the "to"-space directory 247 | } 248 | --------------------------------------------------------------------------------