├── LICENSE ├── README.md ├── pom.xml └── src ├── main └── java │ └── nbdfdb │ ├── FDBArray.java │ ├── FDBBitSet.java │ ├── FDBStorage.java │ ├── NBD.java │ ├── NBDServer.java │ ├── NBDVolumeServer.java │ ├── Storage.java │ └── cli │ ├── CreateCommand.java │ ├── DeleteCommand.java │ ├── ListCommand.java │ ├── NBDCLI.java │ ├── ServerCommand.java │ └── SnapshotCommand.java └── test └── java └── nbdfdb └── FDBArrayTest.java /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2018 Sam Pullara 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | License 2 | ======= 3 | 4 | Copyright 2018 Sam Pullara 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | 18 | Network Block Device 19 | ==================== 20 | 21 | The nbd-client on Linux allows you to mount a remote server implementing the NBD protocol 22 | as a local block device. This NBDServer exports volumes whose data is stored in 23 | FoundationDB. This gives you a highly scalable, high performance, reliable block device that 24 | you can then format and put a filesystem on. 25 | 26 | The NBDCLI command allows you to create a new volume, list the volumes you have in your system, 27 | delete a volume or snapshot a current volume to another volume. 28 | 29 | HOWTO 30 | ===== 31 | 32 | Bring up FoundationDB and then run the NBDServer. It will be listening on the default 10809 port. 33 | 34 | ```bash 35 | java -jar nbdcli.jar server 36 | ``` 37 | 38 | Create a new 1G volume: 39 | 40 | ```bash 41 | java -jar nbdcli.jar create -n [volume name] -s 1G 42 | ``` 43 | 44 | On a Linux host, install nbd-client, create the block device, format it and mount it: 45 | 46 | ```bash 47 | sudo apt-get update && sudo apt-get install nbd-client 48 | sudo modprobe nbd 49 | sudo nbd-client -N [volume name] [host] /dev/nbd0 50 | sudo mkfs.xfs /dev/nbd0 51 | mkdir tmp 52 | sudo mount /dev/nbd0 tmp 53 | ``` 54 | 55 | You may need to change the ownership on that directory to access it but you can now save files 56 | there and they will be backed by FoundationDB. Each volume can only be shared with a single nbd client 57 | at a time. 58 | 59 | Under the covers 60 | ================ 61 | 62 | Each volume is a sparse array of bytes (FDBArray) stored in FoundationDB across many rows in the database. In addition, 63 | each volume can have a parent whose sparse array shows through where the child volume hasn't written yet. 
Here is the 64 | interface that we implement: 65 | 66 | ```java 67 | public interface Storage { 68 | void connect(); 69 | 70 | void disconnect(); 71 | 72 | CompletableFuture read(byte[] buffer, long offset); 73 | 74 | CompletableFuture write(byte[] buffer, long offset); 75 | 76 | CompletableFuture flush(); 77 | 78 | long size(); 79 | 80 | long usage(); 81 | } 82 | ``` 83 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 17 | 18 | 20 | 4.0.0 21 | 22 | com.sampullara.nbd 23 | nbd-fdb 24 | 0.1-SNAPSHOT 25 | jar 26 | 27 | nbd-fdb 28 | http://maven.apache.org 29 | 30 | 31 | 32 | 33 | org.foundationdb 34 | fdb-java 35 | 7.3.43 36 | 37 | 38 | org.hdrhistogram 39 | HdrHistogram 40 | 1.2.1 41 | test 42 | 43 | 44 | com.github.spullara.cli-parser 45 | cli-parser 46 | 1.1.2 47 | 48 | 49 | io.netty 50 | netty-all 51 | 4.0.23.Final 52 | 53 | 54 | com.google.guava 55 | guava 56 | [17.0.0,) 57 | 58 | 59 | junit 60 | junit 61 | 4.11 62 | test 63 | 64 | 65 | org.roaringbitmap 66 | RoaringBitmap 67 | 0.9.0 68 | 69 | 70 | 71 | 72 | ${project.build.directory}/endorsed 73 | 74 | 75 | UTF-8 76 | UTF-8 77 | 78 | 79 | 4.10 80 | 81 | 1.7.2 82 | 1.2.16 83 | 84 | 1.8 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | maven-assembly-plugin 93 | 2.4 94 | 95 | 96 | org.codehaus.mojo 97 | cobertura-maven-plugin 98 | 2.6 99 | 100 | true 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | maven-compiler-plugin 109 | 3.1 110 | 111 | ${java.version} 112 | ${java.version} 113 | 114 | 115 | 116 | maven-release-plugin 117 | 2.5 118 | 119 | 120 | true 121 | 122 | 123 | 124 | 125 | maven-assembly-plugin 126 | 2.4 127 | 128 | nbdcli 129 | false 130 | 131 | 132 | nbdfdb.cli.NBDCLI 133 | 134 | 135 | 136 | jar-with-dependencies 137 | 138 | 139 | 140 | 141 | make-assembly 142 | 143 | package 144 | 145 | 146 | single 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 
-------------------------------------------------------------------------------- /src/main/java/nbdfdb/FDBArray.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Sam Pullara 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | */ 17 | 18 | package nbdfdb; 19 | 20 | import com.apple.foundationdb.*; 21 | import com.apple.foundationdb.directory.DirectoryLayer; 22 | import com.apple.foundationdb.directory.DirectorySubspace; 23 | import com.apple.foundationdb.tuple.Tuple; 24 | import com.google.common.primitives.Ints; 25 | import com.google.common.primitives.Longs; 26 | 27 | import java.util.HashSet; 28 | import java.util.List; 29 | import java.util.Set; 30 | import java.util.concurrent.CompletableFuture; 31 | import java.util.concurrent.ExecutionException; 32 | import java.util.function.Function; 33 | 34 | import static java.util.Arrays.asList; 35 | import static java.util.Collections.singletonList; 36 | 37 | /** 38 | * Block storage in FDB 39 | */ 40 | public class FDBArray { 41 | 42 | // FDB 43 | private static final byte[] ONE = new byte[]{0, 0, 0, 0, 0, 0, 0, 1}; 44 | private static final byte[] MINUS_ONE = new byte[]{0, 0, 0, 0, 0, 0, 0, -1}; 45 | private static DirectoryLayer dl = DirectoryLayer.getDefault(); 46 | 47 | // Metadata keys 48 | private static final String BLOCK_SIZE_KEY = "block_size"; 49 | private static final String PARENT_KEY = "parent"; 50 
| private static final String PARENT_TIMESTAMP_KEY = "parent_timestamp"; 51 | private static final String DEPENDENTS = "dependents"; 52 | private static final String BLOCKS = "blocks"; 53 | 54 | // Location in the database 55 | private final DirectorySubspace metadata; 56 | private final DirectorySubspace data; 57 | private final Database database; 58 | private final int blockSize; 59 | private final FDBArray parentArray; 60 | private final DirectorySubspace ds; 61 | private final Long snapshot; 62 | private final FDBBitSet usedBlocks; 63 | 64 | // Keys 65 | private byte[] dependents; 66 | 67 | // Used for copies 68 | private final ThreadLocal buffer = new ThreadLocal() { 69 | @Override 70 | protected byte[] initialValue() { 71 | return new byte[blockSize]; 72 | } 73 | }; 74 | 75 | public static FDBArray open(Database database, String name) { 76 | DirectorySubspace ds = get(dl.open(database, asList("com.sampullara.fdb.array", name))); 77 | return new FDBArray(database, ds); 78 | } 79 | 80 | public static FDBArray open(Database database, String name, long timestamp) { 81 | DirectorySubspace ds = get(dl.open(database, asList("com.sampullara.fdb.array", name))); 82 | return new FDBArray(database, ds, timestamp); 83 | } 84 | 85 | public static FDBArray create(Database database, String name, int blockSize) { 86 | DirectorySubspace ds = get(dl.create(database, asList("com.sampullara.fdb.array", name))); 87 | return create(database, ds, blockSize, null, 0); 88 | } 89 | 90 | protected static FDBArray create(Database database, DirectorySubspace ds, int blockSize, DirectorySubspace parent, long timestamp) { 91 | DirectorySubspace metadata = get(ds.create(database, singletonList("metadata"))); 92 | if (parent != null) { 93 | List parentPath = parent.getPath(); 94 | database.run((Function) tx -> { 95 | tx.set(metadata.get(PARENT_KEY).pack(), Tuple.fromList(parentPath).pack()); 96 | tx.set(metadata.get(PARENT_TIMESTAMP_KEY).pack(), Tuple.from(timestamp).pack()); 97 | return 
null; 98 | }); 99 | } 100 | database.run((Function) tx -> { 101 | tx.set(metadata.get(BLOCK_SIZE_KEY).pack(), Ints.toByteArray(blockSize)); 102 | return null; 103 | }); 104 | return new FDBArray(database, ds); 105 | } 106 | 107 | private static T get(CompletableFuture future) { 108 | try { 109 | return future.get(); 110 | } catch (ExecutionException | InterruptedException e) { 111 | throw new RuntimeException(e); 112 | } 113 | } 114 | 115 | protected FDBArray(Database database, DirectorySubspace ds, Long snapshot) { 116 | this.ds = ds; 117 | this.snapshot = snapshot; 118 | this.database = database; 119 | this.metadata = get(ds.createOrOpen(database, singletonList("metadata"))); 120 | this.data = get(ds.createOrOpen(database, singletonList("data"))); 121 | Integer currentBlocksize = database.run(tx -> { 122 | byte[] key = metadata.get(BLOCK_SIZE_KEY).pack(); 123 | byte[] currentBlockSize = get(tx.get(key)); 124 | if (currentBlockSize == null) { 125 | return null; 126 | } else { 127 | return Ints.fromByteArray(currentBlockSize); 128 | } 129 | }); 130 | if (currentBlocksize == null) { 131 | throw new IllegalArgumentException("Block size for array not configured"); 132 | } else { 133 | blockSize = currentBlocksize; 134 | } 135 | parentArray = database.run(tx -> { 136 | byte[] parentPathValue = get(tx.get(metadata.get(PARENT_KEY).pack())); 137 | byte[] parentTimestampBytes = get(tx.get(metadata.get(PARENT_TIMESTAMP_KEY).pack())); 138 | if (parentPathValue == null) { 139 | return null; 140 | } else { 141 | List items = (List) Tuple.fromBytes(parentPathValue).getItems(); 142 | long parentTimestamp = Tuple.fromBytes(parentTimestampBytes).getLong(0); 143 | return new FDBArray(database, get(DirectoryLayer.getDefault().open(database, items)), parentTimestamp); 144 | } 145 | }); 146 | dependents = metadata.get(DEPENDENTS).pack(); 147 | usedBlocks = new FDBBitSet(database, metadata.get(BLOCKS), 512); 148 | } 149 | 150 | protected FDBArray(Database database, DirectorySubspace 
ds) { 151 | this(database, ds, null); 152 | } 153 | 154 | public CompletableFuture write(byte[] write, long offset) { 155 | if (snapshot != null) { 156 | throw new IllegalStateException("FDBArray is read only"); 157 | } 158 | return database.runAsync(tx -> { 159 | // Use a single buffer for all full blocksize writes 160 | byte[] bytes = buffer.get(); 161 | 162 | // Calculate the block locations 163 | int length = write.length; 164 | long firstBlock = offset / blockSize; 165 | long lastBlock = (offset + length) / blockSize; 166 | int blockOffset = (int) (offset % blockSize); 167 | int shift = blockSize - blockOffset; 168 | 169 | // Track where we have written so we can estimate usage later 170 | usedBlocks.set(firstBlock, lastBlock); 171 | 172 | // Special case first block and last block 173 | byte[] firstBlockKey = data.get(firstBlock).get(System.currentTimeMillis()).pack(); 174 | if (blockOffset > 0 || (blockOffset == 0 && length < blockSize)) { 175 | // Only need to do this if the first block is partial 176 | byte[] readBytes = new byte[blockSize]; 177 | read(tx, firstBlock * blockSize, readBytes, Long.MAX_VALUE, null); 178 | int writeLength = Math.min(length, shift); 179 | System.arraycopy(write, 0, readBytes, blockOffset, writeLength); 180 | tx.set(firstBlockKey, readBytes); 181 | } else { 182 | // In this case copy the full first block blindly 183 | System.arraycopy(write, 0, bytes, 0, blockSize); 184 | tx.set(firstBlockKey, bytes); 185 | } 186 | // If there is more than one block 187 | if (lastBlock > firstBlock) { 188 | // For the blocks in the middle we can just blast values in without looking at the current bytes 189 | for (long i = firstBlock + 1; i < lastBlock; i++) { 190 | byte[] key = data.get(i).get(System.currentTimeMillis()).pack(); 191 | int writeBlock = (int) (i - firstBlock); 192 | int position = (writeBlock - 1) * blockSize + shift; 193 | System.arraycopy(write, position, bytes, 0, blockSize); 194 | tx.set(key, bytes); 195 | } 196 | int position 
= (int) ((lastBlock - firstBlock - 1) * blockSize + shift); 197 | int lastBlockLength = length - position; 198 | byte[] lastBlockKey = data.get(lastBlock).get(System.currentTimeMillis()).pack(); 199 | // If the last block is a complete block we don't need to read 200 | if (lastBlockLength == blockSize) { 201 | System.arraycopy(write, position, bytes, 0, blockSize); 202 | tx.set(lastBlockKey, bytes); 203 | } else { 204 | byte[] readBytes = new byte[blockSize]; 205 | read(tx, lastBlock * blockSize, readBytes, Long.MAX_VALUE, null); 206 | System.arraycopy(write, position, readBytes, 0, lastBlockLength); 207 | tx.set(lastBlockKey, readBytes); 208 | } 209 | } 210 | return CompletableFuture.completedFuture(null); 211 | }); 212 | } 213 | 214 | public CompletableFuture usage() { 215 | return usedBlocks.count().thenApply(usedBlocks -> usedBlocks * blockSize); 216 | } 217 | 218 | /** 219 | * Read latest blocks. 220 | * 221 | * @param read 222 | * @param offset 223 | * @return 224 | */ 225 | public CompletableFuture read(byte[] read, long offset) { 226 | return read(read, offset, Long.MAX_VALUE); 227 | } 228 | 229 | /** 230 | * Read blocks as of a particular timestamp. 
231 | * 232 | * @param read 233 | * @param offset 234 | * @param timestamp 235 | * @return 236 | */ 237 | public CompletableFuture read(byte[] read, long offset, long timestamp) { 238 | return database.runAsync(tx -> { 239 | read(tx, offset, read, timestamp, null); 240 | return CompletableFuture.completedFuture(null); 241 | }); 242 | } 243 | 244 | static class BlocksRead { 245 | private final int total; 246 | private Set blocksRead; 247 | 248 | BlocksRead(int total) { 249 | this.total = total; 250 | blocksRead = new HashSet<>(total); 251 | } 252 | 253 | boolean done() { 254 | return blocksRead.size() == total; 255 | } 256 | 257 | boolean read(long block) { 258 | return blocksRead.add(block); 259 | } 260 | } 261 | 262 | private void read(ReadTransaction tx, long offset, byte[] read, long readTimestamp, BlocksRead blocksRead) { 263 | long snapshotTimestamp = snapshot == null ? readTimestamp : Math.min(readTimestamp, snapshot); 264 | long firstBlock = offset / blockSize; 265 | int blockOffset = (int) (offset % blockSize); 266 | int length = read.length; 267 | long lastBlock = (offset + length) / blockSize; 268 | long currentBlockId = -1; 269 | byte[] currentValue = null; 270 | if (parentArray != null && blocksRead == null) { 271 | blocksRead = new BlocksRead((int) (lastBlock - firstBlock + 1)); 272 | } 273 | for (KeyValue keyValue : tx.getRange(data.get(firstBlock).pack(), data.get(lastBlock + 1).pack())) { 274 | Tuple keyTuple = data.unpack(keyValue.getKey()); 275 | long blockId = keyTuple.getLong(0); 276 | if (blockId != currentBlockId && currentBlockId != -1) { 277 | // Only copy blocks that we are going to use 278 | copy(read, firstBlock, blockOffset, lastBlock, currentValue, currentBlockId, blocksRead); 279 | currentValue = null; 280 | } 281 | // Advance the current block id 282 | currentBlockId = blockId; 283 | // Update the current value with the latest value not written after the snapshot timestamp 284 | long timestamp = keyTuple.getLong(1); 285 | if 
(timestamp <= snapshotTimestamp) { 286 | currentValue = keyValue.getValue(); 287 | } 288 | } 289 | copy(read, firstBlock, blockOffset, lastBlock, currentValue, currentBlockId, blocksRead); 290 | if (parentArray != null && !blocksRead.done()) { 291 | // This is currently less efficient than I would like. Basically you should do the other reads 292 | // and only call the parent when there are gaps. Instead, we are calling all parents for 293 | // all reads and that just scales poorly as you make a deeper hierarchy. 294 | parentArray.read(tx, offset, read, snapshotTimestamp, blocksRead); 295 | } 296 | } 297 | 298 | private void copy(byte[] read, long firstBlock, int blockOffset, long lastBlock, byte[] currentValue, long blockId, BlocksRead blocksRead) { 299 | if (currentValue != null) { 300 | if (blocksRead == null || blocksRead.read(blockId)) { 301 | int blockPosition = (int) ((blockId - firstBlock) * blockSize); 302 | int shift = blockSize - blockOffset; 303 | if (blockId == firstBlock) { 304 | int firstBlockLength = Math.min(shift, read.length); 305 | System.arraycopy(currentValue, blockOffset, read, 0, firstBlockLength); 306 | } else { 307 | int position = blockPosition - blockSize + shift; 308 | if (blockId == lastBlock) { 309 | int lastLength = read.length - position; 310 | System.arraycopy(currentValue, 0, read, position, lastLength); 311 | } else { 312 | System.arraycopy(currentValue, 0, read, position, blockSize); 313 | } 314 | } 315 | } 316 | } 317 | } 318 | 319 | public FDBArray snapshot() { 320 | return snapshot(System.currentTimeMillis()); 321 | } 322 | 323 | public FDBArray snapshot(long timestamp) { 324 | return new FDBArray(database, ds, timestamp); 325 | } 326 | 327 | public FDBArray snapshot(String name) { 328 | database.run(tx -> { 329 | tx.mutate(MutationType.ADD, dependents, ONE); 330 | return null; 331 | }); 332 | List childDirectory = asList("com.sampullara.fdb.array", name); 333 | DirectorySubspace childDs = 
get(DirectoryLayer.getDefault().create(database, childDirectory)); 334 | FDBArray.create(database, childDs, blockSize, ds, System.currentTimeMillis()); 335 | return new FDBArray(database, childDs); 336 | } 337 | 338 | public void clear() { 339 | database.run((Function) tx -> { 340 | tx.clear(data.pack()); 341 | usedBlocks.clear(tx); 342 | return null; 343 | }); 344 | } 345 | 346 | private void dependentDeleted() { 347 | database.run(tx -> { 348 | tx.mutate(MutationType.ADD, dependents, MINUS_ONE); 349 | return null; 350 | }); 351 | } 352 | 353 | public void delete() { 354 | boolean deletable = database.run(tx -> { 355 | byte[] bytes = get(tx.get(dependents)); 356 | return bytes == null || Longs.fromByteArray(bytes) == 0; 357 | }); 358 | if (deletable) { 359 | if (parentArray != null) parentArray.dependentDeleted(); 360 | get(ds.remove(database)); 361 | } else { 362 | throw new IllegalStateException("Array still has dependents"); 363 | } 364 | } 365 | 366 | public void setMetadata(byte[] key, byte[] value) { 367 | database.run(tx -> { 368 | tx.set(metadata.get(key).pack(), value); 369 | return null; 370 | }); 371 | } 372 | 373 | public byte[] getMetadata(byte[] key) { 374 | byte[] value = database.run(tx -> get(tx.get(metadata.get(key).pack()))); 375 | return value == null ? parentArray == null ? null : parentArray.getMetadata(key) : value; 376 | } 377 | } 378 | -------------------------------------------------------------------------------- /src/main/java/nbdfdb/FDBBitSet.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Sam Pullara 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | */ 17 | 18 | package nbdfdb; 19 | 20 | import com.apple.foundationdb.*; 21 | import com.apple.foundationdb.subspace.Subspace; 22 | import org.roaringbitmap.RoaringBitmap; 23 | import org.roaringbitmap.buffer.ImmutableRoaringBitmap; 24 | import org.roaringbitmap.buffer.MutableRoaringBitmap; 25 | 26 | import java.nio.ByteBuffer; 27 | import java.util.concurrent.CompletableFuture; 28 | 29 | public class FDBBitSet { 30 | private final Database database; 31 | private final Subspace subspace; 32 | private final int blockSize; 33 | private final byte[] allSetBytes; 34 | private final Range subspaceRange; 35 | 36 | protected FDBBitSet(Database database, Subspace subspace, int blockSize) { 37 | this.database = database; 38 | this.subspace = subspace; 39 | this.blockSize = blockSize; 40 | RoaringBitmap allSet = new RoaringBitmap(); 41 | allSet.add(0L, blockSize * 8L); 42 | ByteBuffer byteBuffer = ByteBuffer.allocate(allSet.serializedSizeInBytes()); 43 | allSet.serialize(byteBuffer); 44 | allSetBytes = byteBuffer.array(); 45 | subspaceRange = Range.startsWith(subspace.pack()); 46 | } 47 | 48 | public CompletableFuture set(long startBit, long endBit) { 49 | return database.runAsync(tx -> { 50 | FDBBitSet.this.set(tx, startBit, endBit); 51 | return CompletableFuture.completedFuture(null); 52 | }); 53 | } 54 | 55 | protected void set(Transaction tx, long startBit, long endBit) { 56 | // TODO: need to do something if the bitset is too big for an FDB value 57 | MutableRoaringBitmap bitSet = new MutableRoaringBitmap(); 58 | 
bitSet.add(startBit, endBit + 1); 59 | int capacity = bitSet.serializedSizeInBytes(); 60 | assert capacity <= 100_000; 61 | ByteBuffer byteBuffer = ByteBuffer.allocate(capacity); 62 | bitSet.serialize(byteBuffer); 63 | byte[] bytes = byteBuffer.array(); 64 | tx.set(subspace.pack(), bytes); 65 | } 66 | 67 | public CompletableFuture count() { 68 | return database.runAsync(tx -> { 69 | long count = 0; 70 | for (KeyValue keyValue : tx.getRange(subspaceRange)) { 71 | ByteBuffer byteBuffer = ByteBuffer.wrap(keyValue.getValue()); 72 | ImmutableRoaringBitmap bitSet = new ImmutableRoaringBitmap(byteBuffer); 73 | count += bitSet.getLongCardinality(); 74 | } 75 | return CompletableFuture.completedFuture(count); 76 | }); 77 | } 78 | 79 | public void clear() { 80 | database.run(tx -> { 81 | tx.clear(subspaceRange); 82 | return null; 83 | }); 84 | } 85 | 86 | public void clear(Transaction tx) { 87 | tx.clear(subspaceRange); 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /src/main/java/nbdfdb/FDBStorage.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Sam Pullara 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
*
 */

package nbdfdb;

import com.apple.foundationdb.Database;
import com.apple.foundationdb.FDB;
import com.google.common.primitives.Longs;

import java.util.Timer;
import java.util.TimerTask;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.LongAdder;
import java.util.logging.Level;
import java.util.logging.Logger;

import static java.util.concurrent.TimeUnit.*;

/**
 * {@link Storage} backed by an {@link FDBArray}.
 *
 * <p>A connected volume holds a lease: a timestamp in the array metadata that is
 * refreshed every 30 seconds and considered stale after one minute.
 */
public class FDBStorage implements Storage {
    private static final Logger log = Logger.getLogger(FDBStorage.class.getName());

    private static final long _30_SECONDS = MILLISECONDS.convert(30, SECONDS);
    private static final long _1_MINUTE = MILLISECONDS.convert(1, MINUTES);
    private static final byte[] ZERO = Longs.toByteArray(0);

    private static final Database db = FDB.selectAPIVersion(510).open();
    private static final ExecutorService es = Executors.newFixedThreadPool(1, r -> new Thread(r, "fdbstorage-flush"));
    private static final Timer timer = new Timer("connection-leases");

    private final FDBArray fdbArray;
    // Writes handed to FDB / writes that have finished (successfully or not)
    private final LongAdder writesStarted;
    private final LongAdder writesComplete;
    private final long size;
    private final String exportName;

    private TimerTask leaseTask;

    /**
     * Opens the volume named {@code exportName}.
     *
     * @throws IllegalArgumentException if the volume has no size configured
     */
    public FDBStorage(String exportName) {
        this.exportName = exportName;
        writesStarted = new LongAdder();
        writesComplete = new LongAdder();
        fdbArray = FDBArray.open(db, exportName);
        byte[] sizeBytes = fdbArray.getMetadata(NBD.SIZE_KEY);
        if (sizeBytes == null) {
            throw new IllegalArgumentException("Size of volume not configured");
        }
        size = Longs.fromByteArray(sizeBytes);
    }

    /**
     * Takes the lease on the volume and starts renewing it periodically.
     *
     * <p>The lease is read and written in separate transactions.
     *
     * @throws IllegalStateException if a lease younger than one minute exists
     */
    @Override
    public synchronized void connect() {
        byte[] lease = fdbArray.getMetadata(NBD.LEASE_KEY);
        if (lease == null || (System.currentTimeMillis() - Longs.fromByteArray(lease) > _1_MINUTE)) {
            if (leaseTask != null) {
                leaseTask.cancel();
            }
            leaseTask = new TimerTask() {
                @Override
                public void run() {
                    try {
                        fdbArray.setMetadata(NBD.LEASE_KEY, Longs.toByteArray(System.currentTimeMillis()));
                    } catch (RuntimeException e) {
                        // An exception escaping a TimerTask terminates the Timer thread,
                        // and this Timer is shared by every volume. Log and let the
                        // next period try again.
                        log.log(Level.WARNING, "Failed to renew lease for " + exportName, e);
                    }
                }
            };
            timer.schedule(leaseTask, 0, _30_SECONDS);
        } else {
            throw new IllegalStateException("Volume " + exportName + " is already leased");
        }
    }

    /**
     * Stops renewing the lease and releases it.
     *
     * @throws IllegalStateException if {@link #connect()} was never called successfully
     */
    @Override
    public synchronized void disconnect() {
        if (leaseTask != null) {
            leaseTask.cancel();
            fdbArray.setMetadata(NBD.LEASE_KEY, ZERO);
        } else {
            throw new IllegalStateException("Not connected to " + exportName);
        }
    }

    @Override
    public CompletableFuture<Void> read(byte[] buffer, long offset) {
        return fdbArray.read(buffer, offset);
    }

    /**
     * Writes {@code buffer} at {@code offset}. The returned future carries the outcome
     * of the write; either way the write is counted as finished so that a pending
     * {@link #flush()} is released.
     */
    @Override
    public CompletableFuture<Void> write(byte[] buffer, long offset) {
        writesStarted.increment();
        CompletableFuture<Void> written;
        try {
            written = fdbArray.write(buffer, offset);
        } catch (RuntimeException e) {
            // The write never started; balance the counter before propagating
            writeFinished();
            throw e;
        }
        // whenComplete (not thenRun) so that a FAILED write is counted as well;
        // otherwise one failure would make every later flush() wait forever.
        return written.whenComplete((ignored, error) -> writeFinished());
    }

    /** Records one finished write and wakes up any waiting flush. */
    private void writeFinished() {
        writesComplete.increment();
        synchronized (writesComplete) {
            writesComplete.notifyAll();
        }
    }

    /**
     * Waits for every write that was started before the flush began.
     *
     * @return a future completed from the flush thread once those writes have finished
     */
    @Override
    public CompletableFuture<Void> flush() {
        CompletableFuture<Void> result = new CompletableFuture<>();
        es.submit(() -> {
            boolean interrupted = false;
            synchronized (writesComplete) {
                long target = writesStarted.longValue();
                while (target > writesComplete.longValue()) {
                    try {
                        writesComplete.wait();
                    } catch (InterruptedException e) {
                        // Keep waiting, the flush must not complete early; the
                        // interrupt is restored below.
                        interrupted = true;
                    }
                }
            }
            if (interrupted) {
                Thread.currentThread().interrupt();
            }
            result.complete(null);
        });
        return result;
    }

    @Override
    public long size() {
        return size;
    }

    /**
     * Returns the estimated number of bytes in use, blocking until it is available.
     */
    @Override
    public long usage() {
        try {
            return fdbArray.usage().get();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        } catch (ExecutionException e) {
            throw new RuntimeException(e);
        }
    }
}
--------------------------------------------------------------------------------
/src/main/java/nbdfdb/NBD.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Sam Pullara 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | */ 17 | 18 | package nbdfdb; 19 | 20 | import com.google.common.primitives.Ints; 21 | import com.google.common.primitives.Longs; 22 | 23 | public class NBD { 24 | public static final int NBD_OK = 0 ; /* OK */ 25 | public static final byte[] NBD_OK_BYTES = new byte[4]; 26 | 27 | public static final int NBD_FLAG_HAS_FLAGS = (1 << 0); /* Flags are there */ 28 | public static final int NBD_FLAG_READ_ONLY = (1 << 1); /* Device is read-only */ 29 | public static final int NBD_FLAG_SEND_FLUSH = (1 << 2); /* Send FLUSH */ 30 | public static final int NBD_FLAG_SEND_FUA = (1 << 3); /* Send FUA (Force Unit Access) */ 31 | public static final int NBD_FLAG_ROTATIONAL = (1 << 4); /* Use elevator algorithm - rotational media */ 32 | public static final int NBD_FLAG_SEND_TRIM = (1 << 5); /* Send TRIM (discard) */ 33 | 34 | public static final int NBD_REQUEST_MAGIC = 0x25609513; 35 | public static final byte[] NBD_REQUEST_MAGIC_BYTES = Ints.toByteArray(NBD_REQUEST_MAGIC); 36 | public static final int NBD_REPLY_MAGIC = 0x67446698; 37 | public static final byte[] NBD_REPLY_MAGIC_BYTES = Ints.toByteArray(NBD_REPLY_MAGIC); 38 | 39 | public static final byte[] INIT_PASSWD = "NBDMAGIC".getBytes(); 40 | 41 | public static final long 
CLISERV_MAGIC = 0x00420281861253L; 42 | public static final byte[] CLISERV_MAGIC_BYTES = Longs.toByteArray(CLISERV_MAGIC); 43 | public static final long OPTS_MAGIC = 0x49484156454F5054L; 44 | public static final byte[] OPTS_MAGIC_BYTES = Longs.toByteArray(OPTS_MAGIC); 45 | public static final long REP_MAGIC = 0x3e889045565a9L; 46 | public static final byte[] REP_MAGIC_BYTES = Longs.toByteArray(REP_MAGIC); 47 | public static final byte[] EMPTY_124 = new byte[124]; 48 | 49 | // FDB Keys 50 | public static final byte[] SIZE_KEY = "size".getBytes(); 51 | public static final byte[] LEASE_KEY = "lease".getBytes(); 52 | 53 | enum Command { 54 | READ, 55 | WRITE, 56 | DISCONNECT, 57 | FLUSH, 58 | TRIM, 59 | CACHE 60 | } 61 | 62 | public static final int NBD_OPT_EXPORT_NAME = 1; 63 | public static final int NBD_OPT_ABORT = 2; 64 | public static final int NBD_OPT_LIST = 3; 65 | } 66 | -------------------------------------------------------------------------------- /src/main/java/nbdfdb/NBDServer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Sam Pullara 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | * 16 | */ 17 | 18 | package nbdfdb; 19 | 20 | import com.google.common.base.Charsets; 21 | import com.sampullara.cli.Argument; 22 | 23 | import java.io.BufferedOutputStream; 24 | import java.io.DataInputStream; 25 | import java.io.DataOutputStream; 26 | import java.io.IOException; 27 | import java.net.InetSocketAddress; 28 | import java.net.ServerSocket; 29 | import java.net.Socket; 30 | import java.util.concurrent.ExecutorService; 31 | import java.util.concurrent.Executors; 32 | import java.util.logging.Level; 33 | import java.util.logging.Logger; 34 | 35 | import static nbdfdb.NBD.*; 36 | 37 | public class NBDServer { 38 | 39 | private static Logger log = Logger.getLogger("NBD"); 40 | 41 | @Argument(alias = "p", description = "The server port to listen on for connections") 42 | private static Integer port = 10809; 43 | 44 | public static void main(String[] args) throws IOException { 45 | ExecutorService es = Executors.newCachedThreadPool(); 46 | log.info("Listening for nbd-client connections"); 47 | ServerSocket ss = new ServerSocket(port); 48 | while (true) { 49 | Socket accept = ss.accept(); 50 | es.submit(() -> { 51 | try { 52 | InetSocketAddress remoteSocketAddress = (InetSocketAddress) accept.getRemoteSocketAddress(); 53 | log.info("Client connected from: " + remoteSocketAddress.getAddress().getHostAddress()); 54 | DataInputStream in = new DataInputStream(accept.getInputStream()); 55 | DataOutputStream out = new DataOutputStream(new BufferedOutputStream(accept.getOutputStream())); 56 | 57 | out.write(INIT_PASSWD); 58 | out.write(OPTS_MAGIC_BYTES); 59 | out.writeShort(NBD_FLAG_HAS_FLAGS); 60 | out.flush(); 61 | 62 | // TODO: interpret the client flags. 
63 | int clientFlags = in.readInt(); 64 | long magic = in.readLong(); 65 | int opt = in.readInt(); 66 | if (opt != NBD_OPT_EXPORT_NAME) { 67 | throw new RuntimeException("We support only EXPORT options"); 68 | } 69 | int length = in.readInt(); 70 | byte[] bytes = new byte[length]; 71 | in.readFully(bytes); 72 | String exportName = new String(bytes, Charsets.UTF_8); 73 | log.info("Connecting client to " + exportName); 74 | NBDVolumeServer nbdVolumeServer = new NBDVolumeServer(exportName, in, out); 75 | log.info("Volume mounted"); 76 | nbdVolumeServer.run(); 77 | } catch (Throwable e) { 78 | log.log(Level.SEVERE, "Failed to connect", e); 79 | try { 80 | accept.close(); 81 | } catch (IOException e1) { 82 | e1.printStackTrace(); 83 | } 84 | } 85 | }); 86 | } 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /src/main/java/nbdfdb/NBDVolumeServer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Sam Pullara 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | * 16 | */ 17 | 18 | package nbdfdb; 19 | 20 | import com.google.common.primitives.UnsignedInteger; 21 | import com.google.common.primitives.UnsignedLong; 22 | import nbdfdb.NBD.*; 23 | 24 | import java.io.DataInputStream; 25 | import java.io.DataOutputStream; 26 | import java.io.IOException; 27 | import java.util.concurrent.CompletableFuture; 28 | import java.util.logging.Level; 29 | import java.util.logging.Logger; 30 | 31 | import static nbdfdb.NBD.*; 32 | 33 | /** 34 | * Created by sam on 11/9/14. 35 | */ 36 | public class NBDVolumeServer implements Runnable { 37 | 38 | private final Logger log; 39 | 40 | private final DataInputStream in; 41 | private final DataOutputStream out; 42 | private final String exportName; 43 | private final Storage storage; 44 | 45 | public NBDVolumeServer(String exportName, DataInputStream in, DataOutputStream out) throws IOException { 46 | this.exportName = exportName; 47 | log = Logger.getLogger("NDB: " + exportName); 48 | storage = new FDBStorage(exportName); 49 | log.info("Mounting " + exportName + " of size " + storage.size()); 50 | storage.connect(); 51 | this.in = in; 52 | this.out = out; 53 | } 54 | 55 | private void writeReplyHeaderAndFlush(long handle) throws IOException { 56 | synchronized (out) { 57 | out.write(NBD_REPLY_MAGIC_BYTES); 58 | out.write(NBD_OK_BYTES); 59 | out.writeLong(handle); 60 | out.flush(); 61 | } 62 | } 63 | 64 | @Override 65 | public void run() { 66 | try { 67 | out.writeLong(storage.size()); 68 | out.writeShort(NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_FLUSH); 69 | out.write(EMPTY_124); 70 | out.flush(); 71 | 72 | while (true) { 73 | int requestMagic = in.readInt();// MAGIC 74 | if (requestMagic != NBD_REQUEST_MAGIC) { 75 | throw new IllegalArgumentException("Invalid magic number for request: " + requestMagic); 76 | } 77 | Command requestType = Command.values()[in.readInt()]; 78 | long handle = in.readLong(); 79 | UnsignedLong offset = UnsignedLong.fromLongBits(in.readLong()); 80 | UnsignedInteger 
requestLength = UnsignedInteger.fromIntBits(in.readInt()); 81 | if (requestLength.longValue() > Integer.MAX_VALUE) { 82 | // We could ultimately support this but it isn't common by any means 83 | throw new IllegalArgumentException("Failed to read, length too long: " + requestLength); 84 | } 85 | switch (requestType) { 86 | case READ: { 87 | byte[] buffer = new byte[requestLength.intValue()]; 88 | log.info("Reading " + buffer.length + " from " + offset); 89 | storage.read(buffer, offset.intValue()).thenApply($ -> { 90 | synchronized (out) { 91 | try { 92 | out.write(NBD_REPLY_MAGIC_BYTES); 93 | out.write(NBD_OK_BYTES); 94 | out.writeLong(handle); 95 | out.write(buffer); 96 | out.flush(); 97 | } catch (IOException e) { 98 | throw new RuntimeException(e); 99 | } 100 | } 101 | return null; 102 | }); 103 | break; 104 | } 105 | case WRITE: { 106 | byte[] buffer = new byte[requestLength.intValue()]; 107 | in.readFully(buffer); 108 | log.info("Writing " + buffer.length + " to " + offset); 109 | storage.write(buffer, offset.intValue()).thenApply($ -> { 110 | try { 111 | writeReplyHeaderAndFlush(handle); 112 | } catch (IOException e) { 113 | throw new RuntimeException(e); 114 | } 115 | return null; 116 | }); 117 | break; 118 | } 119 | case DISCONNECT: 120 | log.info("Disconnecting " + exportName); 121 | storage.disconnect(); 122 | return; 123 | case FLUSH: 124 | log.info("Flushing"); 125 | long start = System.currentTimeMillis(); 126 | storage.flush().thenApply($ -> { 127 | try { 128 | writeReplyHeaderAndFlush(handle); 129 | } catch (IOException e) { 130 | throw new RuntimeException(e); 131 | } 132 | log.info("Flush complete: " + (System.currentTimeMillis() - start) + "ms"); 133 | return null; 134 | }); 135 | break; 136 | case TRIM: 137 | log.warning("Trim unimplemented"); 138 | writeReplyHeaderAndFlush(handle); 139 | break; 140 | case CACHE: 141 | log.warning("Cache unimplemented"); 142 | break; 143 | } 144 | } 145 | } catch (Exception e) { 146 | log.log(Level.SEVERE, 
"Unmounting volume " + exportName, e); 147 | } 148 | } 149 | } 150 | -------------------------------------------------------------------------------- /src/main/java/nbdfdb/Storage.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Sam Pullara 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | */ 17 | 18 | package nbdfdb; 19 | 20 | import java.util.concurrent.CompletableFuture; 21 | 22 | /** 23 | * Created by sam on 11/11/14. 24 | */ 25 | public interface Storage { 26 | void connect(); 27 | 28 | void disconnect(); 29 | 30 | CompletableFuture read(byte[] buffer, long offset); 31 | 32 | CompletableFuture write(byte[] buffer, long offset); 33 | 34 | CompletableFuture flush(); 35 | 36 | long size(); 37 | 38 | long usage(); 39 | } 40 | -------------------------------------------------------------------------------- /src/main/java/nbdfdb/cli/CreateCommand.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Sam Pullara 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | */ 17 | 18 | package nbdfdb.cli; 19 | 20 | import com.apple.foundationdb.FDB; 21 | import com.google.common.primitives.Longs; 22 | import com.sampullara.cli.Argument; 23 | import nbdfdb.FDBArray; 24 | 25 | import java.util.regex.Matcher; 26 | import java.util.regex.Pattern; 27 | 28 | import static nbdfdb.NBD.SIZE_KEY; 29 | 30 | public class CreateCommand implements Runnable { 31 | FDB fdb = FDB.selectAPIVersion(510); 32 | 33 | @Argument(alias = "n", description = "Name of the volume to create", required = true) 34 | private String exportName; 35 | 36 | @Argument(alias = "s", description = "Size in bytes of the volume, can use K, M, G, or T units.", required = true) 37 | private String size; 38 | 39 | @Argument(alias = "b", description = "Block size of the volume") 40 | private Integer blockSize = 512; 41 | 42 | enum Unit { 43 | K(1_000L), 44 | M(1_000_000L), 45 | G(1_000_000_000L), 46 | T(1_000_000_000_000L),; 47 | 48 | final long factor; 49 | 50 | Unit(long factor) { 51 | this.factor = factor; 52 | } 53 | } 54 | 55 | @Override 56 | public void run() { 57 | Pattern pattern = Pattern.compile("([0-9]+)(k|K|m|M|g|G|t|T)?(b|B)?"); 58 | Matcher matcher = pattern.matcher(size); 59 | if (matcher.matches()) { 60 | long value = Long.parseLong(matcher.group(1)); 61 | String unitGroup = matcher.group(2); 62 | if (unitGroup != null) { 63 | Unit unit = Unit.valueOf(unitGroup.toUpperCase()); 64 | value *= unit.factor; 65 | } 66 | FDBArray fdbArray = FDBArray.create(fdb.open(), exportName, blockSize); 67 | 
fdbArray.setMetadata(SIZE_KEY, Longs.toByteArray(value)); 68 | System.out.println("Successfully created '" + exportName + "'"); 69 | } else { 70 | System.err.println("Invalid size: " + size); 71 | } 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/main/java/nbdfdb/cli/DeleteCommand.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Sam Pullara 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | */ 17 | 18 | package nbdfdb.cli; 19 | 20 | import com.apple.foundationdb.FDB; 21 | import com.sampullara.cli.Argument; 22 | import nbdfdb.FDBArray; 23 | 24 | /** 25 | * Created by sam on 11/9/14. 
26 | */ 27 | public class DeleteCommand implements Runnable { 28 | 29 | FDB fdb = FDB.selectAPIVersion(510); 30 | 31 | @Argument(alias = "n", description = "Name of the volume to delete", required = true) 32 | private String exportName; 33 | 34 | @Override 35 | public void run() { 36 | FDBArray fdbArray = FDBArray.open(fdb.open(), exportName); 37 | fdbArray.delete(); 38 | System.out.println("Deleted volume " + exportName); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/java/nbdfdb/cli/ListCommand.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Sam Pullara 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | * 16 | */ 17 | 18 | package nbdfdb.cli; 19 | 20 | import com.apple.foundationdb.Database; 21 | import com.apple.foundationdb.FDB; 22 | import com.apple.foundationdb.directory.DirectoryLayer; 23 | import nbdfdb.FDBStorage; 24 | 25 | import java.util.Arrays; 26 | import java.util.Collections; 27 | import java.util.List; 28 | import java.util.concurrent.ExecutionException; 29 | 30 | import static java.util.Collections.singletonList; 31 | 32 | public class ListCommand implements Runnable { 33 | FDB fdb = FDB.selectAPIVersion(510); 34 | 35 | @Override 36 | public void run() { 37 | Database db = fdb.open(); 38 | List exportNames = null; 39 | try { 40 | exportNames = DirectoryLayer.getDefault().list(db, singletonList("com.sampullara.fdb.array")).get(); 41 | } catch (ExecutionException | InterruptedException e) { 42 | throw new RuntimeException(e); 43 | } 44 | for (String exportName : exportNames) { 45 | FDBStorage fdbStorage = new FDBStorage(exportName); 46 | System.out.printf("%s: %d/%d %2.1f\n", exportName, fdbStorage.usage(), fdbStorage.size(), (double)fdbStorage.usage()/fdbStorage.size()*100); 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/nbdfdb/cli/NBDCLI.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Sam Pullara 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | * 16 | */ 17 | 18 | package nbdfdb.cli; 19 | 20 | import com.sampullara.cli.Args; 21 | 22 | public class NBDCLI { 23 | 24 | enum CommandType { 25 | CREATE(new CreateCommand()), 26 | DELETE(new DeleteCommand()), 27 | SNAPSHOT(new SnapshotCommand()), 28 | LIST(new ListCommand()), 29 | SERVER(new ServerCommand()), 30 | ; 31 | private final Runnable command; 32 | 33 | CommandType(Runnable command) { 34 | this.command = command; 35 | } 36 | 37 | void run(String[] args) { 38 | Args.parseOrExit(command, args); 39 | command.run(); 40 | } 41 | } 42 | 43 | public static void main(String[] args) { 44 | try { 45 | String arg = args[0]; 46 | CommandType commandType = CommandType.valueOf(arg.toUpperCase()); 47 | commandType.run(args); 48 | System.exit(0); 49 | } catch (IllegalArgumentException | ArrayIndexOutOfBoundsException e) { 50 | e.printStackTrace(); 51 | System.err.println("Usage: nbdfdb.cli.NBDCLI [command name]"); 52 | for (CommandType commandType : CommandType.values()) { 53 | System.err.print(commandType.name().toLowerCase() + ": "); 54 | Args.usage(commandType.command); 55 | } 56 | System.exit(1); 57 | } 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/java/nbdfdb/cli/ServerCommand.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Sam Pullara 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | * 16 | */ 17 | 18 | package nbdfdb.cli; 19 | 20 | import nbdfdb.NBDServer; 21 | 22 | import java.io.IOException; 23 | 24 | public class ServerCommand implements Runnable { 25 | @Override 26 | public void run() { 27 | try { 28 | NBDServer.main(new String[0]); 29 | } catch (IOException e) { 30 | e.printStackTrace(); 31 | System.exit(1); 32 | } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/main/java/nbdfdb/cli/SnapshotCommand.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Sam Pullara 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | * 16 | */ 17 | 18 | package nbdfdb.cli; 19 | 20 | import com.apple.foundationdb.Database; 21 | import com.apple.foundationdb.FDB; 22 | import com.sampullara.cli.Argument; 23 | import nbdfdb.FDBArray; 24 | 25 | public class SnapshotCommand implements Runnable { 26 | FDB fdb = FDB.selectAPIVersion(510); 27 | 28 | @Argument(alias = "n", description = "Name of the snapshot volume to create", required = true) 29 | private String exportName; 30 | 31 | @Argument(alias = "v", description = "Name of the volume to snapshot", required = true) 32 | private String volumeName; 33 | 34 | @Override 35 | public void run() { 36 | Database db = fdb.open(); 37 | FDBArray volume = FDBArray.open(db, volumeName); 38 | volume.snapshot(exportName); 39 | System.out.println("Successfully snapshotted " + volumeName + " as " + exportName); 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/test/java/nbdfdb/FDBArrayTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Sam Pullara 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | * 16 | */ 17 | 18 | package nbdfdb; 19 | 20 | import com.apple.foundationdb.Database; 21 | import com.apple.foundationdb.FDB; 22 | import org.HdrHistogram.Histogram; 23 | import org.junit.*; 24 | 25 | import java.util.ArrayList; 26 | import java.util.Arrays; 27 | import java.util.List; 28 | import java.util.Random; 29 | import java.util.concurrent.ExecutionException; 30 | import java.util.concurrent.Semaphore; 31 | 32 | import static org.junit.Assert.*; 33 | 34 | public class FDBArrayTest { 35 | 36 | private static FDBArray fdbArray; 37 | 38 | @BeforeClass 39 | public static void setup() { 40 | FDB fdb = FDB.selectAPIVersion(510); 41 | Database db = fdb.open(); 42 | FDBArray.create(db, "testArray", 512); 43 | fdbArray = FDBArray.open(db, "testArray"); 44 | } 45 | 46 | @AfterClass 47 | public static void cleanup() { 48 | fdbArray.delete(); 49 | } 50 | 51 | @After 52 | @Before 53 | public void delete() { 54 | fdbArray.clear(); 55 | } 56 | 57 | @Test 58 | public void testSimpleReadWrite() throws ExecutionException, InterruptedException { 59 | byte[] bytes = new byte[12345]; 60 | Arrays.fill(bytes, (byte) 1); 61 | fdbArray.write(bytes, 10000).get(); 62 | byte[] read = new byte[12345]; 63 | fdbArray.read(read, 10000).get(); 64 | assertArrayEquals(bytes, read); 65 | assertEquals((12345 / 512 + 1) * 512, fdbArray.usage().get().longValue()); 66 | } 67 | 68 | @Test 69 | public void testReadOnly() throws ExecutionException, InterruptedException { 70 | byte[] bytes = new byte[12345]; 71 | Arrays.fill(bytes, (byte) 1); 72 | fdbArray.write(bytes, 10000).get(); 73 | FDBArray snapshot = fdbArray.snapshot(); 74 | 75 | byte[] read = new byte[12345]; 76 | snapshot.read(read, 10000).get(); 77 | assertArrayEquals(bytes, read); 78 | 79 | try { 80 | snapshot.write(bytes, 10000).get(); 81 | fail("Should be read only"); 82 | } catch (IllegalStateException ise) { 83 | // Read only 84 | } 85 | } 86 | 87 | @Test 88 | public void testSnapshots() throws InterruptedException, 
ExecutionException { 89 | Random r = new Random(1337); 90 | byte[] bytes = new byte[12345]; 91 | r.nextBytes(bytes); 92 | fdbArray.write(bytes, 10000).get(); 93 | byte[] read = new byte[12345]; 94 | fdbArray.read(read, 10000).get(); 95 | assertArrayEquals(bytes, read); 96 | long timestamp = System.currentTimeMillis(); 97 | Thread.sleep(10); 98 | byte[] nextBytes = new byte[12345]; 99 | r.nextBytes(nextBytes); 100 | fdbArray.write(nextBytes, 10000).get(); 101 | fdbArray.read(read, 10000); 102 | assertArrayEquals(nextBytes, read); 103 | fdbArray.read(read, 10000, timestamp); 104 | assertArrayEquals(bytes, read); 105 | 106 | byte[] empty = new byte[12345]; 107 | byte[] readEmpty = new byte[12345]; 108 | fdbArray.read(readEmpty, 10000, 0).get(); 109 | assertArrayEquals(readEmpty, empty); 110 | } 111 | 112 | @Test 113 | public void testParent() throws ExecutionException, InterruptedException { 114 | Random r = new Random(1337); 115 | byte[] parentBytes = new byte[2000]; 116 | r.nextBytes(parentBytes); 117 | fdbArray.write(parentBytes, 1000).get(); 118 | byte[] parentRead = new byte[2000]; 119 | fdbArray.read(parentRead, 1000).get(); 120 | assertArrayEquals(parentBytes, parentRead); 121 | 122 | // Should start with a snapshot of the parent, need to delete first for testing 123 | FDBArray fdbChildArray = fdbArray.snapshot("testChildArray"); 124 | try { 125 | byte[] childRead = new byte[2000]; 126 | fdbChildArray.read(childRead, 1000).get(); 127 | assertArrayEquals(parentBytes, childRead); 128 | 129 | byte[] childBytes = new byte[1000]; 130 | r.nextBytes(childBytes); 131 | fdbChildArray.write(childBytes, 1500).get(); 132 | 133 | byte[] mixedRead = new byte[2000]; 134 | fdbChildArray.read(mixedRead, 1000).get(); 135 | 136 | for (int i = 0; i < 500; i++) { 137 | assertEquals("Failed: " + i, parentBytes[i], mixedRead[i]); 138 | } 139 | for (int i = 500; i < 1500; i++) { 140 | assertEquals("Failed: " + i, childBytes[i - 500], mixedRead[i]); 141 | } 142 | for (int i = 1500; i < 
2000; i++) { 143 | assertEquals("Failed: " + i, parentBytes[i], mixedRead[i]); 144 | } 145 | } finally { 146 | fdbChildArray.delete(); 147 | } 148 | } 149 | 150 | @Test 151 | @Ignore 152 | public void testRandomReadWrite() throws ExecutionException, InterruptedException { 153 | Random r = new Random(1337); 154 | for (int i = 0; i < 1000; i++) { 155 | int length = r.nextInt(10000); 156 | byte[] bytes = new byte[length]; 157 | r.nextBytes(bytes); 158 | int offset = r.nextInt(100000); 159 | fdbArray.write(bytes, offset).get(); 160 | byte[] read = new byte[length]; 161 | fdbArray.read(read, offset).get(); 162 | assertArrayEquals("Iteration: " + i + ", " + length + ", " + offset, bytes, read); 163 | } 164 | assertEquals((110000 / 512 + 1) * 512, fdbArray.usage().get().longValue()); 165 | } 166 | 167 | @Test 168 | @Ignore 169 | public void testRandomReadWriteBenchmark() throws ExecutionException, InterruptedException { 170 | List arrays = new ArrayList<>(); 171 | FDBArray fdbArray = FDBArrayTest.fdbArray; 172 | try { 173 | for (int j = 0; j < 3; j++) { 174 | Histogram readLatencies = new Histogram(10000000000l, 5); 175 | Histogram writeLatencies = new Histogram(10000000000l, 5); 176 | Random r = new Random(1337); 177 | Semaphore semaphore = new Semaphore(100); 178 | int TOTAL = 10000; 179 | for (int i = 0; i < TOTAL; i++) { 180 | { 181 | int length = r.nextInt(10000); 182 | byte[] bytes = new byte[length]; 183 | r.nextBytes(bytes); 184 | int offset = r.nextInt(100000000); 185 | semaphore.acquireUninterruptibly(); 186 | long startWrite = System.nanoTime(); 187 | fdbArray.write(bytes, offset).thenRun(() -> { 188 | semaphore.release(); 189 | long writeLatency = System.nanoTime() - startWrite; 190 | writeLatencies.recordValue(writeLatency); 191 | }); 192 | }; 193 | { 194 | int length = r.nextInt(10000); 195 | int offset = r.nextInt(100000000); 196 | byte[] read = new byte[length]; 197 | semaphore.acquireUninterruptibly(); 198 | long startRead = System.nanoTime(); 199 | 
fdbArray.read(read, offset).thenRun(() -> { 200 | semaphore.release(); 201 | long readLatency = System.nanoTime() - startRead; 202 | readLatencies.recordValue(readLatency); 203 | }); 204 | }; 205 | } 206 | semaphore.acquireUninterruptibly(100); 207 | percentiles("Writes", writeLatencies); 208 | percentiles("Reads", readLatencies); 209 | fdbArray = fdbArray.snapshot("test" + j); 210 | arrays.add(0, fdbArray); 211 | } 212 | } finally { 213 | arrays.forEach((array) -> { 214 | try { 215 | System.out.println("Usage: " + array.usage().get()); 216 | } catch (ExecutionException | InterruptedException e) { 217 | e.printStackTrace(); 218 | } 219 | array.delete(); 220 | }); 221 | System.out.println("Usage: " + FDBArrayTest.fdbArray.usage().get()); 222 | } 223 | } 224 | 225 | private void percentiles(final String title, Histogram h) { 226 | System.out.println(title + ": " + 227 | " Mean: " + h.getMean()/1e6 + 228 | " p50: " + h.getValueAtPercentile(50)/1e6 + 229 | " p95: " + h.getValueAtPercentile(95)/1e6 + 230 | " p99: " + h.getValueAtPercentile(99)/1e6 + 231 | " p999: " + h.getValueAtPercentile(999)/1e6 232 | ); 233 | } 234 | } 235 | --------------------------------------------------------------------------------