├── .gitignore ├── .travis.yml ├── Dockerfile ├── Documentation ├── admin-guide.md ├── architecture.md ├── benchmark │ ├── 2016_08_02_torus_iozone_nbd_http.txt │ └── torus_bench_log.md ├── failure-guide.md ├── files.md ├── getting-started.md ├── glossary.md ├── monitoring.md ├── project-layout.md ├── research-links.md ├── snapshotting.md └── torus-overview.png ├── LICENSE ├── MAINTAINERS ├── Makefile ├── Procfile ├── README.md ├── block ├── aoe │ ├── aoe.go │ ├── aoe_test.go │ ├── device.go │ ├── frame.go │ ├── interface.go │ └── interface_test.go ├── blockfile.go ├── doc.go ├── etcd.go ├── gc_block_vol.go ├── metadata.go ├── temp.go └── volume.go ├── blockset.go ├── blockset ├── base.go ├── base_test.go ├── blockset_main.go ├── crc.go ├── crc_test.go └── replication.go ├── cliconfig └── config.go ├── cmd ├── ringtool │ └── ringtool.go ├── torusblk │ ├── aoe.go │ ├── completion.go │ ├── flex.go │ ├── main.go │ ├── nbd.go │ └── tcmu.go ├── torusctl │ ├── block.go │ ├── block_dump_load.go │ ├── block_snapshot.go │ ├── common.go │ ├── completion.go │ ├── config.go │ ├── init.go │ ├── list-peers.go │ ├── peer.go │ ├── ring.go │ ├── table.go │ ├── torusctl.go │ ├── volume.go │ └── wipe.go └── torusd │ └── main.go ├── config.go ├── contrib ├── grafana │ └── grafana.json ├── kubernetes │ ├── README.md │ ├── postgres-oneshot.yaml │ ├── test-data.sql │ └── torus-k8s-oneshot.yaml └── systemd │ ├── README.md │ ├── torusd │ └── torusd.service ├── dev-internal └── release.md ├── distributor ├── client.go ├── distributor.go ├── distributor_test.go ├── lru.go ├── monitoring.go ├── protocols │ ├── grpc │ │ └── grpc.go │ ├── protocols.go │ └── tdp │ │ ├── bitset.go │ │ ├── bitset_test.go │ │ ├── client.go │ │ ├── server.go │ │ ├── tdp.go │ │ └── tdp_test.go ├── rebalance.go ├── rebalance │ ├── rebalancer.go │ └── tick.go ├── replication.go ├── rpc.go └── storage.go ├── entrypoint.sh ├── errors.go ├── file.go ├── file_blackbox_test.go ├── file_cache.go ├── gc ├── gc.go └── null.go 
├── glide.lock ├── glide.yaml ├── heartbeat.go ├── inode.go ├── integration └── integration_test.go ├── internal ├── flagconfig │ └── flagconfig.go ├── http │ └── api.go ├── nbd │ ├── LICENSE │ ├── README.md │ ├── nbd.go │ └── nbdserver.go └── tcmu │ ├── commands.go │ └── connect.go ├── local_server.go ├── metadata.go ├── metadata ├── common.go ├── etcd │ ├── debug.go │ ├── etcd.go │ ├── global_funcs.go │ ├── helpers.go │ └── ring_watch.go └── temp │ └── temp.go ├── models ├── doc.go ├── extensions.go ├── generate.sh ├── rpc.pb.go ├── rpc.proto ├── rpcpb_test.go ├── torus.pb.go ├── torus.proto └── toruspb_test.go ├── ring.go ├── ring ├── empty.go ├── ketama.go ├── ketama_test.go ├── mod.go ├── ring_main.go ├── single.go └── union.go ├── server.go ├── storage.go ├── storage ├── common.go ├── mfile.go ├── mmap_file.go ├── mmap_file_test.go └── temp.go └── version.go /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | local 10 | 11 | # Architecture specific extensions/prefixes 12 | *.[568vq] 13 | [568vq].out 14 | 15 | *.cgo1.go 16 | *.cgo2.c 17 | _cgo_defun.c 18 | _cgo_gotypes.go 19 | _cgo_export.* 20 | 21 | _testmain.go 22 | 23 | *.exe 24 | *.test 25 | *.prof 26 | 27 | 28 | *.swp 29 | *.swo 30 | 31 | # Binaries 32 | bin 33 | tools 34 | 35 | # Data directory 36 | /torus-data 37 | 38 | # Since torus can be used as a package, don't commit the vendor directory. 
39 | # For background see: https://groups.google.com/forum/#!topic/golang-dev/4FfTBfN2YaI 40 | vendor 41 | 42 | # local-cluster information 43 | /local-cluster 44 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | 3 | sudo: false 4 | 5 | go: 6 | - 1.6 7 | - tip 8 | 9 | install: 10 | - make vendor 11 | 12 | script: 13 | - make fmt 14 | - make vet 15 | - make test 16 | - make build 17 | 18 | matrix: 19 | allow_failures: 20 | - go: tip 21 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:latest 2 | MAINTAINER Barak Michener 3 | 4 | # Set up workdir 5 | WORKDIR /go/src/github.com/coreos/torus 6 | 7 | # Add and install torus 8 | ADD . . 9 | RUN make vendor 10 | RUN go install -v github.com/coreos/torus/cmd/torusd 11 | RUN go install -v github.com/coreos/torus/cmd/torusctl 12 | RUN go install -v github.com/coreos/torus/cmd/torusblk 13 | 14 | # Expose the port and volume for configuration and data persistence. 15 | VOLUME ["/data", "/plugin"] 16 | EXPOSE 40000 4321 17 | 18 | CMD ["./entrypoint.sh"] 19 | -------------------------------------------------------------------------------- /Documentation/admin-guide.md: -------------------------------------------------------------------------------- 1 | # Torus Admin Guide 2 | 3 | ## I want to... 4 | 5 | ### Set up Torus 6 | 7 | #### Set up Torus directly 8 | 9 | See the root README.md for a pretty good overview. 
10 | 11 | #### Set up Torus on a new Kubernetes cluster 12 | 13 | See contrib/kubernetes/README.md 14 | 15 | #### Set up the Torus FlexVolume Plugin on an existing Kubernetes cluster 16 | 17 | The default path for installing flexvolume plugins is `/usr/libexec/kubernetes/kubelet-plugins/volume/exec/` -- so on every node running the kubelet, you'll need to create the subfolder: 18 | 19 | ``` 20 | mkdir -p /usr/libexec/kubernetes/kubelet-plugins/volume/exec/coreos.com~torus/ 21 | ``` 22 | 23 | The `torusblk` binary itself conforms as to the flexVolume api, so you'll want to copy it, named `torus`, inside that directory (as per the [Kubernetes repo](https://github.com/kubernetes/kubernetes/tree/master/examples/flexvolume)): 24 | 25 | ``` 26 | cp ./torusblk /usr/libexec/kubernetes/kubelet-plugins/volume/exec/coreos.com~torus/torus 27 | ``` 28 | 29 | And restart the kubelet so that it registers the new plugin, eg (on systemd systems): 30 | 31 | ``` 32 | systemctl restart kubelet 33 | ``` 34 | 35 | ### Use Block Volumes 36 | 37 | All the following commands take an optional `-C HOST:PORT` for your etcd endpoint, if it's not localhost. 38 | 39 | #### List all volumes 40 | 41 | ``` 42 | torusctl volume list 43 | ``` 44 | 45 | #### Provision a new block volume 46 | 47 | ``` 48 | torusctl volume create-block VOLUME_NAME SIZE 49 | ``` 50 | or equivalently 51 | ``` 52 | torusctl block create VOLUME_NAME SIZE 53 | ``` 54 | 55 | Where VOLUME_NAME is whatever you prefer, as long as there's not already one named the same. 56 | 57 | SIZE is given in bytes, and supports human-readable suffixes: M,G,T,MiB,GiB,TiB; so for a 1 gibibyte drive, you can use `1GiB`. 58 | 59 | #### Delete a block volume 60 | 61 | ``` 62 | torusctl volume delete VOLUME_NAME 63 | ``` 64 | 65 | #### Attach a block volume 66 | 67 | `` 68 | torusblk nbd VOLUME_NAME [NBD_DEVICE] 69 | `` 70 | 71 | NBD_DEVICE is optional. Other options for serving or attaching a block device may appear here in the future. 
`torusblk nbd` will block until it receives a signal, which will disconnect the volume from the device.
132 | 133 | #### Manually edit my hash ring 134 | 135 | **ADVANCED**: Do not attempt unless you're sure of what you're doing. If you're doing this often, there's probably some better tooling that needs to be created that's worth filing a bug about. 136 | 137 | ``` 138 | torusctl ring manual-change --help 139 | ``` 140 | 141 | Will show the options. 142 | * `--type` will change the type of ring 143 | * `--replication` sets the replication factor 144 | * `--uuids` is a comma-separated list of the UUIDs with associated data dirs. 145 | 146 | Join us in IRC if you'd like to chat about ring design. 147 | -------------------------------------------------------------------------------- /Documentation/architecture.md: -------------------------------------------------------------------------------- 1 | ### Running processes 2 | 3 | * **Metadata Service (MDS)** 4 | * A consistent store and distributed lockserver. (etcd) 5 | * **Data Nodes** 6 | * Provide actual storage (torusd) 7 | * **Client Nodes** 8 | * Connect to data nodes and the MDS to provide access to the storage (eg, torusblk) 9 | 10 | ### High-Level Data Model 11 | 12 | * Storage blocks (of size 512K by default) 13 | * Your data, split into chunks. 14 | * BlockRefs 15 | * 192bit identities assigned to a block when it is written at an INode Version Index. Consists of: 16 | * 24 bits Reference Type (Data, INode, ECC, ...) 17 | * 40 bits Volume ID 18 | * 64 bits INode Index 19 | * 64 bits per-index ID 20 | * INode 21 | * A list of blocks written within one index timeframe. 22 | * A sync() closes the list, and makes sure the blocks are written. 23 | * See [files](./files.md) for more details. 24 | * Blocklayer/Blockset 25 | * Extra data, per block inside an INode, that appears as a list of blocks to callers, but offer a hook for more features. 
26 | * Eg: 27 | ``` 28 | | Block 1 | Block 2 | Block 3 | Extra Data 29 | ----------------------------------------------------------- 30 | CRC Checksums| 2340AE529E | 6432CRE32D | 9CF29F9347 | ..... 31 | Reed-Solomon | A | A | A | [A: BlockRef 20] 32 | Base | BlockRef 1 | BlockRef 2 | BlockRef 3 | ..... 33 | ``` 34 | * Rings 35 | * Pure functions that take a BlockRef and return a permutation of the nodes being used for storage 36 | * Server 37 | * Connects to the MDS and underlying storage, acts as an abstraction between the moving parts. 38 | * File 39 | * Abstraction over INodes, that support ReadAt(), WriteAt() and Sync() 40 | 41 | ### Rings, hashing, and recovery 42 | 43 | Basically, this looks like a pretty standard [DHT](https://en.wikipedia.org/wiki/Distributed_hash_table) 44 | -------------------------------------------------------------------------------- /Documentation/failure-guide.md: -------------------------------------------------------------------------------- 1 | # What to expect under failure conditions 2 | 3 | ## Single Machine Failures 4 | 5 | By default, Torus replicates all data blocks twice. 6 | 7 | ### Is the replication factor greater than the number of downed nodes?* 8 | 9 | #### Yes 10 | 11 | *Yes, and these nodes are temporarily down* 12 | 13 | No need to panic. At worst, reads and writes will have a somewhat higher latency while the nodes are down. When they come back up, they will catch up and the cluster will proceed as normal. 14 | 15 | *Yes, and these nodes are never coming back* 16 | 17 | To prevent loss, remove these nodes from the ring: 18 | 19 | ``` 20 | torusctl peer remove UUID-OF-DOWN-NODE 21 | ``` 22 | 23 | You can retrieve that UUID with `torusctl peer list` 24 | 25 | Once removed, the cluster will automatically rereplicate and rebalance the data on the live nodes. There will be a slight latency penalty while this process takes place. 
26 | 27 | #### No 28 | 29 | Reads may begin to fail on connected clients, and you may see I/O errors. Writes will be sent to the remaining nodes. If there are no other failures, if the nodes come back, they will catch up and the data will be secure. If they are lost forever, you may experience data loss. 30 | 31 | A future extension may allow peers to optimistically rebalance data when the first nodes stop responding. At the cost of extra bandwidth usage, it can prevent outages. 32 | 33 | ## Network partition between peers 34 | 35 | If sufficient nodes are on the wrong side of the partition, reads may begin to fail, and in-flight writes will sync, but will stop being accepted. 36 | 37 | ## Network partition between client and etcd 38 | 39 | The client will fail to sync and begin reporting I/O errors; this is non-fatal, as the previous sync and related data will remain intact. When the partition is repaired, clients can restart from the checkpoint before the partition and continue; only data written during this timeframe will be lost. In the future, this need not be the case; a client could continue to work until the repair happens, and a sanity check could detect this scenario, saving even the data that was written during the partition. 40 | -------------------------------------------------------------------------------- /Documentation/files.md: -------------------------------------------------------------------------------- 1 | An inode is the unit of the append-only log for the DFS. Inode numbers are ever-increasing. If inode IDs are 64-bit integers, we have enough writes to last us till doomsday (one inode sync() per millisecond == 584.5 million years) 2 | 3 | A file is merely the 'head' version of its inode, hence the revision being the atomic unit. An inode for one file may invalidate previous inodes without requiring they be fully deleted immediately. 
This allows for locks and simpler concurrency (In the case of multiple writers: I may open a file at inode revision 104, it may be replaced by 105, but I can keep serving 104 to my client until it updates). 4 | In terms of how syncs get resolved, last write wins, as per fairly usual UNIX semantics. (run `echo "foo" > foo` simultaneously and see what happens). More intricate interactions (open a file for append-only, `flock()`, et al) are possible with this model but not on the immediate roadmap. 5 | 6 | An inode consists of the following data: 7 | 8 | * Replaced inode number 9 | * File Blockset 10 | 11 | Further abstractions can be built around this, say a POSIX file: 12 | 13 | * INode 14 | * Filenames that refer to it (hard links) 15 | * Metadata 16 | * Owner, Group 17 | * Permissions 18 | * Extended Attrs 19 | 20 | Inodes are, themselves, serialized and stored as blocks in the volume. A special bit is set to identify them as inodes, and to separate their ID space from the blocks of data they represent. To wit: 21 | 22 | * Type(data) Volume 1, Inode Index 2, Index 1 (first block of data written at index 2) 23 | * Type(inode) Volume 1, Inode Index 2, Index 1 (first block of inode serialization written at index 2) 24 | 25 | At this point, the actual data blocks form a traditional sharded KV store. We reconstruct the file by asking for the appropriate keys of the appropriate block ranges (blocks are a fixed size). Replication is handled through successive members in our hash function (which we have opportunity to define). 26 | 27 | An inode can be committed on explicit fsync() -- note they have no notion of what number they represent. Therefore, multiple writes can happen to a local version of the inode (a "staging" inode) before committing it to the greater cluster. 28 | 29 | The ID list need not be contiguous. Deleted blocks may exist in the logical keyspace, but be unreferred to in the latest version of the inode. 
The simplest GC looks at the current inode and older (but not newer) and deletes blocks that aren't referenced. 30 | -------------------------------------------------------------------------------- /Documentation/glossary.md: -------------------------------------------------------------------------------- 1 | ### Peer 2 | A server implementation that stores data. 3 | 4 | ### [Direct] Client 5 | A server-like implementation that doesn't store, but talks on the internal port to Peers. 6 | 7 | ### Proxy Client 8 | A consumer of the HTTP endpoints. 9 | 10 | ### Ring 11 | A collection of Peers arranged in a ring to facilitate distribution of storage and work. 12 | 13 | ### MDS 14 | Metadata Service. A provider of globally-consistent metadata for Peers and Clients to rely on. 15 | 16 | ### INodeStore 17 | Storage for file-system level information. 18 | 19 | ### BlockStore 20 | A low-level block storage provider. 21 | 22 | ### BlockLayer 23 | A logical list of blocks. Appears as a linear array of blocks, but may hold other data (or other blocks) as well, to enable certain redundancy, availability, and data protection properties. 24 | 25 | For example, the base block layer is just an array of blocks. The CRC block layer has a layer beneath it to represent the array of blocks, and holds a CRC hash to the side for each of the blocks as they are read or written. 26 | 27 | ### Blockset 28 | A stack of BlockLayers. 29 | 30 | -------------------------------------------------------------------------------- /Documentation/monitoring.md: -------------------------------------------------------------------------------- 1 | # Monitoring Torus 2 | 3 | ## 1) Run `torusd` with a monitor port 4 | 5 | `torusd` supports listening for HTTP requests on a monitoring port. Running with the options: 6 | 7 | ``` 8 | --host $IP --port 4321 9 | ``` 10 | 11 | Enables this functionality. When running inside a container, this is automatically done in the entrypoint script. 
12 | 13 | ## 2) Set up Prometheus to monitor your cluster 14 | 15 | If you already have a Prometheus monitoring system set up, you're ready. Add these hosts and ports to be scraped. 16 | 17 | If not, [Prometheus](https://prometheus.io/) is a fantastic monitoring tool, and Torus exports all its metrics through the monitor port under the expected `/metrics` path. 18 | 19 | [Getting started with Prometheus](https://prometheus.io/docs/introduction/getting_started/) is well-documented. Adding an entry under `scrape_configs` for your `prometheus.yaml` proceeds as normal: 20 | 21 | ``` 22 | scrape_configs: 23 | # The job name is added as a label `job=` to any timeseries scraped from this config. 24 | - job_name: 'torus' 25 | 26 | # Override the global default and scrape targets from this job every 5 seconds. 27 | scrape_interval: 15s 28 | scrape_timeout: 30s 29 | 30 | target_groups: 31 | - targets: ['localhost:4321', 'localhost:4322', 'localhost:4323', 'localhost:4324'] 32 | ``` 33 | 34 | ## 3) Using grafana 35 | 36 | If you're also using [grafana](http://grafana.org/) to build dashboards on your Prometheus metrics, then you can import the default torus dashboard from the repository or release; [it lives in contrib/grafana](../contrib/grafana/grafana.json) , and customize to fit your use cases. 37 | -------------------------------------------------------------------------------- /Documentation/project-layout.md: -------------------------------------------------------------------------------- 1 | # A quick overview of the project layout 2 | 3 | ``` 4 | ├── block 5 | │   ├── aoe 6 | ``` 7 | 8 | The package for using torus as a block device. A reference example of block device volumes. 9 | `aoe` contains an implementation of an ATA-over-Ethernet server based on a block volume 10 | 11 | ``` 12 | ├── blockset 13 | ``` 14 | Implementations of the Blockset interface. 
15 | 16 | 17 | ``` 18 | ├── cmd 19 | │   ├── torusd 20 | │   ├── torusblk 21 | │   ├── torusctl 22 | │   └── ringtool 23 | ``` 24 | 25 | The `main` functions that each produce a binary. `torusd` is the main server, `torusctl` manipulates and queries multiple servers through etcd, organizes volumes, and manages snapshots. `torusblk` attaches and mounts block devices, and `ringtool` is an experiment for measuring the rebalance properties of multiple rings. 26 | 27 | ``` 28 | ├── contrib 29 | │   └── kubernetes 30 | ``` 31 | 32 | Contributions, currently containing a guide for setting up torus on kubernetes 33 | 34 | ``` 35 | ├── distributor 36 | │   ├── protocols 37 | │   │   ├── adp 38 | │   │   ├── grpc 39 | │   ├── rebalance 40 | ``` 41 | 42 | Distributor is the package that implements the storage interface, but takes care of all the network requests and distribution. Therefore, it understands various peer-to-peer protocols, and how to rebalance data between other peers. 43 | 44 | ``` 45 | ├── Documentation 46 | ``` 47 | 48 | You are here! 49 | 50 | ``` 51 | ├── gc 52 | ``` 53 | A separate, small package that can be used in other goroutines to track the liveness of 54 | 55 | ``` 56 | ├── integration 57 | ``` 58 | Long-running integration tests live here. They spin up a number of virtual nodes, interact with them, and then shut them down. 59 | 60 | ``` 61 | ├── internal 62 | │   ├── http 63 | │   └── nbd 64 | ``` 65 | 66 | Packages that are specific to torus. and shouldn't be imported from the outside. `http` defines HTTP routes for torus servers/clients to host, and `nbd` is a hard fork of an NBD library (greatly cleaned up) that may, in the future, be worth splitting into a proper repository. 67 | 68 | ``` 69 | ├── metadata 70 | │   ├── etcd 71 | │   └── temp 72 | ``` 73 | 74 | `metadata` holds the implementations of the MDS interface. Currently there's an ephermeral, in-memory temp store (useful for tests) and etcd. 
75 | 76 | ``` 77 | ├── models 78 | ``` 79 | 80 | Protobufs for serialization and deserialization. 81 | 82 | ``` 83 | ├── ring 84 | ``` 85 | 86 | Implementations of the consistent hash ring interface. 87 | 88 | ``` 89 | ├── storage 90 | ``` 91 | Implementations of underlying storage engines (mmap files, temporary map, potentially bare disks, etc) 92 | 93 | -------------------------------------------------------------------------------- /Documentation/research-links.md: -------------------------------------------------------------------------------- 1 | Various things that we might want to find again: 2 | 3 | * [A nice breakdown on storage (some years ago) from Google](http://static.googleusercontent.com/media/research.google.com/en//university/relations/facultysummit2010/storage_architecture_and_challenges.pdf) 4 | 5 | * [Article on Colossus](http://highscalability.com/blog/2010/9/11/googles-colossus-makes-search-real-time-by-dumping-mapreduce.html) 6 | 7 | * [Another article on Colossus](http://www.theregister.co.uk/2009/08/12/google_file_system_part_deux/) TL;DR: Blocksize of 1MB is reasonable. Aiming for 8K is overkill. 8 | 9 | * [GFS: Evolution on Fast-forward](http://queue.acm.org/detail.cfm?id=1594206) 10 | 11 | * [Roaring Bitmaps](http://arxiv.org/abs/1402.6407) have a nice Go implementation and can be serialized into etcd. 
12 | 13 | * [Mentions of Colossus](https://github.com/cockroachdb/cockroach/issues/243#issuecomment-91575792) at Cockroach 14 | 15 | * [QFS Paper](https://drive.google.com/file/d/0Bz6uqmjs5anRVU9ROThpd2hVd2s/view?usp=sharing) 16 | 17 | * [QFS Github](https://github.com/quantcast/qfs/wiki) 18 | 19 | Erasure encoding 20 | 21 | * [Erasure encoding on HDFS](https://drive.google.com/file/d/0B9_MJMolBE71cWMxeU1zc25WQjg/view?usp=sharing) 22 | -------------------------------------------------------------------------------- /Documentation/snapshotting.md: -------------------------------------------------------------------------------- 1 | # Snapshotting Torus Block Volumes 2 | 3 | Snapshots of Torus Block Volumes can be taken at any time, as of the last sync() of the volume, even while mounted. Snapshots are [Copy on Write](https://en.wikipedia.org/wiki/Copy-on-write) and don't require a full copy; only the storage used in the past and any updates to the storage will count toward the total usage. 4 | 5 | ## Create a snapshot 6 | 7 | ``` 8 | torusctl block snapshot create myVolume@mySnapshotName 9 | ``` 10 | 11 | Creates a snapshot of the current state of myVolume called mySnapshotName. 12 | Remember that this operation is 'free' and does not require locking the volume. 13 | To list all current snapshots, use: 14 | 15 | ``` 16 | torusctl block snapshot list myVolume 17 | ``` 18 | 19 | Which will return something like: 20 | ``` 21 | Volume: myVolume 22 | SNAPSHOT NAME TIMESTAMP 23 | bar 2016-06-22T13:31:06-07:00 24 | foo 2016-06-22T13:31:04-07:00 25 | ``` 26 | 27 | ## Delete a snapshot 28 | 29 | ``` 30 | torusctl block snapshot delete myVolume@mySnapshotName 31 | ``` 32 | 33 | Data that's unused will then be freed. 34 | 35 | ## Restore a snapshot 36 | 37 | This operation, because it changes the state of the volume, requires that myVolume be unmounted. 
38 | 39 | ``` 40 | torusctl block snapshot restore myVolume@mySnapshotName 41 | ``` 42 | -------------------------------------------------------------------------------- /Documentation/torus-overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coreos/torus/b53b701fc3af89fdf5f2999ebbc3b778e8c18ccc/Documentation/torus-overview.png -------------------------------------------------------------------------------- /MAINTAINERS: -------------------------------------------------------------------------------- 1 | Barak Michener (@barakmich) pkg: * 2 | Andrew Hodges (@betawaffle) pkg: * 3 | Nick Owens (@mischief) pkg: * 4 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ifeq ($(origin VERSION), undefined) 2 | VERSION != git rev-parse --short HEAD 3 | endif 4 | HOST_GOOS=$(shell go env GOOS) 5 | HOST_GOARCH=$(shell go env GOARCH) 6 | REPOPATH = github.com/coreos/torus 7 | 8 | VERBOSE_1 := -v 9 | VERBOSE_2 := -v -x 10 | 11 | WHAT := torusd torusctl torusblk 12 | 13 | build: vendor 14 | for target in $(WHAT); do \ 15 | $(BUILD_ENV_FLAGS) go build $(VERBOSE_$(V)) -o bin/$$target -ldflags "-X $(REPOPATH).Version=$(VERSION)" ./cmd/$$target; \ 16 | done 17 | 18 | test: tools/glide 19 | go test --race $(shell ./tools/glide novendor) 20 | 21 | vet: tools/glide 22 | go vet $(shell ./tools/glide novendor) 23 | 24 | fmt: tools/glide 25 | go fmt $(shell ./tools/glide novendor) 26 | 27 | run: 28 | ./bin/torusd --etcd 127.0.0.1:2379 --debug --debug-init --peer-address http://127.0.0.1:40000 29 | 30 | clean: 31 | rm -rf ./local-cluster ./bin/torus* 32 | 33 | cleanall: clean 34 | rm -rf /tmp/etcd bin tools vendor 35 | 36 | etcdrun: 37 | ./local/etcd/etcd --data-dir /tmp/etcd 38 | 39 | run3: 40 | goreman start 41 | 42 | release: releasetar 43 | goxc -d ./release -tasks-=go-vet,go-test 
-os="linux darwin" -pv=$(VERSION) -arch="386 amd64 arm arm64" -build-ldflags="-X $(REPOPATH).Version=$(VERSION)" -resources-include="README.md,Documentation,LICENSE,contrib" -main-dirs-exclude="vendor,cmd/ringtool" 44 | 45 | releasetar: 46 | mkdir -p release/$(VERSION) 47 | glide install --strip-vcs --strip-vendor --update-vendored --delete 48 | glide-vc --only-code --no-tests --keep="**/*.json.in" 49 | git ls-files > /tmp/torusbuild 50 | find vendor >> /tmp/torusbuild 51 | tar -cvf release/$(VERSION)/torus_$(VERSION)_src.tar -T /tmp/torusbuild --transform 's,^,torus_$(VERSION)/,' 52 | rm /tmp/torusbuild 53 | gzip release/$(VERSION)/torus_$(VERSION)_src.tar 54 | 55 | 56 | vendor: tools/glide 57 | ./tools/glide install 58 | 59 | tools/glide: 60 | @echo "Downloading glide" 61 | mkdir -p tools 62 | curl -L https://github.com/Masterminds/glide/releases/download/0.10.2/glide-0.10.2-$(HOST_GOOS)-$(HOST_GOARCH).tar.gz | tar -xz -C tools 63 | mv tools/$(HOST_GOOS)-$(HOST_GOARCH)/glide tools/glide 64 | rm -r tools/$(HOST_GOOS)-$(HOST_GOARCH) 65 | 66 | help: 67 | @echo "Influential make variables" 68 | @echo " V - Build verbosity {0,1,2}." 69 | @echo " BUILD_ENV_FLAGS - Environment added to 'go build'." 70 | @echo " WHAT - Command to build. (e.g. 
WHAT=torusctl)" 71 | -------------------------------------------------------------------------------- /Procfile: -------------------------------------------------------------------------------- 1 | torus1: ./bin/torusd --etcd 127.0.0.1:2379 --debug --debug-init --port 4321 --data-dir local-cluster/torus1 --peer-address http://127.0.0.1:40000 --size 5GiB --auto-join --host 127.0.0.1 2 | torus2: ./bin/torusd --etcd 127.0.0.1:2379 --debug --debug-init --port 4322 --data-dir local-cluster/torus2 --peer-address http://127.0.0.1:40001 --size 5GiB --write-level one --auto-join --host 127.0.0.1 3 | torus3: ./bin/torusd --etcd 127.0.0.1:2379 --debug --debug-init --port 4323 --data-dir local-cluster/torus3 --peer-address http://127.0.0.1:40002 --size 5GiB --read-cache-size=200MiB --auto-join --host 127.0.0.1 4 | #torus4: ./bin/torusd --etcd 127.0.0.1:2379 --debug --debug-init --port 4324 --data-dir local-cluster/torus4 --peer-address 127.0.0.1:40003 --size 5GiB --read-cache-size=200MiB --auto-join 5 | #torus5: ./bin/torusd --etcd 127.0.0.1:2379 --debug --debug-init --port 4325 --data-dir local-cluster/torus5 --peer-address 127.0.0.1:40004 --size 5GiB --read-cache-size=200MiB --auto-join 6 | #torusblk: sudo ./bin/torusblk nbd blockvol /dev/nbd2 --write-level local --write-cache-size 1GiB 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Torus 2 | [![Build Status](https://travis-ci.org/coreos/torus.svg?branch=master)](https://travis-ci.org/coreos/torus) 3 | [![Go Report Card](https://goreportcard.com/badge/github.com/coreos/torus)](https://goreportcard.com/report/github.com/coreos/torus) 4 | [![GoDoc](https://godoc.org/github.com/coreos/torus?status.svg)](https://godoc.org/github.com/coreos/torus) 5 | 6 | ## Torus Overview 7 | 8 | Torus is an open source project for distributed storage coordinated through [etcd](https://github.com/coreos/etcd). 
9 | 10 | Torus provides a resource pool and basic file primitives from a set of daemons running atop multiple nodes. These primitives are made consistent by being append-only and coordinated by [etcd](https://github.com/coreos/etcd). From these primitives, a Torus server can support multiple types of volumes, the semantics of which can be broken into subprojects. It ships with a simple block-device volume plugin, but is extensible to more. 11 | 12 | ![Quick-glance overview](Documentation/torus-overview.png) 13 | 14 | Sharding is done via a consistent hash function, controlled in the simple case by a hash ring algorithm, but fully extensible to arbitrary maps, rack-awareness, and other nice features. The project name comes from this: a hash 'ring' plus a 'volume' is a torus. 15 | 16 | ## Project Status 17 | 18 | Development on Torus at CoreOS stopped as of Feb 2017. We started [Torus as a prototype](https://coreos.com/blog/torus-distributed-storage-by-coreos.html) in June 2016 to build a storage system that could be easily operated on top of Kubernetes. We have proven out that model with this project. But, we didn't achieve the development velocity over the 8 months that we had hoped for when we started out, and as such we didn't achieve the depth of community engagement we had hoped for either. 19 | 20 | If you have immediate storage needs Kubernetes can plugin to [dozens of other storage options](https://kubernetes.io/docs/user-guide/volumes/) including AWS/Azure/Google/OpenStack/etc block storage, Ceph, Gluster, NFS, etc that are external to Kubernetes. 21 | 22 | We are also seeing the emergence of projects, like [rook](https://github.com/rook/rook/tree/master/demo/kubernetes), which creates a storage system that is ran on top of Kubernetes, as an [Operator](https://coreos.com/blog/introducing-operators.html). We expect to see more systems like this in the future, because Kubernetes is a perfect platform for running distributed storage systems. 
23 | 24 | If you are interested in continuing the project feel free to fork and continue; we can update this README if a particular fork gets solid traction. 25 | 26 | For further questions email brandon.philips@coreos.com. 27 | 28 | ## Trying out Torus 29 | 30 | To get started quicky using Torus for the first time, start with the guide to [running your first Torus cluster](Documentation/getting-started.md), learn more about setting up Torus on Kubernetes using FlexVolumes [in contrib](contrib/kubernetes), or create a Torus cluster on [bare metal](https://github.com/coreos/coreos-baremetal/blob/master/Documentation/torus.md). 31 | 32 | ## Contributing to Torus 33 | 34 | Torus is an open source project and contributors are welcome! 35 | Join us on IRC at [#coreos on freenode.net](http://webchat.freenode.net/?channels=%23coreos&uio=d4), [file an issue](https://github.com/coreos/torus/issues) here on Github, check out bigger plans on the [kind/design](https://github.com/coreos/torus/labels/kind%2Fdesign) tag, contribute on bugs that are [low hanging fruit](https://github.com/coreos/torus/labels/low%20hanging%20fruit) for issue ideas and check the [project layout](Documentation/project-layout.md) for a guide to the sections that might interest you. 36 | 37 | ## Licensing 38 | 39 | Unless otherwise noted, all code in the Torus repository is licensed under the [Apache 2.0 license](LICENSE). Some portions of the codebase are derived from other projects under different licenses; the appropriate information can be found in the header of those source files, as applicable. 
// note: ATA 'words' are 16 bits, so all byte offsets are multiplied by 2.

// lba28 encodes sector count s as a 28-bit LBA value, least-significant
// byte first, masking the final byte down to its low nibble.
func lba28(s int64) [4]byte {
	var out [4]byte
	for i := 0; i < 3; i++ {
		out[i] = byte(s >> (8 * uint(i)))
	}
	out[3] = byte(s>>24) & 0xf
	return out
}

// lba48 encodes sector count s as a 48-bit LBA value, least-significant
// byte first.
func lba48(s int64) [6]byte {
	var out [6]byte
	for i := range out {
		out[i] = byte(s >> (8 * uint(i)))
	}
	return out
}

// pshort writes v at ATA word offset off (two bytes per word) in
// little-endian order.
func pshort(p []byte, off int, v uint16) {
	binary.LittleEndian.PutUint16(p[off*2:off*2+2], v)
}

// pstring writes ident at ATA word offset off, space-padded to sz bytes,
// with each byte pair swapped as ATA string fields require. sz must be even.
func pstring(p []byte, off int, sz int, ident string) {
	if sz%2 != 0 {
		panic("can't encode odd length string")
	}

	id := make([]byte, sz)
	for i := range id {
		id[i] = ' '
	}
	copy(id, ident)
	for i := 0; i < len(id); i += 2 {
		id[i], id[i+1] = id[i+1], id[i]
	}

	copy(p[off*2:], id)
}
// Sectors reports the device capacity in 512-byte sectors, returning an
// error for a zero-size backing file.
func (fd *FileDevice) Sectors() (int64, error) {
	size := fd.BlockFile.Size()
	if size == 0 {
		return 0, errors.New("empty file device?")
	}

	return int64(size) / 512, nil
}

// Identify builds a 512-byte ATA IDENTIFY DEVICE response describing this
// file-backed device.
func (fd *FileDevice) Identify() ([512]byte, error) {
	var bufa [512]byte

	sectors, err := fd.Sectors()
	if err != nil {
		return bufa, err
	}

	buf := bufa[:]

	// Fixed identify words, keyed by ATA word offset.
	for _, w := range []struct {
		off int
		val uint16
	}{
		{47, 0x8000},
		{49, 0x0200},
		{50, 0x4000},
		// PIO mode 4
		{53, 0x0002},
		{64, 0x0002},
		// claim ATA8-ACS support
		{80, 0x00F0},
		{83, 0x5400},
		{84, 0x4000},
		{86, 0x1400},
		{87, 0x4000},
		{93, 0x400b},
		// we support DRAT
		{69, 0x4000},
		// we support TRIM
		{169, 0x0001},
	} {
		pshort(buf, w.off, w.val)
	}

	// Serial number
	pstring(buf, 10, 20, "0")
	// Firmware revision
	pstring(buf, 23, 8, "V0")
	// Model number
	pstring(buf, 27, 40, "torus AoE")

	// Sector count in both 28-bit and 48-bit LBA form.
	l28 := lba28(sectors)
	copy(buf[60*2:], l28[:])
	l48 := lba48(sectors)
	copy(buf[100*2:], l48[:])

	return bufa, nil
}
WriteTo(b []byte, addr net.Addr) (n int, err error) 32 | } 33 | 34 | type FrameSender struct { 35 | orig *Frame 36 | dst net.HardwareAddr 37 | src net.HardwareAddr 38 | conn WriterTo 39 | 40 | major uint16 41 | minor uint8 42 | } 43 | 44 | func (fs *FrameSender) Send(hdr *aoe.Header) (int, error) { 45 | hdr.Version = 1 46 | hdr.FlagResponse = true 47 | hdr.Major = fs.major 48 | hdr.Minor = fs.minor 49 | hdr.Tag = fs.orig.Tag 50 | 51 | hbuf, err := hdr.MarshalBinary() 52 | if err != nil { 53 | panic(err) 54 | } 55 | 56 | frame := ðernet.Frame{ 57 | Destination: fs.dst, 58 | Source: fs.src, 59 | EtherType: aoe.EtherType, 60 | Payload: hbuf, 61 | } 62 | 63 | ebuf, err := frame.MarshalBinary() 64 | if err != nil { 65 | panic(err) 66 | } 67 | 68 | clog.Tracef("send: %d %s %+v", len(ebuf), fs.dst, hdr) 69 | clog.Tracef("send arg: %+v", hdr.Arg) 70 | 71 | return fs.conn.WriteTo(ebuf, &raw.Addr{HardwareAddr: fs.dst}) 72 | } 73 | 74 | func (fs *FrameSender) SendError(aerr aoe.Error) (int, error) { 75 | hdr := fs.orig.Header 76 | hdr.FlagError = true 77 | hdr.Error = aerr 78 | 79 | return fs.Send(&hdr) 80 | } 81 | -------------------------------------------------------------------------------- /block/aoe/interface.go: -------------------------------------------------------------------------------- 1 | package aoe 2 | 3 | import ( 4 | "net" 5 | 6 | "github.com/mdlayher/raw" 7 | ) 8 | 9 | // implements net.PacketConn 10 | type Interface struct { 11 | *net.Interface 12 | net.PacketConn 13 | } 14 | 15 | func NewInterface(ifname string) (*Interface, error) { 16 | ifc, err := net.InterfaceByName(ifname) 17 | if err != nil { 18 | return nil, err 19 | } 20 | 21 | pc, err := raw.ListenPacket(ifc, raw.ProtocolAoE) 22 | if err != nil { 23 | return nil, err 24 | } 25 | 26 | ai := &Interface{ifc, pc} 27 | return ai, nil 28 | } 29 | -------------------------------------------------------------------------------- /block/aoe/interface_test.go: 
-------------------------------------------------------------------------------- 1 | package aoe 2 | 3 | import ( 4 | "net" 5 | "sync" 6 | "syscall" 7 | "testing" 8 | ) 9 | 10 | func TestInterfaceClose(t *testing.T) { 11 | ai, err := NewInterface(findLoopbackInterface(t)) 12 | if err != nil { 13 | if err == syscall.EPERM { 14 | t.Skipf("test must be run as root: %v", err) 15 | } 16 | t.Fatalf("%T %+v", err, err) 17 | } 18 | 19 | var wg sync.WaitGroup 20 | 21 | wg.Add(1) 22 | go func() { 23 | for { 24 | b := make([]byte, 1) 25 | _, _, err := ai.ReadFrom(b) 26 | if err != nil { 27 | t.Errorf("%T %+v", err, err) 28 | break 29 | } 30 | } 31 | wg.Done() 32 | }() 33 | 34 | if err := ai.Close(); err != nil { 35 | t.Fatal(err) 36 | } 37 | 38 | wg.Wait() 39 | } 40 | 41 | func findLoopbackInterface(t *testing.T) string { 42 | for _, name := range []string{"lo", "lo0"} { 43 | if _, err := net.InterfaceByName(name); err == nil { 44 | return name 45 | } 46 | } 47 | 48 | t.Skip("could not find suitable loopback interface for test") 49 | return "" 50 | } 51 | -------------------------------------------------------------------------------- /block/blockfile.go: -------------------------------------------------------------------------------- 1 | package block 2 | 3 | import ( 4 | "github.com/coreos/torus" 5 | "github.com/coreos/torus/blockset" 6 | "golang.org/x/net/context" 7 | ) 8 | 9 | type BlockFile struct { 10 | *torus.File 11 | vol *BlockVolume 12 | } 13 | 14 | func (s *BlockVolume) OpenBlockFile() (file *BlockFile, err error) { 15 | if s.volume.Type != VolumeType { 16 | panic("Wrong type") 17 | } 18 | if err = s.mds.Lock(s.srv.Lease()); err != nil { 19 | return nil, err 20 | } 21 | defer func() { 22 | // If this function returns an error, attempt to release the lock. 23 | // TODO: Log unlock errors? 
24 | if err != nil { 25 | s.mds.Unlock() 26 | } 27 | }() 28 | ref, err := s.mds.GetINode() 29 | if err != nil { 30 | return nil, err 31 | } 32 | inode, err := s.getOrCreateBlockINode(ref) 33 | if err != nil { 34 | return nil, err 35 | } 36 | bs, err := blockset.UnmarshalFromProto(inode.GetBlocks(), s.srv.Blocks) 37 | if err != nil { 38 | return nil, err 39 | } 40 | f, err := s.srv.CreateFile(s.volume, inode, bs) 41 | if err != nil { 42 | return nil, err 43 | } 44 | return &BlockFile{ 45 | File: f, 46 | vol: s, 47 | }, nil 48 | } 49 | 50 | func (s *BlockVolume) OpenSnapshot(name string) (*BlockFile, error) { 51 | if s.volume.Type != VolumeType { 52 | panic("wrong type") 53 | } 54 | snaps, err := s.mds.GetSnapshots() 55 | if err != nil { 56 | return nil, err 57 | } 58 | var found Snapshot 59 | for _, x := range snaps { 60 | if x.Name == name { 61 | found = x 62 | break 63 | } 64 | } 65 | if found.Name != name { 66 | return nil, torus.ErrNotExist 67 | } 68 | ref := torus.INodeRefFromBytes(found.INodeRef) 69 | inode, err := s.getOrCreateBlockINode(ref) 70 | if err != nil { 71 | return nil, err 72 | } 73 | bs, err := blockset.UnmarshalFromProto(inode.GetBlocks(), s.srv.Blocks) 74 | if err != nil { 75 | return nil, err 76 | } 77 | f, err := s.srv.CreateFile(s.volume, inode, bs) 78 | if err != nil { 79 | return nil, err 80 | } 81 | f.ReadOnly = true 82 | return &BlockFile{ 83 | File: f, 84 | vol: s, 85 | }, nil 86 | } 87 | 88 | func (s *BlockVolume) RestoreSnapshot(name string) (err error) { 89 | if s.volume.Type != VolumeType { 90 | panic("Wrong type") 91 | } 92 | if err = s.mds.Lock(s.srv.Lease()); err != nil { 93 | return err 94 | } 95 | defer s.mds.Unlock() 96 | snaps, err := s.mds.GetSnapshots() 97 | if err != nil { 98 | return err 99 | } 100 | var found Snapshot 101 | for _, x := range snaps { 102 | if x.Name == name { 103 | found = x 104 | break 105 | } 106 | } 107 | if found.Name != name { 108 | return torus.ErrNotExist 109 | } 110 | ref := 
torus.INodeRefFromBytes(found.INodeRef) 111 | return s.mds.SyncINode(ref) 112 | } 113 | 114 | func (f *BlockFile) Close() (err error) { 115 | defer func() { 116 | // No matter what attempt to release the lock. 117 | unlockErr := f.vol.mds.Unlock() 118 | if err == nil { 119 | // TODO: Log unlock errors if err is not nil? 120 | err = unlockErr 121 | } 122 | }() 123 | 124 | if err = f.Sync(); err != nil { 125 | return err 126 | } 127 | return f.File.Close() 128 | } 129 | 130 | func (f *BlockFile) inodeContext() context.Context { 131 | return context.WithValue(context.TODO(), torus.CtxWriteLevel, torus.WriteAll) 132 | } 133 | 134 | func (f *BlockFile) Sync() error { 135 | if !f.WriteOpen() { 136 | clog.Debugf("not syncing") 137 | return nil 138 | } 139 | clog.Debugf("Syncing block volume: %v", f.vol.volume.Name) 140 | err := f.File.SyncBlocks() 141 | if err != nil { 142 | return err 143 | } 144 | ref, err := f.File.SyncINode(f.inodeContext()) 145 | if err != nil { 146 | return err 147 | } 148 | return f.vol.mds.SyncINode(ref) 149 | } 150 | -------------------------------------------------------------------------------- /block/doc.go: -------------------------------------------------------------------------------- 1 | // block provides the implementation of the "block" volume type, using a Torus file as a block device. 
2 | package block 3 | -------------------------------------------------------------------------------- /block/gc_block_vol.go: -------------------------------------------------------------------------------- 1 | package block 2 | 3 | import ( 4 | "time" 5 | 6 | "golang.org/x/net/context" 7 | 8 | "github.com/coreos/pkg/capnslog" 9 | "github.com/coreos/torus" 10 | "github.com/coreos/torus/blockset" 11 | "github.com/coreos/torus/gc" 12 | "github.com/coreos/torus/models" 13 | ) 14 | 15 | func init() { 16 | gc.RegisterGC("blockvol", NewBlockVolGC) 17 | } 18 | 19 | type blockvolGC struct { 20 | srv *torus.Server 21 | inodes gc.INodeFetcher 22 | set map[torus.BlockRef]bool 23 | highwaters map[torus.VolumeID]torus.INodeID 24 | curINodes []torus.INodeRef 25 | } 26 | 27 | func NewBlockVolGC(srv *torus.Server, inodes gc.INodeFetcher) (gc.GC, error) { 28 | b := &blockvolGC{ 29 | srv: srv, 30 | inodes: inodes, 31 | } 32 | b.Clear() 33 | return b, nil 34 | } 35 | 36 | func (b *blockvolGC) getContext() context.Context { 37 | ctx, _ := context.WithTimeout(context.TODO(), 2*time.Second) 38 | return b.srv.ExtendContext(ctx) 39 | } 40 | 41 | func (b *blockvolGC) PrepVolume(vol *models.Volume) error { 42 | if vol.Type != VolumeType { 43 | return nil 44 | } 45 | mds, err := createBlockMetadata(b.srv.MDS, vol.Name, torus.VolumeID(vol.Id)) 46 | if err != nil { 47 | return err 48 | } 49 | curRef, err := mds.GetINode() 50 | if err != nil { 51 | return err 52 | } 53 | b.highwaters[curRef.Volume()] = 0 54 | if curRef.INode <= 1 { 55 | return nil 56 | } 57 | 58 | snaps, err := mds.GetSnapshots() 59 | if err != nil { 60 | return err 61 | } 62 | 63 | curINodes := make([]torus.INodeRef, 0, len(snaps)+1) 64 | curINodes = append(curINodes, curRef) 65 | for _, x := range snaps { 66 | curINodes = append(curINodes, torus.INodeRefFromBytes(x.INodeRef)) 67 | } 68 | 69 | for _, x := range curINodes { 70 | inode, err := b.inodes.GetINode(b.getContext(), x) 71 | if err != nil { 72 | return err 73 | } 74 | 
set, err := blockset.UnmarshalFromProto(inode.Blocks, nil) 75 | if err != nil { 76 | return err 77 | } 78 | refs := set.GetAllBlockRefs() 79 | for _, ref := range refs { 80 | if ref.IsZero() { 81 | continue 82 | } 83 | if ref.INode > b.highwaters[ref.Volume()] { 84 | b.highwaters[ref.Volume()] = ref.INode 85 | } 86 | b.set[ref] = true 87 | } 88 | } 89 | b.curINodes = append(b.curINodes, curINodes...) 90 | return nil 91 | } 92 | 93 | func (b *blockvolGC) IsDead(ref torus.BlockRef) bool { 94 | v, ok := b.highwaters[ref.Volume()] 95 | if !ok { 96 | if clog.LevelAt(capnslog.TRACE) { 97 | clog.Tracef("%s doesn't exist anymore", ref) 98 | } 99 | // Volume doesn't exist anymore 100 | return true 101 | } 102 | // If it's a new block or INode, let it be. 103 | if ref.INode >= v { 104 | if clog.LevelAt(capnslog.TRACE) { 105 | clog.Tracef("%s is new compared to %d", ref, v) 106 | } 107 | return false 108 | } 109 | // If it's an INode block, and it's not in our list 110 | if ref.BlockType() == torus.TypeINode { 111 | for _, x := range b.curINodes { 112 | if ref.HasINode(x, torus.TypeINode) { 113 | if clog.LevelAt(capnslog.TRACE) { 114 | clog.Tracef("%s is in %s", ref, x) 115 | } 116 | return false 117 | } 118 | } 119 | if clog.LevelAt(capnslog.TRACE) { 120 | clog.Tracef("%s is a dead INode", ref) 121 | } 122 | return true 123 | } 124 | // If it's a data block 125 | if v := b.set[ref]; v { 126 | return false 127 | } 128 | if clog.LevelAt(capnslog.TRACE) { 129 | clog.Tracef("%s is dead", ref) 130 | } 131 | return true 132 | } 133 | 134 | func (b *blockvolGC) Clear() { 135 | b.highwaters = make(map[torus.VolumeID]torus.INodeID) 136 | b.curINodes = make([]torus.INodeRef, 0, len(b.curINodes)) 137 | b.set = make(map[torus.BlockRef]bool) 138 | } 139 | -------------------------------------------------------------------------------- /block/metadata.go: -------------------------------------------------------------------------------- 1 | package block 2 | 3 | import ( 4 | "errors" 5 | 
"time" 6 | 7 | "github.com/coreos/pkg/capnslog" 8 | "github.com/coreos/torus" 9 | "github.com/coreos/torus/models" 10 | ) 11 | 12 | var clog = capnslog.NewPackageLogger("github.com/coreos/torus", "block") 13 | 14 | type Snapshot struct { 15 | Name string 16 | When time.Time 17 | INodeRef []byte 18 | } 19 | 20 | type blockMetadata interface { 21 | torus.MetadataService 22 | 23 | Lock(lease int64) error 24 | Unlock() error 25 | 26 | GetINode() (torus.INodeRef, error) 27 | SyncINode(torus.INodeRef) error 28 | 29 | CreateBlockVolume(vol *models.Volume) error 30 | DeleteVolume() error 31 | 32 | SaveSnapshot(name string) error 33 | GetSnapshots() ([]Snapshot, error) 34 | DeleteSnapshot(name string) error 35 | } 36 | 37 | func createBlockMetadata(mds torus.MetadataService, name string, vid torus.VolumeID) (blockMetadata, error) { 38 | switch mds.Kind() { 39 | case torus.EtcdMetadata: 40 | return createBlockEtcdMetadata(mds, name, vid) 41 | case torus.TempMetadata: 42 | return createBlockTempMetadata(mds, name, vid) 43 | default: 44 | return nil, errors.New("unimplemented for this kind of metadata") 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /block/temp.go: -------------------------------------------------------------------------------- 1 | package block 2 | 3 | import ( 4 | "fmt" 5 | "time" 6 | 7 | "github.com/coreos/torus" 8 | "github.com/coreos/torus/metadata/temp" 9 | "github.com/coreos/torus/models" 10 | ) 11 | 12 | type blockTempMetadata struct { 13 | *temp.Client 14 | name string 15 | vid torus.VolumeID 16 | } 17 | 18 | type blockTempVolumeData struct { 19 | locked string 20 | id torus.INodeRef 21 | snaps []Snapshot 22 | } 23 | 24 | func (b *blockTempMetadata) CreateBlockVolume(volume *models.Volume) error { 25 | b.LockData() 26 | defer b.UnlockData() 27 | _, ok := b.GetData(fmt.Sprint(volume.Id)) 28 | if ok { 29 | return torus.ErrExists 30 | } 31 | b.CreateVolume(volume) 32 | b.SetData(fmt.Sprint(volume.Id), 
// volData looks up the in-memory record for this volume, returning
// torus.ErrNotExist when the volume is unknown. The caller must hold the
// temp client's data lock.
func (b *blockTempMetadata) volData() (*blockTempVolumeData, error) {
	v, ok := b.GetData(fmt.Sprint(b.vid))
	if !ok {
		return nil, torus.ErrNotExist
	}
	return v.(*blockTempVolumeData), nil
}

// Lock takes the volume lock on behalf of this client; the lease argument
// is unused by the temp implementation.
func (b *blockTempMetadata) Lock(lease int64) error {
	b.LockData()
	defer b.UnlockData()
	d, err := b.volData()
	if err != nil {
		return err
	}
	if d.locked != "" {
		return torus.ErrLocked
	}
	d.locked = b.UUID()
	return nil
}

// GetINode returns the volume's current INode ref.
func (b *blockTempMetadata) GetINode() (torus.INodeRef, error) {
	b.LockData()
	defer b.UnlockData()
	d, err := b.volData()
	if err != nil {
		return torus.ZeroINode(), err
	}
	return d.id, nil
}

// SyncINode records inode as the volume's current INode ref. The volume
// lock must be held by this client.
func (b *blockTempMetadata) SyncINode(inode torus.INodeRef) error {
	b.LockData()
	defer b.UnlockData()
	d, err := b.volData()
	if err != nil {
		return err
	}
	if d.locked != b.UUID() {
		return torus.ErrLocked
	}
	d.id = inode
	return nil
}

// Unlock releases the volume lock held by this client.
func (b *blockTempMetadata) Unlock() error {
	b.LockData()
	defer b.UnlockData()
	d, err := b.volData()
	if err != nil {
		return err
	}
	if d.locked != b.UUID() {
		return torus.ErrLocked
	}
	d.locked = ""
	return nil
}

// DeleteVolume removes the volume record. The volume lock must be held by
// this client.
func (b *blockTempMetadata) DeleteVolume() error {
	b.LockData()
	defer b.UnlockData()
	d, err := b.volData()
	if err != nil {
		return err
	}
	if d.locked != b.UUID() {
		return torus.ErrLocked
	}
	return b.Client.DeleteVolume(b.name)
}
// GetSnapshots returns a copy of the volume's snapshot list, so callers
// cannot mutate the stored slice.
func (b *blockTempMetadata) GetSnapshots() ([]Snapshot, error) {
	b.LockData()
	defer b.UnlockData()
	v, ok := b.GetData(fmt.Sprint(b.vid))
	if !ok {
		return nil, torus.ErrNotExist
	}
	d := v.(*blockTempVolumeData)
	snaps := append([]Snapshot(nil), d.snaps...)
	return snaps, nil
}
154 | return nil 155 | } 156 | } 157 | return torus.ErrNotExist 158 | } 159 | 160 | func createBlockTempMetadata(mds torus.MetadataService, name string, vid torus.VolumeID) (blockMetadata, error) { 161 | if t, ok := mds.(*temp.Client); ok { 162 | return &blockTempMetadata{ 163 | Client: t, 164 | name: name, 165 | vid: vid, 166 | }, nil 167 | } 168 | panic("how are we creating a temp metadata that doesn't implement it but reports as being temp") 169 | } 170 | -------------------------------------------------------------------------------- /block/volume.go: -------------------------------------------------------------------------------- 1 | package block 2 | 3 | import ( 4 | "github.com/coreos/torus" 5 | "github.com/coreos/torus/blockset" 6 | "github.com/coreos/torus/models" 7 | "golang.org/x/net/context" 8 | ) 9 | 10 | const VolumeType = "block" 11 | 12 | type BlockVolume struct { 13 | srv *torus.Server 14 | mds blockMetadata 15 | volume *models.Volume 16 | } 17 | 18 | func CreateBlockVolume(mds torus.MetadataService, volume string, size uint64) error { 19 | id, err := mds.NewVolumeID() 20 | if err != nil { 21 | return err 22 | } 23 | blkmd, err := createBlockMetadata(mds, volume, id) 24 | if err != nil { 25 | return err 26 | } 27 | return blkmd.CreateBlockVolume(&models.Volume{ 28 | Name: volume, 29 | Id: uint64(id), 30 | Type: VolumeType, 31 | MaxBytes: size, 32 | }) 33 | } 34 | 35 | func OpenBlockVolume(s *torus.Server, volume string) (*BlockVolume, error) { 36 | vol, err := s.MDS.GetVolume(volume) 37 | if err != nil { 38 | return nil, err 39 | } 40 | mds, err := createBlockMetadata(s.MDS, vol.Name, torus.VolumeID(vol.Id)) 41 | if err != nil { 42 | return nil, err 43 | } 44 | return &BlockVolume{ 45 | srv: s, 46 | mds: mds, 47 | volume: vol, 48 | }, nil 49 | } 50 | 51 | func DeleteBlockVolume(mds torus.MetadataService, volume string) error { 52 | vol, err := mds.GetVolume(volume) 53 | if err != nil { 54 | return err 55 | } 56 | bmds, err := 
createBlockMetadata(mds, vol.Name, torus.VolumeID(vol.Id)) 57 | if err != nil { 58 | return err 59 | } 60 | return bmds.DeleteVolume() 61 | } 62 | 63 | func (s *BlockVolume) SaveSnapshot(name string) error { return s.mds.SaveSnapshot(name) } 64 | func (s *BlockVolume) GetSnapshots() ([]Snapshot, error) { return s.mds.GetSnapshots() } 65 | func (s *BlockVolume) DeleteSnapshot(name string) error { return s.mds.DeleteSnapshot(name) } 66 | 67 | func (s *BlockVolume) getContext() context.Context { 68 | return context.TODO() 69 | } 70 | 71 | func (s *BlockVolume) getOrCreateBlockINode(ref torus.INodeRef) (*models.INode, error) { 72 | if ref.Volume() != torus.VolumeID(s.volume.Id) { 73 | panic("ids managed by metadata didn't match, how is that possible?") 74 | } 75 | if ref.INode != 1 { 76 | return s.srv.INodes.GetINode(s.getContext(), ref) 77 | } 78 | globals := s.mds.GlobalMetadata() 79 | bs, err := blockset.CreateBlocksetFromSpec(globals.DefaultBlockSpec, nil) 80 | if err != nil { 81 | return nil, err 82 | } 83 | nBlocks := (s.volume.MaxBytes / globals.BlockSize) 84 | if s.volume.MaxBytes%globals.BlockSize != 0 { 85 | nBlocks++ 86 | } 87 | err = bs.Truncate(int(nBlocks), globals.BlockSize) 88 | if err != nil { 89 | return nil, err 90 | } 91 | inode := models.NewEmptyINode() 92 | inode.INode = 1 93 | inode.Volume = s.volume.Id 94 | inode.Filesize = s.volume.MaxBytes 95 | inode.Blocks, err = torus.MarshalBlocksetToProto(bs) 96 | return inode, err 97 | } 98 | -------------------------------------------------------------------------------- /blockset.go: -------------------------------------------------------------------------------- 1 | package torus 2 | 3 | import ( 4 | "github.com/RoaringBitmap/roaring" 5 | "github.com/coreos/torus/models" 6 | "golang.org/x/net/context" 7 | ) 8 | 9 | // Blockset is the interface representing the standardized methods to interact 10 | // with a set of blocks. 
11 | type Blockset interface { 12 | // Length returns the number of blocks in the Blockset. 13 | Length() int 14 | // Kind returns the kind of the Blockset. 15 | Kind() uint32 16 | // GetBlock returns the ith block in the Blockset. 17 | GetBlock(ctx context.Context, i int) ([]byte, error) 18 | // PutBlock puts a block with data `b` into the Blockset as its ith block. 19 | // The block belongs to the given inode. 20 | PutBlock(ctx context.Context, inode INodeRef, i int, b []byte) error 21 | // GetLiveInodes returns the current INode representation of the Blockset. 22 | // The returned INode might not be synced. 23 | GetLiveINodes() *roaring.Bitmap 24 | // GetAllBlockRefs returns the BlockRef of the blocks in the Blockset. 25 | // The ith BlockRef in the returned slice is the Ref of the ith Block in the 26 | // Blockset. 27 | GetAllBlockRefs() []BlockRef 28 | 29 | // Marshal returns the bytes representation of the Blockset. 30 | Marshal() ([]byte, error) 31 | // Unmarshal parses the bytes representation of the Blockset and stores the result 32 | // in the Blockset. 33 | Unmarshal(data []byte) error 34 | // GetSubBlockset gets the sub-Blockset of the Blockset if exists. 35 | // If there is no sub-Blockset, nil will be returned. 36 | GetSubBlockset() Blockset 37 | // Truncate changes the length of the Blockset and the block. If the Blockset has less 38 | // blocks than the required size, truncate adds zero blocks. If the block has less bytes 39 | // than required size, truncate add bytes into block. 40 | Truncate(lastIndex int, blocksize uint64) error 41 | // Trim zeros the blocks in range [from, to). 42 | Trim(from, to int) error 43 | // String implements the fmt.Stringer interface. 
44 | String() string 45 | } 46 | 47 | type BlockLayerKind int 48 | 49 | type BlockLayer struct { 50 | Kind BlockLayerKind 51 | Options string 52 | } 53 | 54 | type BlockLayerSpec []BlockLayer 55 | 56 | func MarshalBlocksetToProto(bs Blockset) ([]*models.BlockLayer, error) { 57 | var out []*models.BlockLayer 58 | var layer Blockset 59 | for layer = bs; layer != nil; layer = layer.GetSubBlockset() { 60 | m, err := layer.Marshal() 61 | if err != nil { 62 | return nil, err 63 | } 64 | out = append(out, &models.BlockLayer{ 65 | Type: layer.Kind(), 66 | Content: m, 67 | }) 68 | } 69 | return out, nil 70 | } 71 | -------------------------------------------------------------------------------- /blockset/base.go: -------------------------------------------------------------------------------- 1 | package blockset 2 | 3 | import ( 4 | "sync/atomic" 5 | 6 | "golang.org/x/net/context" 7 | 8 | "github.com/RoaringBitmap/roaring" 9 | "github.com/coreos/pkg/capnslog" 10 | "github.com/coreos/torus" 11 | ) 12 | 13 | type baseBlockset struct { 14 | ids uint64 15 | blocks []torus.BlockRef 16 | store torus.BlockStore 17 | blocksize uint64 18 | } 19 | 20 | var _ blockset = &baseBlockset{} 21 | 22 | func init() { 23 | RegisterBlockset(Base, func(_ string, store torus.BlockStore, _ blockset) (blockset, error) { 24 | return newBaseBlockset(store), nil 25 | }) 26 | } 27 | 28 | func newBaseBlockset(store torus.BlockStore) *baseBlockset { 29 | b := &baseBlockset{ 30 | blocks: make([]torus.BlockRef, 0), 31 | store: store, 32 | } 33 | if store != nil { 34 | b.blocksize = store.BlockSize() 35 | } 36 | return b 37 | } 38 | 39 | func (b *baseBlockset) Length() int { 40 | return len(b.blocks) 41 | } 42 | 43 | func (b *baseBlockset) Kind() uint32 { 44 | return uint32(Base) 45 | } 46 | 47 | func (b *baseBlockset) GetBlock(ctx context.Context, i int) ([]byte, error) { 48 | if i >= len(b.blocks) { 49 | return nil, torus.ErrBlockNotExist 50 | } 51 | if b.blocks[i].IsZero() { 52 | return make([]byte, 
// PutBlock stores data as block i of the set under a freshly minted
// BlockRef belonging to inode. Writing one past the end appends; anything
// further returns ErrBlockNotExist.
func (b *baseBlockset) PutBlock(ctx context.Context, inode torus.INodeRef, i int, data []byte) error {
	if i > len(b.blocks) {
		return torus.ErrBlockNotExist
	}
	newBlockID := b.makeID(inode)
	if torus.BlockLog.LevelAt(capnslog.TRACE) {
		torus.BlockLog.Tracef("base: writing block %d at BlockID %s", i, newBlockID)
	}
	if err := b.store.WriteBlock(ctx, newBlockID, data); err != nil {
		return err
	}
	if i == len(b.blocks) {
		b.blocks = append(b.blocks, newBlockID)
	} else {
		b.blocks[i] = newBlockID
	}
	return nil
}

// makeID mints a unique BlockRef for this set under INode i, using a
// monotonically increasing per-set index.
func (b *baseBlockset) makeID(i torus.INodeRef) torus.BlockRef {
	return torus.BlockRef{
		INodeRef: i,
		Index:    torus.IndexID(atomic.AddUint64(&b.ids, 1)),
	}
}

// Marshal packs the set's BlockRefs back-to-back into a byte slice.
func (b *baseBlockset) Marshal() ([]byte, error) {
	out := make([]byte, len(b.blocks)*torus.BlockRefByteSize)
	for i, ref := range b.blocks {
		ref.ToBytesBuf(out[i*torus.BlockRefByteSize : (i+1)*torus.BlockRefByteSize])
	}
	return out, nil
}

// setStore rebinds the set to a new block store, refreshing the cached
// block size.
func (b *baseBlockset) setStore(s torus.BlockStore) {
	b.blocksize = s.BlockSize()
	b.store = s
}

func (b *baseBlockset) getStore() torus.BlockStore {
	return b.store
}
:= len(data) / torus.BlockRefByteSize 121 | out := make([]torus.BlockRef, l) 122 | for i := 0; i < l; i++ { 123 | out[i] = torus.BlockRefFromBytes(data[(i * torus.BlockRefByteSize) : (i+1)*torus.BlockRefByteSize]) 124 | } 125 | b.blocks = out 126 | return nil 127 | } 128 | 129 | func (b *baseBlockset) GetSubBlockset() torus.Blockset { return nil } 130 | 131 | func (b *baseBlockset) GetLiveINodes() *roaring.Bitmap { 132 | out := roaring.NewBitmap() 133 | for _, blk := range b.blocks { 134 | if blk.IsZero() { 135 | continue 136 | } 137 | out.Add(uint32(blk.INode)) 138 | } 139 | return out 140 | } 141 | 142 | func (b *baseBlockset) Truncate(lastIndex int, _ uint64) error { 143 | if lastIndex <= len(b.blocks) { 144 | b.blocks = b.blocks[:lastIndex] 145 | return nil 146 | } 147 | toadd := lastIndex - len(b.blocks) 148 | for toadd != 0 { 149 | b.blocks = append(b.blocks, torus.ZeroBlock()) 150 | toadd-- 151 | } 152 | return nil 153 | } 154 | 155 | func (b *baseBlockset) Trim(from, to int) error { 156 | if from >= len(b.blocks) { 157 | return nil 158 | } 159 | if to > len(b.blocks) { 160 | to = len(b.blocks) 161 | } 162 | for i := from; i < to; i++ { 163 | b.blocks[i] = torus.ZeroBlock() 164 | } 165 | return nil 166 | } 167 | 168 | func (b *baseBlockset) GetAllBlockRefs() []torus.BlockRef { 169 | out := make([]torus.BlockRef, len(b.blocks)) 170 | copy(out, b.blocks) 171 | return out 172 | } 173 | 174 | func (b *baseBlockset) String() string { 175 | out := "[\n" 176 | for _, x := range b.blocks { 177 | out += x.String() + "\n" 178 | } 179 | out += "]" 180 | return out 181 | } 182 | -------------------------------------------------------------------------------- /blockset/base_test.go: -------------------------------------------------------------------------------- 1 | package blockset 2 | 3 | import ( 4 | "testing" 5 | 6 | "golang.org/x/net/context" 7 | 8 | "github.com/coreos/torus" 9 | 10 | // Register storage drivers. 
11 | _ "github.com/coreos/torus/storage" 12 | ) 13 | 14 | type makeTestBlockset func(s torus.BlockStore) blockset 15 | 16 | func TestBaseReadWrite(t *testing.T) { 17 | s, _ := torus.CreateBlockStore("temp", "test", torus.Config{StorageSize: 300 * 1024}, torus.GlobalMetadata{BlockSize: 1024}) 18 | b := newBaseBlockset(s) 19 | readWriteTest(t, b) 20 | } 21 | 22 | func readWriteTest(t *testing.T, b blockset) { 23 | inode := torus.NewINodeRef(1, 1) 24 | b.PutBlock(context.TODO(), inode, 0, []byte("Some data")) 25 | inode = torus.NewINodeRef(1, 2) 26 | b.PutBlock(context.TODO(), inode, 1, []byte("Some more data")) 27 | data, err := b.GetBlock(context.TODO(), 0) 28 | if err != nil { 29 | t.Fatal(err) 30 | } 31 | if string(data) != "Some data" { 32 | t.Error("data not retrieved") 33 | } 34 | b.PutBlock(context.TODO(), inode, 0, []byte("Some different data")) 35 | data, err = b.GetBlock(context.TODO(), 0) 36 | if err != nil { 37 | t.Fatal(err) 38 | } 39 | if string(data) != "Some different data" { 40 | t.Error("data not retrieved") 41 | } 42 | } 43 | 44 | func TestBaseMarshal(t *testing.T) { 45 | s, _ := torus.CreateBlockStore("temp", "test", torus.Config{StorageSize: 300 * 1024}, torus.GlobalMetadata{BlockSize: 1024}) 46 | marshalTest(t, s, MustParseBlockLayerSpec("base")) 47 | } 48 | 49 | func marshalTest(t *testing.T, s torus.BlockStore, spec torus.BlockLayerSpec) { 50 | b, err := CreateBlocksetFromSpec(spec, s) 51 | if err != nil { 52 | t.Fatal(err) 53 | } 54 | inode := torus.NewINodeRef(1, 1) 55 | b.PutBlock(context.TODO(), inode, 0, []byte("Some data")) 56 | marshal, err := torus.MarshalBlocksetToProto(b) 57 | if err != nil { 58 | t.Fatal(err) 59 | } 60 | 61 | b = nil 62 | newb, err := UnmarshalFromProto(marshal, s) 63 | if err != nil { 64 | t.Fatal(err) 65 | } 66 | 67 | data, err := newb.GetBlock(context.TODO(), 0) 68 | if err != nil { 69 | t.Fatal(err) 70 | } 71 | if string(data) != "Some data" { 72 | t.Error("data not retrieved") 73 | } 74 | } 75 | 
-------------------------------------------------------------------------------- /blockset/crc_test.go: -------------------------------------------------------------------------------- 1 | package blockset 2 | 3 | import ( 4 | "testing" 5 | 6 | "golang.org/x/net/context" 7 | 8 | "github.com/coreos/torus" 9 | 10 | // Register storage drivers. 11 | _ "github.com/coreos/torus/storage" 12 | ) 13 | 14 | func TestCRCReadWrite(t *testing.T) { 15 | s, _ := torus.CreateBlockStore("temp", "test", torus.Config{StorageSize: 300 * 1024}, torus.GlobalMetadata{BlockSize: 1024}) 16 | b := newBaseBlockset(s) 17 | crc := newCRCBlockset(b) 18 | readWriteTest(t, crc) 19 | } 20 | 21 | func TestCRCMarshal(t *testing.T) { 22 | s, _ := torus.CreateBlockStore("temp", "test", torus.Config{StorageSize: 300 * 1024}, torus.GlobalMetadata{BlockSize: 1024}) 23 | marshalTest(t, s, MustParseBlockLayerSpec("crc,base")) 24 | } 25 | 26 | func TestCRCCorruption(t *testing.T) { 27 | s, _ := torus.CreateBlockStore("temp", "test", torus.Config{StorageSize: 300 * 1024}, torus.GlobalMetadata{BlockSize: 1024}) 28 | b := newBaseBlockset(s) 29 | crc := newCRCBlockset(b) 30 | inode := torus.NewINodeRef(1, 1) 31 | crc.PutBlock(context.TODO(), inode, 0, []byte("Some data")) 32 | s.WriteBlock(context.TODO(), b.blocks[0], []byte("Evil Corruption!!")) 33 | _, err := crc.GetBlock(context.TODO(), 0) 34 | if err != torus.ErrBlockUnavailable { 35 | t.Fatal("No corruption detection") 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /cliconfig/config.go: -------------------------------------------------------------------------------- 1 | package cliconfig 2 | 3 | type TorusConfig struct { 4 | EtcdConfig map[string]Etcd_config `json:"etcd_config"` 5 | } 6 | 7 | type Etcd_config struct { 8 | Etcd string `json:"etcd,omitempty"` 9 | EtcdCAFile string `json:"etcd-ca-file,omitempty"` 10 | EtcdCertFile string `json:"etcd-cert-file,omitempty"` 11 | EtcdKeyFile string 
const (
	// flushDevice is the kernel aoe driver's control file; writing an
	// AoE device name (e.g. "e1.1") to it flushes that device.
	flushDevice = "/dev/etherd/flush"
)
torsublk aoe --flush e1.1)") 55 | } 56 | 57 | func aoeAction(cmd *cobra.Command, args []string) error { 58 | if len(aoeFlush) > 0 && len(args) == 0 { 59 | if err := flush(aoeFlush); err != nil { 60 | return fmt.Errorf("failed to flush: %v", err) 61 | } 62 | return nil 63 | } 64 | 65 | if len(args) != 4 { 66 | return torus.ErrUsage 67 | } 68 | 69 | srv := createServer() 70 | defer srv.Close() 71 | 72 | vol := args[0] 73 | ifname := args[1] 74 | maj := args[2] 75 | min := args[3] 76 | 77 | major, err := strconv.ParseUint(maj, 10, 16) 78 | if err != nil { 79 | return fmt.Errorf("Failed to parse major address %q: %v", maj, err) 80 | } 81 | 82 | minor, err := strconv.ParseUint(min, 10, 8) 83 | if err != nil { 84 | return fmt.Errorf("Failed to parse minor address %q: %v", min, err) 85 | } 86 | 87 | blockvol, err := block.OpenBlockVolume(srv, vol) 88 | if err != nil { 89 | return fmt.Errorf("server doesn't support block volumes: %v", err) 90 | } 91 | 92 | ai, err := aoe.NewInterface(ifname) 93 | if err != nil { 94 | return fmt.Errorf("Failed to set up interface %q: %v", ifname, err) 95 | } 96 | 97 | as, err := aoe.NewServer(blockvol, &aoe.ServerOptions{ 98 | Major: uint16(major), 99 | Minor: uint8(minor), 100 | }) 101 | if err != nil { 102 | return fmt.Errorf("Failed to crate AoE server: %v", err) 103 | } 104 | defer as.Close() 105 | 106 | signalChan := make(chan os.Signal, 1) 107 | signal.Notify(signalChan, os.Interrupt) 108 | 109 | go func(iface *aoe.Interface) { 110 | for _ = range signalChan { 111 | fmt.Println("\nReceived an interrupt, stopping services...") 112 | iface.Close() 113 | } 114 | }(ai) 115 | 116 | if err = as.Serve(ai); err != nil { 117 | return fmt.Errorf("Failed to serve AoE: %v", err) 118 | } 119 | return nil 120 | } 121 | 122 | func flush(d string) error { 123 | fd, err := os.OpenFile(flushDevice, os.O_WRONLY|os.O_APPEND, 0644) 124 | if err != nil { 125 | return err 126 | } 127 | defer fd.Close() 128 | writer := bufio.NewWriter(fd) 129 | _, err = 
writer.WriteString(d) 130 | if err != nil { 131 | return err 132 | } 133 | writer.Flush() 134 | return nil 135 | } 136 | -------------------------------------------------------------------------------- /cmd/torusblk/completion.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "os" 5 | 6 | "github.com/spf13/cobra" 7 | ) 8 | 9 | var completionCommand = &cobra.Command{ 10 | Use: "completion", 11 | Short: "Output bash completion code", 12 | Run: completionAction, 13 | } 14 | 15 | func completionAction(cmd *cobra.Command, args []string) { 16 | cmd.Root().GenBashCompletion(os.Stdout) 17 | } 18 | -------------------------------------------------------------------------------- /cmd/torusblk/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | 7 | "github.com/coreos/pkg/capnslog" 8 | "github.com/spf13/cobra" 9 | 10 | "github.com/coreos/torus" 11 | "github.com/coreos/torus/distributor" 12 | "github.com/coreos/torus/internal/flagconfig" 13 | "github.com/coreos/torus/internal/http" 14 | 15 | // Register all the drivers. 
16 | _ "github.com/coreos/torus/metadata/etcd" 17 | _ "github.com/coreos/torus/storage" 18 | ) 19 | 20 | var ( 21 | logpkg string 22 | httpAddr string 23 | cfg torus.Config 24 | 25 | debug bool 26 | ) 27 | 28 | var rootCommand = &cobra.Command{ 29 | Use: "torusblk", 30 | Short: "torus block volume tool", 31 | Long: "Control block volumes on the torus distributed storage system", 32 | PersistentPreRun: configureServer, 33 | Run: func(cmd *cobra.Command, args []string) { 34 | cmd.Usage() 35 | os.Exit(1) 36 | }, 37 | } 38 | 39 | var versionCommand = &cobra.Command{ 40 | Use: "version", 41 | Short: "print version", 42 | Run: func(cmd *cobra.Command, args []string) { 43 | fmt.Printf("torusblk\nVersion: %s\n", torus.Version) 44 | os.Exit(0) 45 | }, 46 | } 47 | 48 | func init() { 49 | rootCommand.AddCommand(aoeCommand) 50 | rootCommand.AddCommand(versionCommand) 51 | rootCommand.AddCommand(completionCommand) 52 | 53 | // Flexvolume commands 54 | rootCommand.AddCommand(initCommand) 55 | rootCommand.AddCommand(attachCommand) 56 | rootCommand.AddCommand(detachCommand) 57 | rootCommand.AddCommand(mountCommand) 58 | rootCommand.AddCommand(unmountCommand) 59 | rootCommand.AddCommand(flexprepvolCommand) 60 | 61 | rootCommand.PersistentFlags().StringVarP(&logpkg, "logpkg", "", "", "Specific package logging") 62 | rootCommand.PersistentFlags().StringVarP(&httpAddr, "http", "", "", "HTTP endpoint for debug and stats") 63 | rootCommand.PersistentFlags().BoolVarP(&debug, "debug", "", false, "Turn on debug output") 64 | flagconfig.AddConfigFlags(rootCommand.PersistentFlags()) 65 | } 66 | 67 | func configureServer(cmd *cobra.Command, args []string) { 68 | switch { 69 | case debug: 70 | capnslog.SetGlobalLogLevel(capnslog.DEBUG) 71 | default: 72 | capnslog.SetGlobalLogLevel(capnslog.INFO) 73 | } 74 | if logpkg != "" { 75 | capnslog.SetGlobalLogLevel(capnslog.NOTICE) 76 | rl := capnslog.MustRepoLogger("github.com/coreos/torus") 77 | llc, err := rl.ParseLogLevelConfig(logpkg) 78 | if err 
!= nil { 79 | fmt.Fprintf(os.Stderr, "error parsing logpkg: %s\n", err) 80 | os.Exit(1) 81 | } 82 | rl.SetLogLevel(llc) 83 | } 84 | 85 | cfg = flagconfig.BuildConfigFromFlags() 86 | } 87 | 88 | func createServer() *torus.Server { 89 | srv, err := torus.NewServer(cfg, "etcd", "temp") 90 | if err != nil { 91 | fmt.Printf("Couldn't start: %s\n", err) 92 | os.Exit(1) 93 | } 94 | err = distributor.OpenReplication(srv) 95 | if err != nil { 96 | fmt.Printf("Couldn't start: %s", err) 97 | os.Exit(1) 98 | } 99 | if httpAddr != "" { 100 | go http.ServeHTTP(httpAddr, srv) 101 | } 102 | return srv 103 | } 104 | 105 | func main() { 106 | capnslog.SetGlobalLogLevel(capnslog.WARNING) 107 | 108 | if err := rootCommand.Execute(); err != nil { 109 | fmt.Fprintln(os.Stderr, err) 110 | os.Exit(1) 111 | } 112 | } 113 | 114 | func die(why string, args ...interface{}) { 115 | fmt.Fprintf(os.Stderr, why+"\n", args...) 116 | os.Exit(1) 117 | } 118 | -------------------------------------------------------------------------------- /cmd/torusblk/nbd.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "os/signal" 7 | 8 | "github.com/coreos/torus" 9 | "github.com/coreos/torus/block" 10 | "github.com/coreos/torus/internal/nbd" 11 | 12 | "github.com/spf13/cobra" 13 | ) 14 | 15 | var ( 16 | nbdCommand = &cobra.Command{ 17 | Use: "nbd VOLUME [NBD-DEV]", 18 | Short: "attach a block volume to an NBD device", 19 | Run: func(cmd *cobra.Command, args []string) { 20 | err := nbdAction(cmd, args) 21 | if err == torus.ErrUsage { 22 | cmd.Usage() 23 | os.Exit(1) 24 | } else if err != nil { 25 | die("%v", err) 26 | } 27 | }, 28 | } 29 | 30 | nbdServeCommand = &cobra.Command{ 31 | Use: "nbdserve", 32 | Short: "serve a block volume over the NBD protocol", 33 | Run: func(cmd *cobra.Command, args []string) { 34 | err := nbdServeAction(cmd, args) 35 | if err == torus.ErrUsage { 36 | cmd.Usage() 37 | os.Exit(1) 38 | } else if 
err != nil { 39 | die("%v", err) 40 | } 41 | }, 42 | } 43 | ) 44 | 45 | var ( 46 | serveListenAddress string 47 | detachDevice string 48 | ) 49 | 50 | func init() { 51 | rootCommand.AddCommand(nbdCommand) 52 | rootCommand.AddCommand(nbdServeCommand) 53 | 54 | nbdCommand.Flags().StringVarP(&detachDevice, "detach", "d", "", "detach an NBD device from a block volume. (e.g. torsublk nbd -d /dev/nbd0)") 55 | nbdServeCommand.Flags().StringVarP(&serveListenAddress, "listen", "l", "0.0.0.0:10809", "nbd server listen address") 56 | } 57 | 58 | func nbdAction(cmd *cobra.Command, args []string) error { 59 | if len(detachDevice) > 0 && len(args) == 0 { 60 | if err := nbd.Detach(detachDevice); err != nil { 61 | return fmt.Errorf("failed to detach: %v", err) 62 | } 63 | return nil 64 | } 65 | 66 | if len(args) != 1 && len(args) != 2 { 67 | return torus.ErrUsage 68 | } 69 | 70 | var knownDev string 71 | if len(args) == 2 { 72 | knownDev = args[1] 73 | } 74 | 75 | srv := createServer() 76 | 77 | signalChan := make(chan os.Signal, 1) 78 | signal.Notify(signalChan, os.Interrupt) 79 | 80 | closer := make(chan bool) 81 | go func() { 82 | for _ = range signalChan { 83 | fmt.Println("\nReceived an interrupt, disconnecting...") 84 | close(closer) 85 | } 86 | }() 87 | defer srv.Close() 88 | blockvol, err := block.OpenBlockVolume(srv, args[0]) 89 | if err != nil { 90 | return fmt.Errorf("server doesn't support block volumes: %s", err) 91 | } 92 | 93 | f, err := blockvol.OpenBlockFile() 94 | if err != nil { 95 | if err == torus.ErrLocked { 96 | return fmt.Errorf("volume %s is already mounted on another host", args[0]) 97 | } 98 | return fmt.Errorf("can't open block volume: %s", err) 99 | } 100 | defer f.Close() 101 | err = connectNBD(srv, f, knownDev, closer) 102 | if err != nil { 103 | return err 104 | } 105 | return nil 106 | } 107 | 108 | func connectNBD(srv *torus.Server, f *block.BlockFile, target string, closer chan bool) error { 109 | defer f.Close() 110 | size := f.Size() 111 | 112 
| gmd := srv.MDS.GlobalMetadata() 113 | 114 | handle := nbd.Create(f, int64(size), int64(gmd.BlockSize)) 115 | 116 | if target == "" { 117 | t, err := nbd.FindDevice() 118 | if err != nil { 119 | return err 120 | } 121 | target = t 122 | } 123 | 124 | _, err := handle.OpenDevice(target) 125 | if err != nil { 126 | return err 127 | } 128 | 129 | go func(n *nbd.NBD) { 130 | <-closer 131 | n.Disconnect() 132 | }(handle) 133 | 134 | err = handle.Serve() 135 | if err != nil { 136 | return fmt.Errorf("error from nbd server: %s", err) 137 | } 138 | return nil 139 | } 140 | 141 | type finder struct { 142 | srv *torus.Server 143 | } 144 | 145 | func (f *finder) FindDevice(name string) (nbd.Device, error) { 146 | blockvol, err := block.OpenBlockVolume(f.srv, name) 147 | if err != nil { 148 | return nil, err 149 | } 150 | 151 | return blockvol.OpenBlockFile() 152 | } 153 | 154 | func (f *finder) ListDevices() ([]string, error) { 155 | vols, _, err := f.srv.MDS.GetVolumes() 156 | if err != nil { 157 | return nil, err 158 | } 159 | 160 | var volnames []string 161 | 162 | for _, v := range vols { 163 | volnames = append(volnames, v.Name) 164 | } 165 | 166 | return volnames, nil 167 | } 168 | 169 | func nbdServeAction(cmd *cobra.Command, args []string) error { 170 | if len(args) != 0 { 171 | return torus.ErrUsage 172 | } 173 | 174 | srv := createServer() 175 | defer srv.Close() 176 | 177 | signalChan := make(chan os.Signal, 1) 178 | signal.Notify(signalChan, os.Interrupt) 179 | 180 | devfinder := &finder{srv} 181 | server, err := nbd.NewNBDServer(serveListenAddress, devfinder) 182 | if err != nil { 183 | return fmt.Errorf("can't start server: %v", err) 184 | } 185 | 186 | // TODO: sync all conns 187 | go func() { 188 | <-signalChan 189 | server.Close() 190 | return 191 | }() 192 | 193 | if err := server.Serve(); err != nil { 194 | return fmt.Errorf("server exited: %v", err) 195 | } 196 | return nil 197 | } 198 | 
-------------------------------------------------------------------------------- /cmd/torusblk/tcmu.go: -------------------------------------------------------------------------------- 1 | // +build linux 2 | 3 | package main 4 | 5 | import ( 6 | "fmt" 7 | "os" 8 | "os/signal" 9 | 10 | "github.com/coreos/torus" 11 | "github.com/coreos/torus/block" 12 | "github.com/spf13/cobra" 13 | 14 | "github.com/coreos/torus/internal/tcmu" 15 | ) 16 | 17 | var ( 18 | tcmuCommand = &cobra.Command{ 19 | Use: "tcmu VOLUME", 20 | Short: "attach a torus block volume via SCSI", 21 | Run: func(cmd *cobra.Command, args []string) { 22 | err := tcmuAction(cmd, args) 23 | if err == torus.ErrUsage { 24 | cmd.Usage() 25 | os.Exit(1) 26 | } else if err != nil { 27 | die("%v", err) 28 | } 29 | }, 30 | } 31 | ) 32 | 33 | func init() { 34 | rootCommand.AddCommand(tcmuCommand) 35 | } 36 | 37 | func tcmuAction(cmd *cobra.Command, args []string) error { 38 | if len(args) != 1 { 39 | return torus.ErrUsage 40 | } 41 | 42 | srv := createServer() 43 | 44 | signalChan := make(chan os.Signal, 1) 45 | signal.Notify(signalChan, os.Interrupt) 46 | 47 | closer := make(chan bool) 48 | go func() { 49 | for range signalChan { 50 | fmt.Println("\nReceived an interrupt, disconnecting...") 51 | close(closer) 52 | } 53 | }() 54 | defer srv.Close() 55 | blockvol, err := block.OpenBlockVolume(srv, args[0]) 56 | if err != nil { 57 | return fmt.Errorf("server doesn't support block volumes: %s", err) 58 | } 59 | 60 | f, err := blockvol.OpenBlockFile() 61 | if err != nil { 62 | if err == torus.ErrLocked { 63 | return fmt.Errorf("volume %s is already mounted on another host", args[0]) 64 | } 65 | return fmt.Errorf("can't open block volume: %s", err) 66 | } 67 | defer f.Close() 68 | err = torustcmu.ConnectAndServe(f, args[0], closer) 69 | if err != nil { 70 | return fmt.Errorf("failed to serve volume using SCSI: %s", err) 71 | } 72 | return nil 73 | } 74 | 
// blockCommand groups the block-volume subcommands (`torusctl block ...`).
var blockCommand = &cobra.Command{
	Use:   "block",
	Short: "interact with block volumes",
	Run:   blockAction,
}

// blockCreateCommand creates a new block volume of a given size.
var blockCreateCommand = &cobra.Command{
	Use:   "create NAME SIZE",
	Short: "create a block volume in the cluster",
	Long:  "creates a block volume named NAME of size SIZE bytes (G,GiB,M,MiB,etc suffixes accepted)",
	Run:   volumeCreateBlockAction,
}

func init() {
	blockCommand.AddCommand(blockCreateCommand)
	flagconfig.AddConfigFlags(blockCommand.PersistentFlags())
}

// blockAction runs when `block` is invoked without a subcommand: print
// usage and exit with an error status.
func blockAction(cmd *cobra.Command, args []string) {
	cmd.Usage()
	os.Exit(1)
}
20 | os.Exit(1) 21 | } 22 | 23 | func mustConnectToMDS() torus.MetadataService { 24 | cfg := flagconfig.BuildConfigFromFlags() 25 | mds, err := torus.CreateMetadataService("etcd", cfg) 26 | if err != nil { 27 | die("couldn't connect to etcd: %v", err) 28 | } 29 | return mds 30 | } 31 | 32 | func createServer() *torus.Server { 33 | cfg := flagconfig.BuildConfigFromFlags() 34 | srv, err := torus.NewServer(cfg, "etcd", "temp") 35 | if err != nil { 36 | die("Couldn't start: %s\n", err) 37 | } 38 | err = distributor.OpenReplication(srv) 39 | if err != nil { 40 | die("Couldn't start: %s", err) 41 | } 42 | return srv 43 | } 44 | 45 | func bytesOrIbytes(s uint64, si bool) string { 46 | if si { 47 | return humanize.Bytes(s) 48 | } 49 | return humanize.IBytes(s) 50 | } 51 | -------------------------------------------------------------------------------- /cmd/torusctl/completion.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "os" 5 | 6 | "github.com/spf13/cobra" 7 | ) 8 | 9 | var completionCommand = &cobra.Command{ 10 | Use: "completion", 11 | Short: "Output bash completion code", 12 | Run: completionAction, 13 | } 14 | 15 | func completionAction(cmd *cobra.Command, args []string) { 16 | cmd.Root().GenBashCompletion(os.Stdout) 17 | } 18 | -------------------------------------------------------------------------------- /cmd/torusctl/config.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "io/ioutil" 7 | "os" 8 | "os/user" 9 | "path/filepath" 10 | 11 | cli "github.com/coreos/torus/cliconfig" 12 | "github.com/coreos/torus/internal/flagconfig" 13 | "github.com/spf13/cobra" 14 | 15 | _ "github.com/coreos/torus/metadata/etcd" 16 | ) 17 | 18 | const ( 19 | defaultTorusConfigDir = "/.torus" 20 | defaultTorusConfigFile = "/config.json" 21 | ) 22 | 23 | var ( 24 | etcdAddress string 25 | etcdCertFile string 
26 | etcdKeyFile string 27 | etcdCAFile string 28 | config string 29 | profile string 30 | view bool 31 | ) 32 | 33 | var configCommand = &cobra.Command{ 34 | Use: "config", 35 | Short: "Write config file for torus commands", 36 | Run: configAction, 37 | } 38 | 39 | func init() { 40 | configCommand.Flags().StringVarP(&etcdAddress, "etcd", "C", "127.0.0.1:2379", "Address for talking to etcd") 41 | configCommand.Flags().StringVarP(&etcdCertFile, "etcd-cert-file", "", "", "Certificate to use to authenticate against etcd") 42 | configCommand.Flags().StringVarP(&etcdKeyFile, "etcd-key-file", "", "", "Key for Certificate") 43 | configCommand.Flags().StringVarP(&etcdCAFile, "etcd-ca-file", "", "", "CA to authenticate etcd against") 44 | configCommand.Flags().StringVarP(&config, "config", "", "", "path to torus config file") 45 | configCommand.Flags().StringVarP(&profile, "profile", "", "default", "profile to use in cli config file") 46 | configCommand.Flags().BoolVar(&view, "view", false, "view torus configuration and exit") 47 | 48 | } 49 | 50 | func configAction(cmd *cobra.Command, args []string) { 51 | var err error 52 | etcdCfg := cli.Etcd_config{ 53 | Etcd: etcdAddress, 54 | EtcdCertFile: etcdCertFile, 55 | EtcdKeyFile: etcdKeyFile, 56 | EtcdCAFile: etcdCAFile, 57 | } 58 | 59 | if config == "" { 60 | usr, err := user.Current() 61 | if err != nil { 62 | die("error getting user info: %v", err) 63 | } 64 | _ = os.Mkdir(usr.HomeDir+defaultTorusConfigDir, 0700) 65 | config = filepath.Join(usr.HomeDir + defaultTorusConfigDir + defaultTorusConfigFile) 66 | } 67 | 68 | if view { 69 | bdata, err := ioutil.ReadFile(config) 70 | if err != nil { 71 | die("error reading config %s: %v", config, err) 72 | } 73 | fmt.Println(string(bdata)) 74 | os.Exit(0) 75 | } 76 | 77 | var c *cli.TorusConfig 78 | if _, err = os.Stat(config); err != nil { 79 | c = &cli.TorusConfig{EtcdConfig: map[string]cli.Etcd_config{}} 80 | } else { 81 | c, err = flagconfig.LoadConfigFile(config) 82 | if err != 
// initCommand writes the cluster-wide metadata (block size, default
// block spec, initial ring) into etcd, preparing a new torus cluster.
var initCommand = &cobra.Command{
	Use:    "init",
	Short:  "Prepare a new torus cluster by creating the metadata",
	PreRun: initPreRun,
	Run:    initAction,
}

func init() {
	initCommand.Flags().StringVarP(&blockSizeStr, "block-size", "", "512KiB", "size of all data blocks in this storage cluster")
	initCommand.Flags().StringVarP(&blockSpec, "block-spec", "", "crc", "default replication/error correction applied to blocks in this storage cluster")
	initCommand.Flags().BoolVar(&noMakeRing, "no-ring", false, "do not create the default ring as part of init")
	initCommand.Flags().BoolVar(&metaView, "view", false, "view metadata configured in this storage cluster")
}
46 | if !strings.HasSuffix(blockSpec, ",base") && !strings.HasPrefix(blockSpec, "base") { 47 | blockSpec += ",base" 48 | } 49 | var err error 50 | blockSize, err = humanize.ParseBytes(blockSizeStr) 51 | if err != nil { 52 | die("error parsing block-size: %v", err) 53 | } 54 | } 55 | 56 | func initAction(cmd *cobra.Command, args []string) { 57 | var err error 58 | md := torus.GlobalMetadata{} 59 | md.BlockSize = blockSize 60 | md.DefaultBlockSpec, err = blockset.ParseBlockLayerSpec(blockSpec) 61 | if err != nil { 62 | die("error parsing block-spec: %v", err) 63 | } 64 | 65 | cfg := flagconfig.BuildConfigFromFlags() 66 | ringType := ring.Ketama 67 | if noMakeRing { 68 | ringType = ring.Empty 69 | } 70 | err = torus.InitMDS("etcd", cfg, md, ringType) 71 | if err != nil { 72 | die("error writing metadata: %v", err) 73 | } 74 | } 75 | 76 | func viewMetadata() { 77 | mds := mustConnectToMDS() 78 | md := mds.GlobalMetadata() 79 | 80 | var blockSpecToStrings = []string{ 81 | blockset.Base: "base", 82 | blockset.CRC: "crc", 83 | blockset.Replication: "rep", 84 | } 85 | blockSpec := "" 86 | for _, x := range md.DefaultBlockSpec { 87 | blockSpec += blockSpecToStrings[x.Kind] + " " 88 | } 89 | fmt.Printf("Block size: %d byte\n", md.BlockSize) 90 | fmt.Printf("Block spec: %s\n", blockSpec) 91 | } 92 | -------------------------------------------------------------------------------- /cmd/torusctl/list-peers.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "time" 7 | 8 | "github.com/dustin/go-humanize" 9 | "github.com/spf13/cobra" 10 | ) 11 | 12 | var ( 13 | outputAsCSV bool 14 | outputAsSI bool 15 | ) 16 | 17 | var listPeersCommand = &cobra.Command{ 18 | Use: "list-peers", 19 | Short: "show the active storage peers in the cluster", 20 | Run: listPeersAction, 21 | } 22 | 23 | func init() { 24 | listPeersCommand.Flags().BoolVarP(&outputAsCSV, "csv", "", false, "output as csv instead") 25 | 
// listPeersAction prints a table of every heartbeating storage peer,
// plus any ring members with no live peer entry, then (in non-CSV mode)
// a cluster-wide balance/usage summary line.
func listPeersAction(cmd *cobra.Command, args []string) {
	var totalStorage uint64
	var usedStorage uint64

	mds := mustConnectToMDS()
	gmd := mds.GlobalMetadata()
	peers, err := mds.GetPeers()
	if err != nil {
		die("couldn't get peers: %v", err)
	}
	ring, err := mds.GetRing()
	if err != nil {
		die("couldn't get ring: %v", err)
	}
	members := ring.Members()
	table := NewTableWriter(os.Stdout)
	table.SetHeader([]string{"Address", "UUID", "Size", "Used", "Member", "Updated", "Reb/Rep Data"})
	rebalancing := false
	for _, x := range peers {
		// A heartbeating peer not yet in the ring shows "Avail"; one in
		// the ring shows "OK".
		ringStatus := "Avail"
		if x.Address == "" {
			// Peers without an address are not serving storage; skip.
			continue
		}
		if members.Has(x.UUID) {
			ringStatus = "OK"
		}
		table.Append([]string{
			x.Address,
			x.UUID,
			bytesOrIbytes(x.TotalBlocks*gmd.BlockSize, outputAsSI),
			bytesOrIbytes(x.UsedBlocks*gmd.BlockSize, outputAsSI),
			ringStatus,
			// LastSeen is a nanosecond timestamp (second arg of time.Unix).
			humanize.Time(time.Unix(0, x.LastSeen)),
			// Rebalance throughput: bytes moved in the last rebalance
			// divided by the time since it finished, as bytes/sec. The +1
			// keeps the divisor nonzero when LastSeen equals
			// LastRebalanceFinish.
			bytesOrIbytes(x.RebalanceInfo.LastRebalanceBlocks*gmd.BlockSize*uint64(time.Second)/uint64(x.LastSeen+1-x.RebalanceInfo.LastRebalanceFinish), outputAsSI) + "/sec",
		})
		if x.RebalanceInfo.Rebalancing {
			rebalancing = true
		}
		totalStorage += x.TotalBlocks * gmd.BlockSize
		usedStorage += x.UsedBlocks * gmd.BlockSize
	}

	// Ring members with no matching live peer are listed as DOWN/Missing.
	for _, x := range members {
		ringStatus := "DOWN"
		ok := false
		for _, p := range peers {
			if p.UUID == x {
				ok = true
				break
			}
		}
		if ok {
			continue
		}
		table.Append([]string{
			"",
			x,
			"???",
			"???",
			ringStatus,
			"Missing",
			"",
		})
	}
	if outputAsCSV {
		table.RenderCSV()
	} else {
		table.Render()
		// NOTE(review): with zero reporting peers totalStorage is 0 and
		// the percentage is NaN — confirm that's acceptable output.
		fmt.Printf("Balanced: %v Usage: %5.2f%%\n", !rebalancing, (float64(usedStorage) / float64(totalStorage) * 100.0))
	}
}
out.Union(torus.PeerInfoList{p}) 70 | found = true 71 | } else if p.UUID == arg { 72 | out = out.Union(torus.PeerInfoList{p}) 73 | found = true 74 | } 75 | } 76 | } 77 | if !found { 78 | if !force { 79 | die("peer %s not currently healthy. To remove, use `--force`", arg) 80 | } 81 | out = out.Union(torus.PeerInfoList{&models.PeerInfo{ 82 | UUID: arg, 83 | }}) 84 | } 85 | } 86 | if allPeers { 87 | for _, p := range peers { 88 | if p.Address != "" { 89 | out = out.Union(torus.PeerInfoList{p}) 90 | } 91 | } 92 | } 93 | newPeers = out 94 | } 95 | 96 | func peerAddAction(cmd *cobra.Command, args []string) { 97 | if !allPeers && len(args) == 0 { 98 | die("need to specify one of peer's address, uuid or --all-peers") 99 | } 100 | if mds == nil { 101 | mds = mustConnectToMDS() 102 | } 103 | currentRing, err := mds.GetRing() 104 | if err != nil { 105 | die("couldn't get ring: %v", err) 106 | } 107 | var newRing torus.Ring 108 | if r, ok := currentRing.(torus.RingAdder); ok { 109 | newRing, err = r.AddPeers(newPeers) 110 | } else { 111 | die("current ring type cannot support adding") 112 | } 113 | if err != nil { 114 | die("couldn't add peer to ring: %v", err) 115 | } 116 | err = mds.SetRing(newRing) 117 | if err != nil { 118 | die("couldn't set new ring: %v", err) 119 | } 120 | } 121 | 122 | func peerRemoveAction(cmd *cobra.Command, args []string) { 123 | if mds == nil { 124 | mds = mustConnectToMDS() 125 | } 126 | currentRing, err := mds.GetRing() 127 | if err != nil { 128 | die("couldn't get ring: %v", err) 129 | } 130 | var newRing torus.Ring 131 | if r, ok := currentRing.(torus.RingRemover); ok { 132 | newRing, err = r.RemovePeers(newPeers.PeerList()) 133 | } else { 134 | die("current ring type cannot support removal") 135 | } 136 | if err != nil { 137 | die("couldn't remove peer from ring: %v", err) 138 | } 139 | err = mds.SetRing(newRing) 140 | if err != nil { 141 | die("couldn't set new ring: %v", err) 142 | } 143 | } 144 | 
-------------------------------------------------------------------------------- /cmd/torusctl/table.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/csv" 5 | "io" 6 | "strings" 7 | "text/tabwriter" 8 | ) 9 | 10 | type TableWriter struct { 11 | output io.Writer 12 | rows [][]string 13 | header []string 14 | } 15 | 16 | func NewTableWriter(output io.Writer) *TableWriter { 17 | return &TableWriter{ 18 | output: output, 19 | } 20 | } 21 | 22 | func (t *TableWriter) Render() { 23 | tw := tabwriter.NewWriter(t.output, 5, 1, 2, ' ', 0) 24 | capsHeader := make([]string, len(t.header)) 25 | for i, x := range t.header { 26 | capsHeader[i] = strings.ToUpper(x) 27 | } 28 | tw.Write([]byte(strings.Join(capsHeader, "\t") + "\n")) 29 | for _, x := range t.rows { 30 | tw.Write([]byte(strings.Join(x, "\t") + "\n")) 31 | } 32 | tw.Flush() 33 | } 34 | 35 | func (t *TableWriter) RenderCSV() { 36 | cw := csv.NewWriter(t.output) 37 | cw.WriteAll(t.rows) 38 | cw.Flush() 39 | } 40 | 41 | func (t *TableWriter) Append(s []string) { 42 | t.rows = append(t.rows, s) 43 | } 44 | 45 | func (t *TableWriter) SetHeader(s []string) { 46 | t.header = s 47 | } 48 | 49 | // TODO(barakmich) TableWriter.SortBy? 
50 | -------------------------------------------------------------------------------- /cmd/torusctl/torusctl.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | 7 | "github.com/coreos/pkg/capnslog" 8 | "github.com/coreos/torus" 9 | "github.com/coreos/torus/internal/flagconfig" 10 | "github.com/spf13/cobra" 11 | ) 12 | 13 | var ( 14 | debug bool 15 | ) 16 | 17 | var rootCommand = &cobra.Command{ 18 | Use: "torusctl", 19 | Short: "Administer the torus storage cluster", 20 | Long: `Admin utility for the torus distributed storage cluster.`, 21 | PersistentPreRun: configure, 22 | Run: func(cmd *cobra.Command, args []string) { 23 | cmd.Usage() 24 | os.Exit(1) 25 | }, 26 | } 27 | 28 | var versionCommand = &cobra.Command{ 29 | Use: "version", 30 | Short: "print version", 31 | Run: func(cmd *cobra.Command, args []string) { 32 | fmt.Printf("torusctl\nVersion: %s\n", torus.Version) 33 | os.Exit(0) 34 | }, 35 | } 36 | 37 | func init() { 38 | rootCommand.PersistentFlags().BoolVarP(&debug, "debug", "", false, "enable debug logging") 39 | rootCommand.AddCommand(initCommand) 40 | rootCommand.AddCommand(blockCommand) 41 | rootCommand.AddCommand(listPeersCommand) 42 | rootCommand.AddCommand(ringCommand) 43 | rootCommand.AddCommand(peerCommand) 44 | rootCommand.AddCommand(volumeCommand) 45 | rootCommand.AddCommand(versionCommand) 46 | rootCommand.AddCommand(wipeCommand) 47 | rootCommand.AddCommand(configCommand) 48 | rootCommand.AddCommand(completionCommand) 49 | flagconfig.AddConfigFlags(rootCommand.PersistentFlags()) 50 | } 51 | 52 | func main() { 53 | if err := rootCommand.Execute(); err != nil { 54 | die("%v", err) 55 | } 56 | } 57 | 58 | func configure(cmd *cobra.Command, args []string) { 59 | capnslog.SetGlobalLogLevel(capnslog.WARNING) 60 | 61 | if debug { 62 | capnslog.SetGlobalLogLevel(capnslog.DEBUG) 63 | } 64 | } 65 | 
-------------------------------------------------------------------------------- /cmd/torusctl/volume.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "os" 5 | 6 | "github.com/coreos/torus/block" 7 | "github.com/dustin/go-humanize" 8 | "github.com/spf13/cobra" 9 | ) 10 | 11 | var volumeCommand = &cobra.Command{ 12 | Use: "volume", 13 | Short: "manage volumes in the cluster", 14 | Run: volumeAction, 15 | } 16 | 17 | var volumeDeleteCommand = &cobra.Command{ 18 | Use: "delete NAME", 19 | Short: "delete a volume in the cluster", 20 | Run: volumeDeleteAction, 21 | } 22 | 23 | var volumeListCommand = &cobra.Command{ 24 | Use: "list", 25 | Short: "list volumes in the cluster", 26 | Run: volumeListAction, 27 | } 28 | 29 | var volumeCreateBlockCommand = &cobra.Command{ 30 | Use: "create-block NAME SIZE", 31 | Short: "create a block volume in the cluster", 32 | Long: "creates a block volume named NAME of size SIZE bytes (G,GiB,M,MiB,etc suffixes accepted)", 33 | Run: volumeCreateBlockAction, 34 | } 35 | 36 | func init() { 37 | volumeCommand.AddCommand(volumeDeleteCommand) 38 | volumeCommand.AddCommand(volumeListCommand) 39 | volumeCommand.AddCommand(volumeCreateBlockCommand) 40 | volumeListCommand.Flags().BoolVarP(&outputAsCSV, "csv", "", false, "output as csv instead") 41 | volumeListCommand.Flags().BoolVarP(&outputAsSI, "si", "", false, "output sizes in powers of 1000") 42 | } 43 | 44 | func volumeAction(cmd *cobra.Command, args []string) { 45 | cmd.Usage() 46 | os.Exit(1) 47 | } 48 | 49 | func volumeListAction(cmd *cobra.Command, args []string) { 50 | if len(args) != 0 { 51 | cmd.Usage() 52 | os.Exit(1) 53 | } 54 | mds := mustConnectToMDS() 55 | vols, _, err := mds.GetVolumes() 56 | if err != nil { 57 | die("error listing volumes: %v\n", err) 58 | } 59 | table := NewTableWriter(os.Stdout) 60 | table.SetHeader([]string{"Volume Name", "Size", "Type", "Status"}) 61 | for _, x := range vols { 62 | 
table.Append([]string{ 63 | x.Name, 64 | bytesOrIbytes(x.MaxBytes, outputAsSI), 65 | x.Type, 66 | mds.GetLockStatus(x.Id), 67 | }) 68 | } 69 | if outputAsCSV { 70 | table.RenderCSV() 71 | return 72 | } 73 | table.Render() 74 | } 75 | 76 | func volumeDeleteAction(cmd *cobra.Command, args []string) { 77 | if len(args) != 1 { 78 | cmd.Usage() 79 | os.Exit(1) 80 | } 81 | name := args[0] 82 | mds := mustConnectToMDS() 83 | vol, err := mds.GetVolume(name) 84 | if err != nil { 85 | die("cannot get volume %s (perhaps it doesn't exist): %v", name, err) 86 | } 87 | switch vol.Type { 88 | case "block": 89 | err = block.DeleteBlockVolume(mds, name) 90 | default: 91 | die("unknown volume type %s", vol.Type) 92 | } 93 | if err != nil { 94 | die("cannot delete volume: %v", err) 95 | } 96 | } 97 | 98 | func volumeCreateBlockAction(cmd *cobra.Command, args []string) { 99 | mds := mustConnectToMDS() 100 | if len(args) != 2 { 101 | cmd.Usage() 102 | os.Exit(1) 103 | } 104 | size, err := humanize.ParseBytes(args[1]) 105 | if err != nil { 106 | die("error parsing size %s: %v", args[1], err) 107 | } 108 | err = block.CreateBlockVolume(mds, args[0], size) 109 | if err != nil { 110 | die("error creating volume %s: %v", args[0], err) 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /cmd/torusctl/wipe.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "os" 7 | "strings" 8 | 9 | "github.com/spf13/cobra" 10 | 11 | "github.com/coreos/torus" 12 | "github.com/coreos/torus/internal/flagconfig" 13 | _ "github.com/coreos/torus/metadata/etcd" 14 | ) 15 | 16 | var ( 17 | yesIAmSurePleaseWipe bool 18 | ) 19 | var wipeCommand = &cobra.Command{ 20 | Use: "wipe", 21 | Short: "Remove all torus metadata from etcd", 22 | Run: wipeAction, 23 | } 24 | 25 | func init() { 26 | wipeCommand.Flags().BoolVarP(&yesIAmSurePleaseWipe, "yes-i-am-sure", "", false, 
"progamatically wipe everything from the metadata store") 27 | } 28 | 29 | func wipeAction(cmd *cobra.Command, args []string) { 30 | if !yesIAmSurePleaseWipe { 31 | reader := bufio.NewReader(os.Stdin) 32 | fmt.Println("This will wipe all metadata for torus.\nPlease type `YES`, all caps to confirm: ") 33 | text, _ := reader.ReadString('\n') 34 | text = strings.TrimSpace(text) 35 | if text != "YES" { 36 | fmt.Println("`YES` not entered, exiting") 37 | os.Exit(1) 38 | } 39 | } 40 | cfg := flagconfig.BuildConfigFromFlags() 41 | err := torus.WipeMDS("etcd", cfg) 42 | if err != nil { 43 | die("error wiping metadata: %v", err) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /config.go: -------------------------------------------------------------------------------- 1 | package torus 2 | 3 | import "crypto/tls" 4 | 5 | type Config struct { 6 | DataDir string 7 | StorageSize uint64 8 | MetadataAddress string 9 | ReadCacheSize uint64 10 | ReadLevel ReadLevel 11 | WriteLevel WriteLevel 12 | 13 | TLS *tls.Config 14 | } 15 | -------------------------------------------------------------------------------- /contrib/kubernetes/postgres-oneshot.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | labels: 5 | name: postgres-torus 6 | name: postgres-torus 7 | spec: 8 | type: NodePort 9 | ports: 10 | - port: 5432 11 | targetPort: postgres-client 12 | nodePort: 30432 13 | selector: 14 | app: postgres-torus 15 | --- 16 | apiVersion: extensions/v1beta1 17 | kind: Deployment 18 | metadata: 19 | name: postgres-torus 20 | labels: 21 | app: postgres-torus 22 | spec: 23 | replicas: 1 24 | template: 25 | metadata: 26 | labels: 27 | app: postgres-torus 28 | spec: 29 | containers: 30 | - image: postgres 31 | name: postgres 32 | ports: 33 | - name: postgres-client 34 | containerPort: 5432 35 | volumeMounts: 36 | - name: data 37 | mountPath: 
/var/lib/postgresql/data 38 | env: 39 | - name: POD_IP 40 | valueFrom: 41 | fieldRef: 42 | fieldPath: status.podIP 43 | - name: POSTGRES_PASSWORD 44 | value: testtorus 45 | - name: PGDATA 46 | value: "/var/lib/postgresql/data/pgdata" 47 | volumes: 48 | - name: data 49 | flexVolume: 50 | driver: "coreos.com/torus" 51 | fsType: "ext4" 52 | options: 53 | volume: "pg1" 54 | etcd: "10.3.0.100:2379" 55 | -------------------------------------------------------------------------------- /contrib/kubernetes/test-data.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE films ( 2 | code char(5) CONSTRAINT firstkey PRIMARY KEY, 3 | title varchar(40) NOT NULL, 4 | did integer NOT NULL, 5 | date_prod date, 6 | kind varchar(10), 7 | len interval hour to minute 8 | ); 9 | 10 | INSERT INTO films VALUES 11 | ('UA502', 'Bananas', 105, DEFAULT, 'Comedy', '82 minutes'); 12 | INSERT INTO films (code, title, did, date_prod, kind) 13 | VALUES ('T_601', 'Yojimbo', 106, DEFAULT, 'Drama'); 14 | -------------------------------------------------------------------------------- /contrib/kubernetes/torus-k8s-oneshot.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | labels: 5 | name: etcd-torus 6 | name: etcd-torus 7 | spec: 8 | type: NodePort 9 | ports: 10 | - port: 2379 11 | name: etcd-client 12 | targetPort: etcd-client 13 | nodePort: 32379 14 | selector: 15 | name: etcd-torus 16 | --- 17 | apiVersion: v1 18 | kind: Service 19 | metadata: 20 | labels: 21 | name: etcd-torus-internal 22 | name: etcd-torus-internal 23 | spec: 24 | clusterIP: 10.3.0.100 25 | ports: 26 | - port: 2379 27 | name: etcd-client 28 | targetPort: etcd-client 29 | selector: 30 | name: etcd-torus 31 | --- 32 | apiVersion: v1 33 | kind: Pod 34 | metadata: 35 | labels: 36 | name: etcd-torus 37 | name: etcd-torus 38 | spec: 39 | containers: 40 | - image: quay.io/coreos/etcd:v3.0.0-beta.0 
41 | name: etcd-torus 42 | ports: 43 | - name: etcd-peers 44 | containerPort: 2380 45 | - name: etcd-client 46 | containerPort: 2379 47 | volumeMounts: 48 | - name: data 49 | mountPath: /var/lib/etcd 50 | env: 51 | - name: POD_IP 52 | valueFrom: 53 | fieldRef: 54 | fieldPath: status.podIP 55 | - name: ETCD_DATA_DIR 56 | value: /var/lib/etcd 57 | - name: ETCD_NAME 58 | value: etcd 59 | - name: ETCD_INITIAL_CLUSTER 60 | value: etcd=http://$(POD_IP):2380 61 | - name: ETCD_INITIAL_ADVERTISE_PEER_URLS 62 | value: http://$(POD_IP):2380 63 | - name: ETCD_ADVERTISE_CLIENT_URLS 64 | value: http://$(POD_IP):2379 65 | - name: ETCD_LISTEN_CLIENT_URLS 66 | value: http://0.0.0.0:2379 67 | - name: ETCD_LISTEN_PEER_URLS 68 | value: http://$(POD_IP):2380 69 | volumes: 70 | - name: data 71 | emptyDir: {} 72 | --- 73 | apiVersion: extensions/v1beta1 74 | kind: DaemonSet 75 | metadata: 76 | name: torus 77 | labels: 78 | app: torus 79 | spec: 80 | template: 81 | metadata: 82 | name: torus 83 | labels: 84 | daemon: torus 85 | spec: 86 | containers: 87 | - name: torus 88 | image: quay.io/coreos/torus:latest 89 | ports: 90 | - name: peer 91 | containerPort: 40000 92 | - name: http 93 | containerPort: 4321 94 | env: 95 | - name: ETCD_HOST 96 | value: $(ETCD_TORUS_SERVICE_HOST) 97 | - name: STORAGE_SIZE 98 | value: 2GiB 99 | - name: LISTEN_HOST 100 | valueFrom: 101 | fieldRef: 102 | fieldPath: status.podIP 103 | - name: AUTO_JOIN 104 | value: "1" 105 | - name: DEBUG_INIT 106 | value: "1" 107 | - name: DROP_MOUNT_BIN 108 | value: "0" 109 | volumeMounts: 110 | - name: data 111 | mountPath: /data 112 | readOnly: false 113 | volumes: 114 | - name: data 115 | hostPath: 116 | path: /srv/torus 117 | imagePullSecrets: 118 | - name: quay-torus 119 | -------------------------------------------------------------------------------- /contrib/systemd/README.md: -------------------------------------------------------------------------------- 1 | # Run torusd with systemd 2 | 3 | ### Prerequisites 4 | 5 | 
We assume that `torusd` is installed to `/usr/bin/torusd`.

### Configuration

#### 1) copy torusd.service to `/etc/systemd/system/torusd.service`

~~~
cp ./torusd.service /etc/systemd/system/torusd.service
~~~

#### 2) copy torusd to `/etc/sysconfig/torusd`

~~~
cp ./torusd /etc/sysconfig/torusd
~~~

#### 3) Edit `/etc/sysconfig/torusd`

`--peer-address` and `--etcd` are the mandatory settings to configure for your environment.

~~~
TORUSD_PEER_ADDRESS="--peer-address=http://:4000"
TORUSD_ETCD="--etcd=:"
~~~

#### 4) Now you can control torusd with systemd.

~~~
systemctl start torusd
~~~
-------------------------------------------------------------------------------- /contrib/systemd/torusd: --------------------------------------------------------------------------------
##
# torusd system config. Please refer to "torusd -h" for more details.
##

# Address to listen on for intra-cluster data
TORUSD_PEER_ADDRESS="--peer-address=http://127.0.0.1:4000"

# Etcd configuration.
Specify cert/key files 9 | TORUSD_ETCD="--etcd=127.0.0.1:2379" 10 | 11 | # Path to the data directory 12 | DATA_DIR="--data-dir=/var/lib/torusd/" 13 | 14 | # Other torud options 15 | TORUSD_OPTIONS="--host=127.0.0.1 --port=4321 --auto-join" 16 | -------------------------------------------------------------------------------- /contrib/systemd/torusd.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Torusd distributed storage server 3 | 4 | [Service] 5 | EnvironmentFile=/etc/sysconfig/torusd 6 | ExecStart=/usr/bin/torusd $TORUSD_PEER_ADDRESS $TORUSD_ETCD $DATA_DIR $TORUSD_OPTIONS 7 | ExecReload=/bin/kill -HUP $MAINPID 8 | Restart=on-failure 9 | #User=torus 10 | #Group=torus 11 | 12 | [Install] 13 | WantedBy=multi-user.target 14 | -------------------------------------------------------------------------------- /dev-internal/release.md: -------------------------------------------------------------------------------- 1 | # Torus release guide 2 | 3 | This (evolving) guide outlines the exact steps necessary to do a release. 4 | 5 | ## Name the version 6 | 7 | ``` 8 | export VERSION=v0.1.0 9 | ``` 10 | 11 | ## Tag and push the tag 12 | 13 | ``` 14 | git tag $VERSION 15 | git push origin $VERSION 16 | ``` 17 | 18 | ## Build release binaries 19 | 20 | `make` respects the VERSION environment variable. You'll need [goxc](https://github.com/laher/goxc) 21 | 22 | ``` 23 | make release 24 | ``` 25 | 26 | ## Publish release on Github 27 | 28 | - Set release title as the version name. 29 | - Follow the format of previous release pages. 30 | - Attach the generated `.tar.gz`s 31 | - Select whether it is a pre-release. 32 | - Publish the release! 
33 | 34 | ## Tag release on Quay.io 35 | -------------------------------------------------------------------------------- /distributor/client.go: -------------------------------------------------------------------------------- 1 | package distributor 2 | 3 | import ( 4 | "net/url" 5 | "sync" 6 | "time" 7 | 8 | "github.com/coreos/torus" 9 | "github.com/coreos/torus/distributor/protocols" 10 | "golang.org/x/net/context" 11 | ) 12 | 13 | const ( 14 | connectTimeout = 2 * time.Second 15 | rebalanceClientTimeout = 5 * time.Second 16 | clientTimeout = 500 * time.Millisecond 17 | writeClientTimeout = 2000 * time.Millisecond 18 | ) 19 | 20 | // TODO(barakmich): Clean up errors 21 | 22 | type distClient struct { 23 | dist *Distributor 24 | //TODO(barakmich): Better connection pooling 25 | openConns map[string]protocols.RPC 26 | mut sync.Mutex 27 | } 28 | 29 | func newDistClient(d *Distributor) *distClient { 30 | 31 | client := &distClient{ 32 | dist: d, 33 | openConns: make(map[string]protocols.RPC), 34 | } 35 | d.srv.AddTimeoutCallback(client.onPeerTimeout) 36 | return client 37 | } 38 | 39 | func (d *distClient) onPeerTimeout(uuid string) { 40 | d.mut.Lock() 41 | defer d.mut.Unlock() 42 | conn, ok := d.openConns[uuid] 43 | if !ok { 44 | return 45 | } 46 | err := conn.Close() 47 | if err != nil { 48 | clog.Errorf("peer timeout err on close: %s", err) 49 | } 50 | delete(d.openConns, uuid) 51 | 52 | } 53 | func (d *distClient) getConn(uuid string) protocols.RPC { 54 | d.mut.Lock() 55 | if conn, ok := d.openConns[uuid]; ok { 56 | d.mut.Unlock() 57 | return conn 58 | } 59 | d.mut.Unlock() 60 | pm := d.dist.srv.GetPeerMap() 61 | pi := pm[uuid] 62 | if pi == nil { 63 | // We know this UUID exists, we don't have an address for it, let's refresh now. 
64 | pm := d.dist.srv.UpdatePeerMap() 65 | pi = pm[uuid] 66 | if pi == nil { 67 | // Not much more we can try 68 | return nil 69 | } 70 | } 71 | if pi.TimedOut { 72 | return nil 73 | } 74 | uri, err := url.Parse(pi.Address) 75 | if err != nil { 76 | clog.Errorf("couldn't parse address %s: %v", pi.Address, err) 77 | return nil 78 | } 79 | gmd := d.dist.srv.MDS.GlobalMetadata() 80 | conn, err := protocols.DialRPC(uri, connectTimeout, gmd) 81 | d.mut.Lock() 82 | defer d.mut.Unlock() 83 | if err != nil { 84 | clog.Errorf("couldn't dial: %v", err) 85 | return nil 86 | } 87 | d.openConns[uuid] = conn 88 | return conn 89 | } 90 | 91 | func (d *distClient) resetConn(uuid string) { 92 | d.mut.Lock() 93 | defer d.mut.Unlock() 94 | conn, ok := d.openConns[uuid] 95 | if !ok { 96 | return 97 | } 98 | delete(d.openConns, uuid) 99 | err := conn.Close() 100 | if err != nil { 101 | clog.Errorf("error resetConn: %s", err) 102 | } 103 | } 104 | 105 | func (d *distClient) Close() error { 106 | d.mut.Lock() 107 | defer d.mut.Unlock() 108 | for _, c := range d.openConns { 109 | err := c.Close() 110 | if err != nil { 111 | return err 112 | } 113 | } 114 | return nil 115 | } 116 | 117 | func (d *distClient) GetBlock(ctx context.Context, uuid string, b torus.BlockRef) ([]byte, error) { 118 | conn := d.getConn(uuid) 119 | if conn == nil { 120 | return nil, torus.ErrNoPeer 121 | } 122 | data, err := conn.Block(ctx, b) 123 | if err != nil { 124 | d.resetConn(uuid) 125 | clog.Debug(err) 126 | return nil, torus.ErrBlockUnavailable 127 | } 128 | return data, nil 129 | } 130 | 131 | func (d *distClient) PutBlock(ctx context.Context, uuid string, b torus.BlockRef, data []byte) error { 132 | conn := d.getConn(uuid) 133 | if conn == nil { 134 | return torus.ErrNoPeer 135 | } 136 | err := conn.PutBlock(ctx, b, data) 137 | if err != nil { 138 | d.resetConn(uuid) 139 | if err == context.DeadlineExceeded { 140 | return torus.ErrBlockUnavailable 141 | } 142 | } 143 | return err 144 | } 145 | 146 | func 
(d *distClient) Check(ctx context.Context, uuid string, blks []torus.BlockRef) ([]bool, error) { 147 | conn := d.getConn(uuid) 148 | if conn == nil { 149 | return nil, torus.ErrNoPeer 150 | } 151 | resp, err := conn.RebalanceCheck(ctx, blks) 152 | if err != nil { 153 | d.resetConn(uuid) 154 | return nil, err 155 | } 156 | return resp, nil 157 | } 158 | -------------------------------------------------------------------------------- /distributor/distributor.go: -------------------------------------------------------------------------------- 1 | // distributor is a complex implementation of a Torus storage interface, that 2 | // understands rebalancing it's underlying storage and fetching data from peers, 3 | // as necessary. 4 | package distributor 5 | 6 | import ( 7 | "net/url" 8 | "sync" 9 | 10 | "github.com/coreos/pkg/capnslog" 11 | "github.com/coreos/torus" 12 | "github.com/coreos/torus/distributor/protocols" 13 | "github.com/coreos/torus/distributor/rebalance" 14 | "github.com/coreos/torus/gc" 15 | ) 16 | 17 | var ( 18 | clog = capnslog.NewPackageLogger("github.com/coreos/torus", "distributor") 19 | ) 20 | 21 | type Distributor struct { 22 | mut sync.RWMutex 23 | blocks torus.BlockStore 24 | srv *torus.Server 25 | client *distClient 26 | rpcSrv protocols.RPCServer 27 | readCache *cache 28 | 29 | ring torus.Ring 30 | closed bool 31 | rebalancerChan chan struct{} 32 | ringWatcherChan chan struct{} 33 | rebalancer rebalance.Rebalancer 34 | rebalancing bool 35 | } 36 | 37 | func newDistributor(srv *torus.Server, addr *url.URL) (*Distributor, error) { 38 | var err error 39 | d := &Distributor{ 40 | blocks: srv.Blocks, 41 | srv: srv, 42 | } 43 | gmd := d.srv.MDS.GlobalMetadata() 44 | if addr != nil { 45 | d.rpcSrv, err = protocols.ListenRPC(addr, d, gmd) 46 | if err != nil { 47 | return nil, err 48 | } 49 | } 50 | if srv.Cfg.ReadCacheSize != 0 { 51 | size := srv.Cfg.ReadCacheSize / gmd.BlockSize 52 | if size < 100 { 53 | size = 100 54 | } 55 | d.readCache = 
newCache(int(size)) 56 | } 57 | 58 | // Set up the rebalancer 59 | d.ring, err = d.srv.MDS.GetRing() 60 | if err != nil { 61 | return nil, err 62 | } 63 | d.ringWatcherChan = make(chan struct{}) 64 | go d.ringWatcher(d.rebalancerChan) 65 | d.client = newDistClient(d) 66 | g := gc.NewGCController(d.srv, torus.NewINodeStore(d)) 67 | d.rebalancer = rebalance.NewRebalancer(d, d.blocks, d.client, g) 68 | d.rebalancerChan = make(chan struct{}) 69 | go d.rebalanceTicker(d.rebalancerChan) 70 | return d, nil 71 | } 72 | 73 | func (d *Distributor) UUID() string { 74 | return d.srv.MDS.UUID() 75 | } 76 | 77 | func (d *Distributor) Ring() torus.Ring { 78 | d.mut.RLock() 79 | defer d.mut.RUnlock() 80 | return d.ring 81 | } 82 | 83 | func (d *Distributor) Close() error { 84 | d.mut.Lock() 85 | defer d.mut.Unlock() 86 | if d.closed { 87 | return nil 88 | } 89 | close(d.rebalancerChan) 90 | close(d.ringWatcherChan) 91 | if d.rpcSrv != nil { 92 | d.rpcSrv.Close() 93 | } 94 | d.client.Close() 95 | err := d.blocks.Close() 96 | if err != nil { 97 | return err 98 | } 99 | d.closed = true 100 | return nil 101 | } 102 | -------------------------------------------------------------------------------- /distributor/distributor_test.go: -------------------------------------------------------------------------------- 1 | package distributor 2 | 3 | import ( 4 | "fmt" 5 | "net/url" 6 | "testing" 7 | "time" 8 | 9 | "github.com/coreos/torus" 10 | "github.com/coreos/torus/metadata/temp" 11 | 12 | _ "github.com/coreos/torus/storage" 13 | ) 14 | 15 | func newServer(md *temp.Server) *torus.Server { 16 | cfg := torus.Config{ 17 | StorageSize: 100 * 1024 * 1024, 18 | } 19 | mds := temp.NewClient(cfg, md) 20 | gmd := mds.GlobalMetadata() 21 | blocks, _ := torus.CreateBlockStore("temp", "current", cfg, gmd) 22 | s, _ := torus.NewServerByImpl(cfg, mds, blocks) 23 | return s 24 | } 25 | 26 | func createThree(t *testing.T) ([]*torus.Server, *temp.Server) { 27 | var out []*torus.Server 28 | s := 
temp.NewServer() 29 | for i := 0; i < 3; i++ { 30 | srv := newServer(s) 31 | addr := fmt.Sprintf("http://127.0.0.1:%d", 40000+i) 32 | uri, err := url.Parse(addr) 33 | if err != nil { 34 | t.Fatal(err) 35 | } 36 | err = ListenReplication(srv, uri) 37 | if err != nil { 38 | t.Fatal(err) 39 | } 40 | out = append(out, srv) 41 | } 42 | // Heartbeat 43 | time.Sleep(10 * time.Millisecond) 44 | return out, s 45 | } 46 | 47 | func closeAll(t *testing.T, c ...*torus.Server) { 48 | for _, x := range c { 49 | err := x.Close() 50 | if err != nil { 51 | t.Fatal(err) 52 | } 53 | } 54 | } 55 | 56 | func TestWithThree(t *testing.T) { 57 | t.Log("OpenClose") 58 | testOpenClose(t) 59 | } 60 | 61 | func testOpenClose(t *testing.T) { 62 | srvs, md := createThree(t) 63 | p, err := srvs[0].MDS.GetPeers() 64 | if err != nil { 65 | t.Fatal(err) 66 | } 67 | if len(p) != 3 { 68 | t.Fatalf("expected 3 nodes, got %d", len(p)) 69 | } 70 | closeAll(t, srvs...) 71 | md.Close() 72 | } 73 | -------------------------------------------------------------------------------- /distributor/lru.go: -------------------------------------------------------------------------------- 1 | package distributor 2 | 3 | import ( 4 | "container/list" 5 | "sync" 6 | ) 7 | 8 | // cache implements an LRU cache. 
// cache is a small, mutex-guarded LRU cache mapping string keys to
// arbitrary values. A nil *cache is valid and behaves as a no-op cache
// (Put is ignored, Get always misses).
type cache struct {
	cache    map[string]*list.Element // key -> element in priority
	priority *list.List               // front = most recently used
	maxSize  int
	mut      sync.Mutex
}

// kv is the payload stored in each list element.
type kv struct {
	key   string
	value interface{}
}

// newCache returns an empty LRU cache holding at most size entries.
func newCache(size int) *cache {
	return &cache{
		maxSize:  size,
		priority: list.New(),
		cache:    make(map[string]*list.Element),
	}
}

// Put inserts key/value as the most recently used entry, evicting the
// least recently used entry when the cache is full. If key is already
// present, its value is updated — the previous implementation returned
// early and silently kept the stale value.
func (lru *cache) Put(key string, value interface{}) {
	if lru == nil {
		return
	}
	lru.mut.Lock()
	defer lru.mut.Unlock()
	if element, ok := lru.cache[key]; ok {
		// Refresh recency and overwrite the stored value.
		lru.priority.MoveToFront(element)
		element.Value = kv{key: key, value: value}
		return
	}
	if len(lru.cache) == lru.maxSize {
		lru.removeOldest()
	}
	lru.cache[key] = lru.priority.PushFront(kv{key: key, value: value})
}

// Get returns the value stored for key and whether it was present,
// marking the entry most recently used on a hit.
func (lru *cache) Get(key string) (interface{}, bool) {
	if lru == nil {
		return nil, false
	}
	lru.mut.Lock()
	defer lru.mut.Unlock()
	return lru.get(key)
}

// get is the unsynchronized lookup core; callers must hold lru.mut.
func (lru *cache) get(key string) (interface{}, bool) {
	if element, ok := lru.cache[key]; ok {
		lru.priority.MoveToFront(element)
		return element.Value.(kv).value, true
	}
	return nil, false
}

// removeOldest evicts the least recently used entry; callers must hold
// lru.mut and guarantee the cache is non-empty.
func (lru *cache) removeOldest() {
	last := lru.priority.Remove(lru.priority.Back())
	delete(lru.cache, last.(kv).key)
}
Help: "Number of blocks returned from read cache of the distributor layer", 14 | }) 15 | promDistBlockLocalHits = prometheus.NewCounter(prometheus.CounterOpts{ 16 | Name: "torus_distributor_block_local_blocks", 17 | Help: "Number of blocks returned from local storage", 18 | }) 19 | promDistBlockLocalFailures = prometheus.NewCounter(prometheus.CounterOpts{ 20 | Name: "torus_distributor_block_local_block_fails", 21 | Help: "Number of blocks requested from local storage that weren't found", 22 | }) 23 | promDistBlockPeerHits = prometheus.NewCounterVec(prometheus.CounterOpts{ 24 | Name: "torus_distributor_block_peer_blocks", 25 | Help: "Number of blocks returned from another peer in the cluster", 26 | }, []string{"peer"}) 27 | promDistBlockPeerFailures = prometheus.NewCounterVec(prometheus.CounterOpts{ 28 | Name: "torus_distributor_block_peer_block_fails", 29 | Help: "Number of failures incurred in retrieving a block from a peer", 30 | }, []string{"peer"}) 31 | promDistBlockFailures = prometheus.NewCounter(prometheus.CounterOpts{ 32 | Name: "torus_distributor_block_request_failures", 33 | Help: "Number of failed block requests", 34 | }) 35 | // RPCs 36 | promDistPutBlockRPCs = prometheus.NewCounter(prometheus.CounterOpts{ 37 | Name: "torus_distributor_put_block_rpcs_total", 38 | Help: "Number of PutBlock RPCs made to this node", 39 | }) 40 | promDistPutBlockRPCFailures = prometheus.NewCounter(prometheus.CounterOpts{ 41 | Name: "torus_distributor_put_block_rpc_failures", 42 | Help: "Number of PutBlock RPCs with errors", 43 | }) 44 | promDistBlockRPCs = prometheus.NewCounter(prometheus.CounterOpts{ 45 | Name: "torus_distributor_block_rpcs_total", 46 | Help: "Number of PutBlock RPCs made to this node", 47 | }) 48 | promDistBlockRPCFailures = prometheus.NewCounter(prometheus.CounterOpts{ 49 | Name: "torus_distributor_block_rpc_failures", 50 | Help: "Number of PutBlock RPCs with errors", 51 | }) 52 | promDistRebalanceRPCs = prometheus.NewCounter(prometheus.CounterOpts{ 53 | 
Name: "torus_distributor_rebalance_rpcs_total", 54 | Help: "Number of Rebalance RPCs made to this node", 55 | }) 56 | promDistRebalanceRPCFailures = prometheus.NewCounter(prometheus.CounterOpts{ 57 | Name: "torus_distributor_rebalance_rpc_failures", 58 | Help: "Number of Rebalance RPCs with errors", 59 | }) 60 | ) 61 | 62 | func init() { 63 | // Block 64 | prometheus.MustRegister(promDistBlockRequests) 65 | prometheus.MustRegister(promDistBlockCacheHits) 66 | prometheus.MustRegister(promDistBlockLocalHits) 67 | prometheus.MustRegister(promDistBlockLocalFailures) 68 | prometheus.MustRegister(promDistBlockPeerHits) 69 | prometheus.MustRegister(promDistBlockPeerFailures) 70 | prometheus.MustRegister(promDistBlockFailures) 71 | // RPC 72 | prometheus.MustRegister(promDistPutBlockRPCs) 73 | prometheus.MustRegister(promDistPutBlockRPCFailures) 74 | prometheus.MustRegister(promDistBlockRPCs) 75 | prometheus.MustRegister(promDistBlockRPCFailures) 76 | prometheus.MustRegister(promDistRebalanceRPCs) 77 | prometheus.MustRegister(promDistRebalanceRPCFailures) 78 | } 79 | -------------------------------------------------------------------------------- /distributor/protocols/grpc/grpc.go: -------------------------------------------------------------------------------- 1 | package grpc 2 | 3 | import ( 4 | "net" 5 | "net/url" 6 | "strings" 7 | "time" 8 | 9 | "google.golang.org/grpc" 10 | 11 | "golang.org/x/net/context" 12 | 13 | "github.com/coreos/torus" 14 | "github.com/coreos/torus/distributor/protocols" 15 | "github.com/coreos/torus/models" 16 | ) 17 | 18 | const defaultPort = "40000" 19 | 20 | func init() { 21 | protocols.RegisterRPCListener("http", grpcRPCListener) 22 | protocols.RegisterRPCDialer("http", grpcRPCDialer) 23 | } 24 | 25 | func grpcRPCListener(url *url.URL, hdl protocols.RPC, gmd torus.GlobalMetadata) (protocols.RPCServer, error) { 26 | out := &handler{ 27 | handle: hdl, 28 | } 29 | h := url.Host 30 | if !strings.Contains(h, ":") { 31 | h = net.JoinHostPort(h, 
defaultPort) 32 | } 33 | lis, err := net.Listen("tcp", h) 34 | if err != nil { 35 | return nil, err 36 | } 37 | out.grpc = grpc.NewServer() 38 | models.RegisterTorusStorageServer(out.grpc, out) 39 | go out.grpc.Serve(lis) 40 | return out, nil 41 | } 42 | 43 | func grpcRPCDialer(url *url.URL, timeout time.Duration, gmd torus.GlobalMetadata) (protocols.RPC, error) { 44 | h := url.Host 45 | if !strings.Contains(h, ":") { 46 | h = net.JoinHostPort(h, defaultPort) 47 | } 48 | conn, err := grpc.Dial(h, grpc.WithInsecure(), grpc.WithTimeout(timeout)) 49 | if err != nil { 50 | return nil, err 51 | } 52 | return &client{ 53 | conn: conn, 54 | handler: models.NewTorusStorageClient(conn), 55 | }, nil 56 | } 57 | 58 | type client struct { 59 | conn *grpc.ClientConn 60 | handler models.TorusStorageClient 61 | } 62 | 63 | func (c *client) Close() error { 64 | return c.conn.Close() 65 | } 66 | 67 | func (c *client) PutBlock(ctx context.Context, ref torus.BlockRef, data []byte) error { 68 | _, err := c.handler.PutBlock(ctx, &models.PutBlockRequest{ 69 | Refs: []*models.BlockRef{ 70 | ref.ToProto(), 71 | }, 72 | Blocks: [][]byte{ 73 | data, 74 | }, 75 | }) 76 | return err 77 | } 78 | 79 | func (c *client) Block(ctx context.Context, ref torus.BlockRef) ([]byte, error) { 80 | resp, err := c.handler.Block(ctx, &models.BlockRequest{ 81 | BlockRef: ref.ToProto(), 82 | }) 83 | if err != nil { 84 | return nil, err 85 | } 86 | return resp.Data, nil 87 | } 88 | 89 | func (c *client) RebalanceCheck(ctx context.Context, refs []torus.BlockRef) ([]bool, error) { 90 | req := &models.RebalanceCheckRequest{} 91 | for _, x := range refs { 92 | req.BlockRefs = append(req.BlockRefs, x.ToProto()) 93 | } 94 | resp, err := c.handler.RebalanceCheck(ctx, req) 95 | if err != nil { 96 | return nil, err 97 | } 98 | return resp.Valid, nil 99 | } 100 | 101 | func (c *client) WriteBuf(ctx context.Context, ref torus.BlockRef) ([]byte, error) { 102 | panic("unimplemented") 103 | } 104 | 105 | type handler struct 
{ 106 | handle protocols.RPC 107 | grpc *grpc.Server 108 | } 109 | 110 | func (h *handler) Block(ctx context.Context, req *models.BlockRequest) (*models.BlockResponse, error) { 111 | data, err := h.handle.Block(ctx, torus.BlockFromProto(req.BlockRef)) 112 | if err != nil { 113 | return nil, err 114 | } 115 | return &models.BlockResponse{ 116 | Ok: true, 117 | Data: data, 118 | }, nil 119 | } 120 | 121 | func (h *handler) PutBlock(ctx context.Context, req *models.PutBlockRequest) (*models.PutResponse, error) { 122 | for i, ref := range req.Refs { 123 | err := h.handle.PutBlock(ctx, torus.BlockFromProto(ref), req.Blocks[i]) 124 | if err != nil { 125 | return nil, err 126 | } 127 | } 128 | return &models.PutResponse{Ok: true}, nil 129 | } 130 | 131 | func (h *handler) RebalanceCheck(ctx context.Context, req *models.RebalanceCheckRequest) (*models.RebalanceCheckResponse, error) { 132 | check := make([]torus.BlockRef, len(req.BlockRefs)) 133 | for i, x := range req.BlockRefs { 134 | check[i] = torus.BlockFromProto(x) 135 | } 136 | out, err := h.handle.RebalanceCheck(ctx, check) 137 | if err != nil { 138 | return nil, err 139 | } 140 | return &models.RebalanceCheckResponse{ 141 | Valid: out, 142 | }, nil 143 | } 144 | 145 | func (h *handler) Close() error { 146 | h.grpc.Stop() 147 | return nil 148 | } 149 | -------------------------------------------------------------------------------- /distributor/protocols/protocols.go: -------------------------------------------------------------------------------- 1 | // protocols is the metapackage for the RPC protocols for how Torus' storage 2 | // layer communicates with other storage servers. 
3 | package protocols 4 | 5 | import ( 6 | "fmt" 7 | "net/url" 8 | "time" 9 | 10 | "golang.org/x/net/context" 11 | 12 | "github.com/coreos/torus" 13 | ) 14 | 15 | type RPC interface { 16 | PutBlock(ctx context.Context, ref torus.BlockRef, data []byte) error 17 | Block(ctx context.Context, ref torus.BlockRef) ([]byte, error) 18 | RebalanceCheck(ctx context.Context, refs []torus.BlockRef) ([]bool, error) 19 | Close() error 20 | 21 | // This is a little bit of a hack to avoid more allocations. 22 | WriteBuf(ctx context.Context, ref torus.BlockRef) ([]byte, error) 23 | } 24 | 25 | type RPCServer interface { 26 | Close() error 27 | } 28 | 29 | type RPCDialerFunc func(*url.URL, time.Duration, torus.GlobalMetadata) (RPC, error) 30 | type RPCListenerFunc func(*url.URL, RPC, torus.GlobalMetadata) (RPCServer, error) 31 | 32 | var rpcDialers map[string]RPCDialerFunc 33 | var rpcListeners map[string]RPCListenerFunc 34 | 35 | func RegisterRPCListener(scheme string, newFunc RPCListenerFunc) { 36 | if rpcListeners == nil { 37 | rpcListeners = make(map[string]RPCListenerFunc) 38 | } 39 | 40 | if _, ok := rpcListeners[scheme]; ok { 41 | panic("torus: attempted to register RPC Listener with scheme " + scheme + " twice") 42 | } 43 | 44 | rpcListeners[scheme] = newFunc 45 | } 46 | 47 | func ListenRPC(url *url.URL, handler RPC, gmd torus.GlobalMetadata) (RPCServer, error) { 48 | if rpcListeners[url.Scheme] == nil { 49 | return nil, fmt.Errorf("Unknown ListenRPC protocol '%s'", url.Scheme) 50 | } 51 | 52 | return rpcListeners[url.Scheme](url, handler, gmd) 53 | } 54 | 55 | func RegisterRPCDialer(scheme string, newFunc RPCDialerFunc) { 56 | if rpcDialers == nil { 57 | rpcDialers = make(map[string]RPCDialerFunc) 58 | } 59 | 60 | if _, ok := rpcDialers[scheme]; ok { 61 | panic("torus: attempted to register RPC Listener with scheme " + scheme + " twice") 62 | } 63 | 64 | rpcDialers[scheme] = newFunc 65 | } 66 | 67 | func DialRPC(url *url.URL, timeout time.Duration, gmd torus.GlobalMetadata) 
(RPC, error) { 68 | if rpcDialers[url.Scheme] == nil { 69 | return nil, fmt.Errorf("Unknown DialRPC protocol '%s'", url.Scheme) 70 | } 71 | 72 | return rpcDialers[url.Scheme](url, timeout, gmd) 73 | } 74 | -------------------------------------------------------------------------------- /distributor/protocols/tdp/bitset.go: -------------------------------------------------------------------------------- 1 | package tdp 2 | 3 | type bitset []byte 4 | 5 | func (b bitset) set(i int) bitset { 6 | off := i / 8 7 | for len(b) <= off { 8 | b = append(b, 0x00) 9 | } 10 | b[off] = b[off] | (0x01 << uint(i%8)) 11 | return b 12 | } 13 | 14 | func (b bitset) unset(i int) bitset { 15 | off := i / 8 16 | for len(b) <= off { 17 | b = append(b, 0x00) 18 | } 19 | b[off] = b[off] &^ (0x01 << uint(i%8)) 20 | return b 21 | } 22 | 23 | func (b bitset) test(i int) bool { 24 | off := i / 8 25 | if len(b) <= off { 26 | return false 27 | } 28 | return (b[off] & (0x01 << uint(i%8))) != 0 29 | } 30 | 31 | func bitsetFromBool(in []bool) bitset { 32 | b := bitset{} 33 | for i := 0; i < len(in); i++ { 34 | if in[i] { 35 | b = b.set(i) 36 | } else { 37 | b = b.unset(i) 38 | } 39 | } 40 | return b 41 | } 42 | 43 | func (b bitset) toBool(len int) []bool { 44 | out := make([]bool, len) 45 | for i := 0; i < len; i++ { 46 | s := b.test(i) 47 | out[i] = s 48 | } 49 | return out 50 | } 51 | -------------------------------------------------------------------------------- /distributor/protocols/tdp/bitset_test.go: -------------------------------------------------------------------------------- 1 | package tdp 2 | 3 | import ( 4 | "math/rand" 5 | "testing" 6 | ) 7 | 8 | func TestZeroLen(t *testing.T) { 9 | b := bitset{} 10 | out := b.toBool(3) 11 | if len(out) != 3 { 12 | t.Fatal("wrong length") 13 | } 14 | for _, x := range out { 15 | if x { 16 | t.Fatal("true value") 17 | } 18 | } 19 | } 20 | 21 | func TestSetOne(t *testing.T) { 22 | b := bitset{} 23 | b = b.set(10) 24 | out := b.toBool(11) 25 | if 
out[0] || !out[10] { 26 | t.Fatal("mismatch") 27 | } 28 | } 29 | 30 | func TestSetRandom(t *testing.T) { 31 | length := 1000 32 | truth := make([]bool, 1000) 33 | for i := 0; i < length; i++ { 34 | t := rand.Int()%2 == 1 35 | truth[i] = t 36 | } 37 | retruth := bitsetFromBool(truth) 38 | out := retruth.toBool(length) 39 | if len(out) != len(truth) { 40 | t.Fatal("wrong length") 41 | } 42 | for i := 0; i < len(out); i++ { 43 | if truth[i] != out[i] { 44 | t.Fatal("mismatch") 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /distributor/protocols/tdp/server.go: -------------------------------------------------------------------------------- 1 | package tdp 2 | 3 | import ( 4 | "net" 5 | "net/url" 6 | "strings" 7 | "time" 8 | 9 | "github.com/coreos/torus" 10 | "github.com/coreos/torus/distributor/protocols" 11 | ) 12 | 13 | const defaultPort = "40000" 14 | 15 | func init() { 16 | protocols.RegisterRPCListener("tdp", tdpRPCListener) 17 | protocols.RegisterRPCDialer("tdp", tdpRPCDialer) 18 | } 19 | 20 | func tdpRPCListener(url *url.URL, handler protocols.RPC, gmd torus.GlobalMetadata) (protocols.RPCServer, error) { 21 | if strings.Contains(url.Host, ":") { 22 | return Serve(url.Host, handler, gmd.BlockSize) 23 | } 24 | return Serve(net.JoinHostPort(url.Host, defaultPort), handler, gmd.BlockSize) 25 | } 26 | 27 | func tdpRPCDialer(url *url.URL, timeout time.Duration, gmd torus.GlobalMetadata) (protocols.RPC, error) { 28 | if strings.Contains(url.Host, ":") { 29 | return Dial(url.Host, timeout, gmd.BlockSize) 30 | } 31 | return Dial(net.JoinHostPort(url.Host, defaultPort), timeout, gmd.BlockSize) 32 | } 33 | -------------------------------------------------------------------------------- /distributor/rebalance.go: -------------------------------------------------------------------------------- 1 | package distributor 2 | 3 | import ( 4 | "io" 5 | "math/rand" 6 | "time" 7 | 8 | "github.com/coreos/torus" 9 | 
"github.com/coreos/torus/models" 10 | ) 11 | 12 | // Goroutine which watches for new rings and kicks off 13 | // the rebalance dance. 14 | func (d *Distributor) ringWatcher(closer chan struct{}) { 15 | ch := make(chan torus.Ring) 16 | d.srv.MDS.SubscribeNewRings(ch) 17 | exit: 18 | for { 19 | select { 20 | case <-closer: 21 | d.srv.MDS.UnsubscribeNewRings(ch) 22 | close(ch) 23 | break exit 24 | case newring, ok := <-ch: 25 | if ok { 26 | if newring.Version() == d.ring.Version() { 27 | // No problem. We're seeing the same ring. 28 | continue 29 | } 30 | if newring.Version() < d.ring.Version() { 31 | panic("replacing old ring with ring in the past!") 32 | } 33 | d.mut.Lock() 34 | d.ring = newring 35 | d.mut.Unlock() 36 | } else { 37 | break exit 38 | } 39 | } 40 | } 41 | } 42 | 43 | func (d *Distributor) rebalanceTicker(closer chan struct{}) { 44 | n := 0 45 | total := 0 46 | time.Sleep(time.Duration(250+rand.Intn(250)) * time.Millisecond) 47 | exit: 48 | for { 49 | clog.Tracef("starting rebalance/gc cycle") 50 | volset, _, err := d.srv.MDS.GetVolumes() 51 | if err != nil { 52 | clog.Error(err) 53 | } 54 | for _, x := range volset { 55 | err := d.rebalancer.PrepVolume(x) 56 | if err != nil { 57 | clog.Errorf("gc prep for %s failed: %s", x.Name, err) 58 | } 59 | } 60 | ratelimit: 61 | for { 62 | timeout := 2 * time.Duration(n+1) * time.Millisecond 63 | select { 64 | case <-closer: 65 | break exit 66 | case <-time.After(timeout): 67 | written, err := d.rebalancer.Tick() 68 | if d.ring.Version() != d.rebalancer.VersionStart() { 69 | // Something is changed -- we are now rebalancing 70 | d.rebalancing = true 71 | } 72 | info := &models.RebalanceInfo{ 73 | Rebalancing: d.rebalancing, 74 | } 75 | total += written 76 | info.LastRebalanceBlocks = uint64(total) 77 | if err == io.EOF { 78 | // Good job, sleep well, I'll most likely rebalance you in the morning. 
79 | info.LastRebalanceFinish = time.Now().UnixNano() 80 | total = 0 81 | finishver := d.rebalancer.VersionStart() 82 | if finishver == d.ring.Version() { 83 | d.rebalancing = false 84 | info.Rebalancing = false 85 | } 86 | d.srv.UpdateRebalanceInfo(info) 87 | break ratelimit 88 | } else if err != nil { 89 | // This is usually really bad 90 | clog.Error(err) 91 | } 92 | n = written 93 | d.srv.UpdateRebalanceInfo(info) 94 | } 95 | } 96 | time.Sleep(time.Duration(rand.Intn(3)) * time.Second) 97 | d.rebalancer.Reset() 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /distributor/rebalance/rebalancer.go: -------------------------------------------------------------------------------- 1 | // rebalance provides the implementation of the rebalancer, which continually 2 | // checks the data stored on a host, knows where data should live, and moves it 3 | // to the appropriate servers. 4 | package rebalance 5 | 6 | import ( 7 | "github.com/coreos/pkg/capnslog" 8 | "github.com/coreos/torus" 9 | "github.com/coreos/torus/gc" 10 | "github.com/coreos/torus/models" 11 | "golang.org/x/net/context" 12 | ) 13 | 14 | var clog = capnslog.NewPackageLogger("github.com/coreos/torus", "rebalance") 15 | 16 | type Ringer interface { 17 | Ring() torus.Ring 18 | UUID() string 19 | } 20 | 21 | type Rebalancer interface { 22 | Tick() (int, error) 23 | VersionStart() int 24 | PrepVolume(*models.Volume) error 25 | Reset() error 26 | } 27 | 28 | type CheckAndSender interface { 29 | Check(ctx context.Context, peer string, refs []torus.BlockRef) ([]bool, error) 30 | PutBlock(ctx context.Context, peer string, ref torus.BlockRef, data []byte) error 31 | } 32 | 33 | func NewRebalancer(r Ringer, bs torus.BlockStore, cs CheckAndSender, gc gc.GC) Rebalancer { 34 | return &rebalancer{ 35 | r: r, 36 | bs: bs, 37 | cs: cs, 38 | gc: gc, 39 | } 40 | } 41 | 42 | type rebalancer struct { 43 | r Ringer 44 | bs torus.BlockStore 45 | cs CheckAndSender 46 | it 
torus.BlockIterator 47 | gc gc.GC 48 | ring torus.Ring 49 | } 50 | 51 | func (r *rebalancer) VersionStart() int { 52 | if r.ring == nil { 53 | return r.r.Ring().Version() 54 | } 55 | return r.ring.Version() 56 | } 57 | 58 | func (r *rebalancer) PrepVolume(vol *models.Volume) error { 59 | return r.gc.PrepVolume(vol) 60 | } 61 | 62 | func (r *rebalancer) Reset() error { 63 | if r.it != nil { 64 | r.it.Close() 65 | r.it = nil 66 | } 67 | r.gc.Clear() 68 | return nil 69 | } 70 | -------------------------------------------------------------------------------- /distributor/rebalance/tick.go: -------------------------------------------------------------------------------- 1 | package rebalance 2 | 3 | import ( 4 | "io" 5 | "time" 6 | 7 | "golang.org/x/net/context" 8 | 9 | "github.com/coreos/pkg/capnslog" 10 | "github.com/coreos/torus" 11 | ) 12 | 13 | const maxIters = 50 14 | 15 | var rebalanceTimeout = 5 * time.Second 16 | 17 | func (r *rebalancer) Tick() (int, error) { 18 | if r.it == nil { 19 | r.it = r.bs.BlockIterator() 20 | r.ring = r.r.Ring() 21 | } 22 | m := make(map[string][]torus.BlockRef) 23 | toDelete := make(map[torus.BlockRef]bool) 24 | dead := make(map[torus.BlockRef]bool) 25 | itDone := false 26 | 27 | for i := 0; i < maxIters; i++ { 28 | var ref torus.BlockRef 29 | ok := r.it.Next() 30 | if !ok { 31 | err := r.it.Err() 32 | if err != nil { 33 | return 0, err 34 | } 35 | itDone = true 36 | break 37 | } 38 | ref = r.it.BlockRef() 39 | if r.gc.IsDead(ref) { 40 | dead[ref] = true 41 | continue 42 | } 43 | perm, err := r.ring.GetPeers(ref) 44 | if err != nil { 45 | return 0, err 46 | } 47 | desired := torus.PeerList(perm.Peers[:perm.Replication]) 48 | myIndex := desired.IndexAt(r.r.UUID()) 49 | for j, p := range desired { 50 | if j == myIndex { 51 | continue 52 | } 53 | m[p] = append(m[p], ref) 54 | } 55 | if myIndex == -1 { 56 | toDelete[ref] = true 57 | } 58 | } 59 | 60 | n := 0 61 | for k, v := range m { 62 | ctx, cancel := 
context.WithTimeout(context.TODO(), rebalanceTimeout) 63 | oks, err := r.cs.Check(ctx, k, v) 64 | cancel() 65 | if err != nil { 66 | for _, blk := range v { 67 | toDelete[blk] = false 68 | } 69 | if err != torus.ErrNoPeer { 70 | clog.Error(err) 71 | } 72 | continue 73 | } 74 | for i, ok := range oks { 75 | if !ok { 76 | data, err := r.bs.GetBlock(context.TODO(), v[i]) 77 | if err != nil { 78 | clog.Warningf("couldn't get local block %s: %v", v[i], err) 79 | continue 80 | } 81 | n++ 82 | ctx, cancel := context.WithTimeout(context.TODO(), rebalanceTimeout) 83 | if torus.BlockLog.LevelAt(capnslog.TRACE) { 84 | torus.BlockLog.Tracef("rebalance: sending block %s to %s", v[i], k) 85 | } 86 | err = r.cs.PutBlock(ctx, k, v[i], data) 87 | cancel() 88 | if err != nil { 89 | // Continue for now 90 | toDelete[v[i]] = false 91 | clog.Errorf("couldn't rebalance block %s: %v", v[i], err) 92 | } 93 | } 94 | } 95 | } 96 | 97 | for k, v := range toDelete { 98 | if v { 99 | if torus.BlockLog.LevelAt(capnslog.TRACE) { 100 | torus.BlockLog.Tracef("rebalance: deleting replicated block %s", k) 101 | } 102 | err := r.bs.DeleteBlock(context.TODO(), k) 103 | if err != nil { 104 | clog.Errorf("couldn't delete replicated local block %s: %v", k, err) 105 | } 106 | } 107 | } 108 | 109 | for k, v := range dead { 110 | if v { 111 | if torus.BlockLog.LevelAt(capnslog.TRACE) { 112 | torus.BlockLog.Tracef("rebalance: deleting dead block %s", k) 113 | } 114 | err := r.bs.DeleteBlock(context.TODO(), k) 115 | if err != nil { 116 | clog.Errorf("couldn't delete dead local block %s: %v", k, err) 117 | } 118 | } 119 | } 120 | err := r.bs.Flush() 121 | if err != nil { 122 | clog.Errorf("Failed to flush: %v", err) 123 | } 124 | 125 | if itDone { 126 | return n, io.EOF 127 | } 128 | return n, nil 129 | } 130 | -------------------------------------------------------------------------------- /distributor/replication.go: -------------------------------------------------------------------------------- 1 | package 
distributor 2 | 3 | import ( 4 | "net/url" 5 | "strings" 6 | 7 | "github.com/coreos/torus" 8 | 9 | // Import all the protocols we understand 10 | _ "github.com/coreos/torus/distributor/protocols/grpc" 11 | _ "github.com/coreos/torus/distributor/protocols/tdp" 12 | ) 13 | 14 | // ListenReplication opens the internal networking port and connects to the cluster 15 | func ListenReplication(s *torus.Server, addr *url.URL) error { 16 | return openReplication(s, addr) 17 | } 18 | 19 | // OpenReplication connects to the cluster without opening the internal networking. 20 | func OpenReplication(s *torus.Server) error { 21 | return openReplication(s, nil) 22 | } 23 | func openReplication(s *torus.Server, addr *url.URL) error { 24 | var err error 25 | if s.ReplicationOpen { 26 | return torus.ErrExists 27 | } 28 | dist, err := newDistributor(s, addr) 29 | if err != nil { 30 | return err 31 | } 32 | s.Blocks = dist 33 | s.INodes = torus.NewINodeStore(dist) 34 | err = s.BeginHeartbeat(addr) 35 | if err != nil { 36 | return err 37 | } 38 | s.ReplicationOpen = true 39 | return nil 40 | } 41 | 42 | func addrToUri(addr string) (*url.URL, error) { 43 | if strings.Contains(addr, "://") { 44 | // Looks like a full uri 45 | return url.Parse(addr) 46 | } 47 | return url.Parse("http://" + addr) 48 | } 49 | -------------------------------------------------------------------------------- /distributor/rpc.go: -------------------------------------------------------------------------------- 1 | package distributor 2 | 3 | import ( 4 | "github.com/coreos/pkg/capnslog" 5 | "github.com/coreos/torus" 6 | "golang.org/x/net/context" 7 | ) 8 | 9 | func (d *Distributor) Block(ctx context.Context, ref torus.BlockRef) ([]byte, error) { 10 | promDistBlockRPCs.Inc() 11 | data, err := d.blocks.GetBlock(ctx, ref) 12 | if err != nil { 13 | promDistBlockRPCFailures.Inc() 14 | clog.Warningf("remote asking for non-existent block: %s", ref) 15 | return nil, torus.ErrBlockUnavailable 16 | } 17 | if 
torus.BlockLog.LevelAt(capnslog.TRACE) { 18 | torus.BlockLog.Tracef("rpc: retrieved block %s", ref) 19 | } 20 | return data, nil 21 | } 22 | 23 | func (d *Distributor) PutBlock(ctx context.Context, ref torus.BlockRef, data []byte) error { 24 | d.mut.RLock() 25 | defer d.mut.RUnlock() 26 | promDistPutBlockRPCs.Inc() 27 | peers, err := d.ring.GetPeers(ref) 28 | if err != nil { 29 | promDistPutBlockRPCFailures.Inc() 30 | return err 31 | } 32 | ok := false 33 | for _, x := range peers.Peers { 34 | if x == d.UUID() { 35 | ok = true 36 | break 37 | } 38 | } 39 | if !ok { 40 | clog.Warningf("trying to write block that doesn't belong to me.") 41 | } 42 | err = d.blocks.WriteBlock(ctx, ref, data) 43 | if err != nil { 44 | return err 45 | } 46 | if torus.BlockLog.LevelAt(capnslog.TRACE) { 47 | torus.BlockLog.Tracef("rpc: saving block %s", ref) 48 | } 49 | return d.Flush() 50 | } 51 | 52 | func (d *Distributor) RebalanceCheck(ctx context.Context, refs []torus.BlockRef) ([]bool, error) { 53 | out := make([]bool, len(refs)) 54 | for i, x := range refs { 55 | ok, err := d.blocks.HasBlock(ctx, x) 56 | if err != nil { 57 | clog.Error(err) 58 | return nil, err 59 | } 60 | out[i] = ok 61 | } 62 | return out, nil 63 | } 64 | -------------------------------------------------------------------------------- /entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | : ${LISTEN_HOST:=127.0.0.1} 5 | : ${PEER_ADDRESS:="http://$LISTEN_HOST:40000"} 6 | : ${LISTEN_HTTP_PORT:=4321} 7 | : ${ETCD_HOST:=127.0.0.1} 8 | : ${ETCD_PORT:=2379} 9 | : ${DEBUG:=0} 10 | : ${STORAGE_SIZE:=2GiB} 11 | : ${AUTO_JOIN:=0} 12 | : ${DEBUG_INIT:=0} 13 | : ${DROP_MOUNT_BIN:=0} 14 | : ${LOG_FLAGS:=""} 15 | 16 | TORUS_FLAGS="" 17 | if [ ${DEBUG} -eq "1" ]; then 18 | TORUS_FLAGS="$TORUS_FLAGS --debug" 19 | fi 20 | 21 | if [ ${AUTO_JOIN} -eq "1" ]; then 22 | TORUS_FLAGS="$TORUS_FLAGS --auto-join" 23 | fi 24 | 25 | if [ ${DEBUG_INIT} -eq "1" ]; then 
26 | TORUS_FLAGS="$TORUS_FLAGS --debug-init" 27 | fi 28 | 29 | if [ ${DROP_MOUNT_BIN} -eq "1" ]; then 30 | mkdir -p /plugin/coreos.com~torus 31 | cp `which torusblk` /plugin/coreos.com~torus/torus 32 | fi 33 | 34 | if [ "${LOG_FLAGS}" != "" ]; then 35 | TORUS_FLAGS="$TORUS_FLAGS --logpkg=${LOG_FLAGS}" 36 | fi 37 | 38 | torusd --etcd $ETCD_HOST:$ETCD_PORT --host $LISTEN_HOST --port $LISTEN_HTTP_PORT --data-dir /data --peer-address $PEER_ADDRESS --size $STORAGE_SIZE $TORUS_FLAGS 39 | -------------------------------------------------------------------------------- /errors.go: -------------------------------------------------------------------------------- 1 | package torus 2 | 3 | import "errors" 4 | 5 | var ( 6 | // ErrBlockUnavailable is returned when a function fails to retrieve a known 7 | // block. 8 | ErrBlockUnavailable = errors.New("torus: block cannot be retrieved") 9 | 10 | // ErrINodeUnavailable is returned when a function fails to retrieve a known 11 | // INode. 12 | ErrINodeUnavailable = errors.New("torus: inode cannot be retrieved") 13 | 14 | // ErrBlockNotExist is returned when a function attempts to manipulate a 15 | // non-existent block. 16 | ErrBlockNotExist = errors.New("torus: block doesn't exist") 17 | 18 | // ErrClosed is returned when a function attempts to manipulate a Store 19 | // that is not currently open. 20 | ErrClosed = errors.New("torus: store is closed") 21 | 22 | // ErrInvalid is a locally invalid operation (such as Close()ing a nil file pointer) 23 | ErrInvalid = errors.New("torus: invalid operation") 24 | 25 | // ErrOutOfSpace is returned when the block storage is out of space. 
26 | ErrOutOfSpace = errors.New("torus: out of space on block store") 27 | 28 | // ErrExists is returned if the entity already exists 29 | ErrExists = errors.New("torus: already exists") 30 | 31 | // ErrNotExist is returned if the entity doesn't already exist 32 | ErrNotExist = errors.New("torus: doesn't exist") 33 | 34 | // ErrAgain is returned if the operation was interrupted. The call was valid, and 35 | // may be tried again. 36 | ErrAgain = errors.New("torus: interrupted, try again") 37 | 38 | // ErrNoGlobalMetadata is returned if the metadata service hasn't been formatted. 39 | ErrNoGlobalMetadata = errors.New("torus: no global metadata available at mds") 40 | 41 | // ErrNonSequentialRing is returned if the ring's internal version number appears to jump. 42 | ErrNonSequentialRing = errors.New("torus: non-sequential ring") 43 | 44 | // ErrNoPeer is returned if the peer can't be found. 45 | ErrNoPeer = errors.New("torus: no such peer") 46 | 47 | // ErrCompareFailed is returned if the CAS operation failed to compare. 48 | ErrCompareFailed = errors.New("torus: compare failed") 49 | 50 | // ErrIsSymlink is returned if we're trying to modify a symlink incorrectly. 51 | ErrIsSymlink = errors.New("torus: is symlink") 52 | 53 | // ErrNotDir is returned if we're trying a directory operation on a non-directory path. 54 | ErrNotDir = errors.New("torus: not a directory") 55 | 56 | // ErrWrongVolumeType is returned if the operation cannot be performed on this type of volume. 57 | ErrWrongVolumeType = errors.New("torus: wrong volume type") 58 | 59 | // ErrNotSupported is returned if the interface doesn't implement the 60 | // requested subfunctionality. 61 | ErrNotSupported = errors.New("torus: not supported") 62 | 63 | // ErrLocked is returned if the resource is locked. 64 | ErrLocked = errors.New("torus: locked") 65 | 66 | // ErrLeaseNotFound is returned if the lease cannot be found. 
67 | ErrLeaseNotFound = errors.New("torus: lease not found") 68 | 69 | // ErrUsage is returned if the command usage is wrong. 70 | ErrUsage = errors.New("torus: wrong command usage") 71 | ) 72 | -------------------------------------------------------------------------------- /file_blackbox_test.go: -------------------------------------------------------------------------------- 1 | package torus_test 2 | 3 | import ( 4 | "bytes" 5 | "crypto/rand" 6 | "io" 7 | "io/ioutil" 8 | "testing" 9 | 10 | "github.com/coreos/torus" 11 | "github.com/coreos/torus/blockset" 12 | "github.com/coreos/torus/models" 13 | 14 | _ "github.com/coreos/torus/metadata/temp" 15 | _ "github.com/coreos/torus/storage" 16 | ) 17 | 18 | func makeTestData(size int) []byte { 19 | out := make([]byte, size) 20 | _, err := rand.Read(out) 21 | if err != nil { 22 | panic(err) 23 | } 24 | return out 25 | } 26 | 27 | func makeFile(name string, t *testing.T) (*torus.Server, *torus.File) { 28 | srv := torus.NewMemoryServer() 29 | vol := &models.Volume{ 30 | Name: name, 31 | Id: 3, 32 | Type: "test", 33 | } 34 | globals := srv.MDS.GlobalMetadata() 35 | bs, err := blockset.CreateBlocksetFromSpec(globals.DefaultBlockSpec, srv.Blocks) 36 | if err != nil { 37 | t.Fatal(err) 38 | } 39 | inode := models.NewEmptyINode() 40 | inode.INode = 1 41 | inode.Volume = vol.Id 42 | inode.Blocks, err = torus.MarshalBlocksetToProto(bs) 43 | f, err := srv.CreateFile(vol, inode, bs) 44 | if err != nil { 45 | t.Fatal(err) 46 | } 47 | return srv, f 48 | } 49 | 50 | func newFile(name string, t *testing.T) *torus.File { 51 | _, f := makeFile(name, t) 52 | return f 53 | } 54 | 55 | func TestReadAt(t *testing.T) { 56 | f := newFile("TestReadAt", t) 57 | defer f.Close() 58 | 59 | const data = "hello, world\n" 60 | io.WriteString(f, data) 61 | 62 | b := make([]byte, 5) 63 | n, err := f.ReadAt(b, 7) 64 | if err != nil || n != len(b) { 65 | t.Fatalf("ReadAt 7: %d, %v", n, err) 66 | } 67 | if string(b) != "world" { 68 | t.Fatalf("ReadAt 7: 
have %q want %q", string(b), "world") 69 | } 70 | _, err = f.SyncAllWrites() 71 | if err != nil { 72 | t.Fatalf("error on sync: %v", err) 73 | } 74 | b = make([]byte, 5) 75 | n, err = f.ReadAt(b, 7) 76 | if err != nil || n != len(b) { 77 | t.Fatalf("ReadAt 7: %d, %v", n, err) 78 | } 79 | if string(b) != "world" { 80 | t.Fatalf("ReadAt 7: have %q want %q", string(b), "world") 81 | } 82 | } 83 | 84 | func TestReadAtOffset(t *testing.T) { 85 | f := newFile("TestReadAtOffset", t) 86 | defer f.Close() 87 | 88 | const data = "hello, world\n" 89 | io.WriteString(f, data) 90 | 91 | f.Seek(0, 0) 92 | b := make([]byte, 5) 93 | 94 | n, err := f.ReadAt(b, 7) 95 | if err != nil || n != len(b) { 96 | t.Fatalf("ReadAt 7: %d, %v", n, err) 97 | } 98 | if string(b) != "world" { 99 | t.Fatalf("ReadAt 7: have %q want %q", string(b), "world") 100 | } 101 | 102 | n, err = f.Read(b) 103 | if err != nil || n != len(b) { 104 | t.Fatalf("Read: %d, %v", n, err) 105 | } 106 | if string(b) != "hello" { 107 | t.Fatalf("Read: have %q want %q", string(b), "hello") 108 | } 109 | } 110 | 111 | func TestWriteAt(t *testing.T) { 112 | f := newFile("TestWriteAt", t) 113 | defer f.Close() 114 | 115 | const data = "hello, world\n" 116 | io.WriteString(f, data) 117 | 118 | n, err := f.WriteAt([]byte("WORLD"), 7) 119 | if err != nil || n != 5 { 120 | t.Fatalf("WriteAt 7: %d, %v", n, err) 121 | } 122 | f.Seek(0, 0) 123 | 124 | b, err := ioutil.ReadAll(f) 125 | if err != nil { 126 | t.Fatalf("ReadAll %v: %v", f, err) 127 | } 128 | if string(b) != "hello, WORLD\n" { 129 | t.Fatalf("after write: have %q want %q", string(b), "hello, WORLD\n") 130 | } 131 | } 132 | 133 | func TestWriteAtBulk(t *testing.T) { 134 | f := newFile("TestWriteAt", t) 135 | defer f.Close() 136 | 137 | const data = "hello, world\n" 138 | io.WriteString(f, data) 139 | 140 | _, err := f.SyncAllWrites() 141 | if err != nil { 142 | t.Fatalf("can't sync: %v", err) 143 | } 144 | 145 | // Odd-shape to stress block edges. 
146 | big := makeTestData(549) 147 | n, err := f.WriteAt(big, 7) 148 | if err != nil || n != len(big) { 149 | t.Fatalf("WriteAt 7: %d, %v", n, err) 150 | } 151 | f.Seek(0, 0) 152 | 153 | b, err := ioutil.ReadAll(f) 154 | if err != nil { 155 | t.Fatalf("ReadAll %v: %v", f, err) 156 | } 157 | if string(b[:6]) != "hello," { 158 | t.Fatalf("after write: have %q want %q", string(b), "hello,") 159 | } 160 | if !bytes.Equal(b[7:], big) { 161 | t.Fatal("byte strings aren't equal") 162 | } 163 | } 164 | -------------------------------------------------------------------------------- /file_cache.go: -------------------------------------------------------------------------------- 1 | package torus 2 | 3 | import ( 4 | "time" 5 | 6 | "golang.org/x/net/context" 7 | ) 8 | 9 | type fileCache interface { 10 | newINode(ref INodeRef) 11 | writeToBlock(ctx context.Context, i, from, to int, data []byte) (int, error) 12 | getBlock(ctx context.Context, i int) ([]byte, error) 13 | sync(context.Context) error 14 | } 15 | 16 | type singleBlockCache struct { 17 | // half-finished blocks 18 | openIdx int 19 | openData []byte 20 | openWrote bool 21 | 22 | ref INodeRef 23 | 24 | blocks Blockset 25 | 26 | readIdx int 27 | readData []byte 28 | 29 | blkSize uint64 30 | } 31 | 32 | func newSingleBlockCache(bs Blockset, blkSize uint64) *singleBlockCache { 33 | return &singleBlockCache{ 34 | readIdx: -1, 35 | openIdx: -1, 36 | blocks: bs, 37 | blkSize: blkSize, 38 | } 39 | } 40 | 41 | func (sb *singleBlockCache) newINode(ref INodeRef) { 42 | sb.ref = ref 43 | } 44 | 45 | func (sb *singleBlockCache) openBlock(ctx context.Context, i int) error { 46 | if sb.openIdx == i && sb.openData != nil { 47 | return nil 48 | } 49 | if sb.openWrote { 50 | err := sb.sync(ctx) 51 | if err != nil { 52 | return err 53 | } 54 | } 55 | if i == sb.blocks.Length() { 56 | sb.openData = make([]byte, sb.blkSize) 57 | sb.openIdx = i 58 | return nil 59 | } 60 | if i > sb.blocks.Length() { 61 | panic("writing beyond the end of 
a file without calling Truncate") 62 | } 63 | 64 | if sb.readIdx == i { 65 | sb.openIdx = i 66 | sb.openData = sb.readData 67 | sb.readData = nil 68 | sb.readIdx = -1 69 | return nil 70 | } 71 | start := time.Now() 72 | d, err := sb.blocks.GetBlock(ctx, i) 73 | if err != nil { 74 | return err 75 | } 76 | delta := time.Since(start) 77 | promFileBlockRead.Observe(float64(delta.Nanoseconds()) / 1000) 78 | sb.openData = d 79 | sb.openIdx = i 80 | return nil 81 | } 82 | 83 | func (sb *singleBlockCache) writeToBlock(ctx context.Context, i, from, to int, data []byte) (int, error) { 84 | if sb.openIdx != i { 85 | err := sb.openBlock(ctx, i) 86 | if err != nil { 87 | return 0, err 88 | } 89 | } 90 | sb.openWrote = true 91 | if (to - from) != len(data) { 92 | panic("server: different write lengths?") 93 | } 94 | return copy(sb.openData[from:to], data), nil 95 | } 96 | 97 | func (sb *singleBlockCache) sync(ctx context.Context) error { 98 | if !sb.openWrote { 99 | return nil 100 | } 101 | start := time.Now() 102 | err := sb.blocks.PutBlock(ctx, sb.ref, sb.openIdx, sb.openData) 103 | delta := time.Since(start) 104 | promFileBlockWrite.Observe(float64(delta.Nanoseconds()) / 1000) 105 | sb.openWrote = false 106 | return err 107 | } 108 | 109 | func (sb *singleBlockCache) openRead(ctx context.Context, i int) error { 110 | start := time.Now() 111 | d, err := sb.blocks.GetBlock(ctx, i) 112 | if err != nil { 113 | return err 114 | } 115 | delta := time.Since(start) 116 | promFileBlockRead.Observe(float64(delta.Nanoseconds()) / 1000) 117 | sb.readData = d 118 | sb.readIdx = i 119 | return nil 120 | } 121 | 122 | func (sb *singleBlockCache) getBlock(ctx context.Context, i int) ([]byte, error) { 123 | if sb.openIdx == i { 124 | return sb.openData, nil 125 | } 126 | if sb.readIdx != i { 127 | err := sb.openRead(ctx, i) 128 | if err != nil { 129 | return nil, err 130 | } 131 | } 132 | return sb.readData, nil 133 | } 134 | 
-------------------------------------------------------------------------------- /gc/gc.go: -------------------------------------------------------------------------------- 1 | // gc provides the Torus interface for how garbage collection is implemented. 2 | // Volumes implement a garbage collector and the controller in this package runs 3 | // them. 4 | package gc 5 | 6 | import ( 7 | "github.com/coreos/pkg/capnslog" 8 | "github.com/coreos/torus" 9 | "github.com/coreos/torus/models" 10 | "golang.org/x/net/context" 11 | ) 12 | 13 | var clog = capnslog.NewPackageLogger("github.com/coreos/torus", "gc") 14 | 15 | type controller struct { 16 | gcs []GC 17 | } 18 | 19 | type GC interface { 20 | PrepVolume(*models.Volume) error 21 | IsDead(torus.BlockRef) bool 22 | Clear() 23 | } 24 | 25 | type INodeFetcher interface { 26 | GetINode(context.Context, torus.INodeRef) (*models.INode, error) 27 | } 28 | 29 | func NewGCController(srv *torus.Server, inodes INodeFetcher) GC { 30 | var gcs []GC 31 | for k, v := range gcFuncs { 32 | clog.Debugf("creating %s gc", k) 33 | gc, err := v(srv, inodes) 34 | if err != nil { 35 | clog.Errorf("cannot create gc %s: %v", k, err) 36 | continue 37 | } 38 | gcs = append(gcs, gc) 39 | } 40 | return &controller{ 41 | gcs: gcs, 42 | } 43 | } 44 | 45 | func (c *controller) PrepVolume(vol *models.Volume) error { 46 | n := 0 47 | for _, x := range c.gcs { 48 | n++ 49 | err := x.PrepVolume(vol) 50 | if err != nil { 51 | return err 52 | } 53 | } 54 | return nil 55 | } 56 | 57 | func (c *controller) IsDead(ref torus.BlockRef) bool { 58 | for _, x := range c.gcs { 59 | if x.IsDead(ref) { 60 | return true 61 | } 62 | } 63 | return false 64 | } 65 | 66 | func (c *controller) Clear() { 67 | for _, x := range c.gcs { 68 | x.Clear() 69 | } 70 | } 71 | 72 | type CreateGCFunc func(srv *torus.Server, inodes INodeFetcher) (GC, error) 73 | 74 | var gcFuncs map[string]CreateGCFunc 75 | 76 | func RegisterGC(name string, newFunc CreateGCFunc) { 77 | if gcFuncs == nil 
{ 78 | gcFuncs = make(map[string]CreateGCFunc) 79 | } 80 | 81 | if _, ok := gcFuncs[name]; ok { 82 | panic("gc: attempted to register GC " + name + " twice") 83 | } 84 | 85 | gcFuncs[name] = newFunc 86 | } 87 | -------------------------------------------------------------------------------- /gc/null.go: -------------------------------------------------------------------------------- 1 | package gc 2 | 3 | import ( 4 | "github.com/coreos/torus" 5 | "github.com/coreos/torus/models" 6 | ) 7 | 8 | type NullGC struct{} 9 | 10 | func (n *NullGC) PrepVolume(_ *models.Volume) error { return nil } 11 | func (n *NullGC) IsDead(ref torus.BlockRef) bool { return false } 12 | func (n *NullGC) Clear() {} 13 | -------------------------------------------------------------------------------- /glide.yaml: -------------------------------------------------------------------------------- 1 | package: github.com/coreos/torus 2 | import: 3 | - package: github.com/DeanThompson/ginpprof 4 | - package: github.com/RoaringBitmap/roaring 5 | - package: github.com/barakmich/mmap-go 6 | - package: github.com/coreos/etcd 7 | subpackages: 8 | - clientv3 9 | - package: github.com/coreos/go-systemd 10 | subpackages: 11 | - dbus 12 | - unit 13 | - package: github.com/coreos/pkg 14 | subpackages: 15 | - capnslog 16 | - progressutil 17 | - package: github.com/dustin/go-humanize 18 | - package: github.com/gin-gonic/gin 19 | - package: github.com/godbus/dbus 20 | - package: github.com/gogo/protobuf 21 | subpackages: 22 | - gogoproto 23 | - proto 24 | - package: github.com/kardianos/osext 25 | - package: github.com/mdlayher/aoe 26 | - package: github.com/mdlayher/ethernet 27 | - package: github.com/mdlayher/raw 28 | - package: github.com/pborman/uuid 29 | - package: github.com/prometheus/client_golang 30 | subpackages: 31 | - prometheus 32 | - package: github.com/ricochet2200/go-disk-usage 33 | - package: github.com/serialx/hashring 34 | - package: github.com/spf13/cobra 35 | - package: golang.org/x/net 
36 | version: master 37 | repo: https://go.googlesource.com/net 38 | subpackages: 39 | - http2 40 | - lex/httplex 41 | - context 42 | - bpf 43 | - trace 44 | - http2/hpack 45 | - internal/timeseries 46 | - package: google.golang.org/grpc 47 | - package: github.com/coreos/go-tcmu 48 | - package: github.com/lpabon/godbc 49 | -------------------------------------------------------------------------------- /heartbeat.go: -------------------------------------------------------------------------------- 1 | package torus 2 | 3 | import ( 4 | "fmt" 5 | "net" 6 | "net/url" 7 | "time" 8 | 9 | "github.com/coreos/torus/models" 10 | "github.com/prometheus/client_golang/prometheus" 11 | 12 | "golang.org/x/net/context" 13 | ) 14 | 15 | const ( 16 | currentProtocolVersion = 1 17 | minProtocolVersion = 0 18 | 19 | heartbeatTimeout = 1 * time.Second 20 | heartbeatInterval = 5 * time.Second 21 | ) 22 | 23 | var ( 24 | promHeartbeats = prometheus.NewCounter(prometheus.CounterOpts{ 25 | Name: "torus_server_heartbeats", 26 | Help: "Number of times this server has heartbeated to mds", 27 | }) 28 | promServerPeers = prometheus.NewGauge(prometheus.GaugeOpts{ 29 | Name: "torus_server_peers_total", 30 | Help: "Number of peers this server sees", 31 | }) 32 | ) 33 | 34 | func init() { 35 | prometheus.MustRegister(promHeartbeats) 36 | prometheus.MustRegister(promServerPeers) 37 | } 38 | 39 | // BeginHeartbeat spawns a goroutine for heartbeats. Non-blocking. 40 | func (s *Server) BeginHeartbeat(addr *url.URL) error { 41 | if s.heartbeating { 42 | return nil 43 | } 44 | 45 | // Test the cluster's version on startup. 46 | peers := s.UpdatePeerMap() 47 | for uuid, p := range peers { 48 | if p.ProtocolVersion < minProtocolVersion { 49 | // Fail to start. 50 | return fmt.Errorf("cluster too old: peer %s has protocol version %d (minimum is %d, current is %d)", uuid, p.ProtocolVersion, minProtocolVersion, currentProtocolVersion) 51 | } 52 | } 53 | 54 | // Update our data. 
55 | s.peerInfo.ProtocolVersion = currentProtocolVersion 56 | if addr != nil { 57 | ipaddr, port, err := net.SplitHostPort(addr.Host) 58 | if err != nil { 59 | return err 60 | } 61 | ipaddr = autodetectIP(ipaddr) 62 | advertiseURI := *addr 63 | advertiseURI.Host = ipaddr 64 | if port != "" { 65 | advertiseURI.Host = fmt.Sprintf("%s:%s", ipaddr, port) 66 | } 67 | s.peerInfo.Address = advertiseURI.String() 68 | } 69 | var err error 70 | err = s.createOrRenewLease(context.Background()) 71 | if err != nil { 72 | return err 73 | } 74 | s.UpdateRebalanceInfo(&models.RebalanceInfo{}) 75 | ch := make(chan interface{}) 76 | s.closeChans = append(s.closeChans, ch) 77 | go s.heartbeat(ch) 78 | s.heartbeating = true 79 | return nil 80 | } 81 | 82 | func (s *Server) heartbeat(cl chan interface{}) { 83 | for { 84 | s.oneHeartbeat() 85 | select { 86 | case <-cl: 87 | // TODO(barakmich): Clean up. 88 | return 89 | case <-time.After(heartbeatInterval): 90 | clog.Trace("heartbeating again") 91 | } 92 | } 93 | } 94 | 95 | func (s *Server) AddTimeoutCallback(f func(uuid string)) { 96 | s.timeoutCallbacks = append(s.timeoutCallbacks, f) 97 | } 98 | 99 | func (s *Server) oneHeartbeat() { 100 | promHeartbeats.Inc() 101 | 102 | s.mut.Lock() 103 | s.peerInfo.TotalBlocks = s.Blocks.NumBlocks() 104 | s.peerInfo.UsedBlocks = s.Blocks.UsedBlocks() 105 | s.mut.Unlock() 106 | 107 | ctx, cancel := context.WithTimeout(context.Background(), heartbeatTimeout) 108 | defer cancel() 109 | err := s.createOrRenewLease(ctx) 110 | if err != nil { 111 | clog.Warningf("failed to create or renew lease: %s", err) 112 | } 113 | s.infoMut.Lock() 114 | defer s.infoMut.Unlock() 115 | err = s.MDS.WithContext(ctx).RegisterPeer(s.lease, s.peerInfo) 116 | if err != nil { 117 | clog.Warningf("couldn't register heartbeat: %s", err) 118 | } 119 | s.updatePeerMap() 120 | } 121 | 122 | func (s *Server) updatePeerMap() { 123 | ctxget, cancelget := context.WithTimeout(context.Background(), heartbeatTimeout) 124 | defer 
cancelget() 125 | peers, err := s.MDS.WithContext(ctxget).GetPeers() 126 | if err != nil { 127 | clog.Warningf("couldn't update peerlist: %s", err) 128 | return 129 | } 130 | promServerPeers.Set(float64(len(peers))) 131 | 132 | s.mut.Lock() 133 | defer s.mut.Unlock() 134 | 135 | for _, p := range peers { 136 | s.peersMap[p.UUID] = p 137 | } 138 | for k := range s.peersMap { 139 | found := false 140 | for _, p := range peers { 141 | if p.UUID == k { 142 | found = true 143 | break 144 | } 145 | } 146 | if !found { 147 | for _, f := range s.timeoutCallbacks { 148 | f(k) 149 | } 150 | s.peersMap[k].TimedOut = true 151 | } 152 | } 153 | } 154 | 155 | func (s *Server) UpdatePeerMap() map[string]*models.PeerInfo { 156 | s.updatePeerMap() 157 | return s.GetPeerMap() 158 | } 159 | 160 | func (s *Server) UpdateRebalanceInfo(ri *models.RebalanceInfo) { 161 | s.infoMut.Lock() 162 | defer s.infoMut.Unlock() 163 | s.peerInfo.RebalanceInfo = ri 164 | } 165 | 166 | func autodetectIP(ip string) string { 167 | // We can't advertise "all IPs" 168 | if ip != "0.0.0.0" { 169 | return ip 170 | } 171 | addrs, err := net.InterfaceAddrs() 172 | if err != nil { 173 | panic(err) 174 | } 175 | for _, a := range addrs { 176 | if ipnet, ok := a.(*net.IPNet); ok && !ipnet.IP.IsLoopback() { 177 | if ipnet.IP.To4() != nil { 178 | return ipnet.IP.String() 179 | } 180 | } 181 | } 182 | // Just do localhost. 
183 | return "127.0.0.1" 184 | } 185 | -------------------------------------------------------------------------------- /inode.go: -------------------------------------------------------------------------------- 1 | package torus 2 | 3 | import ( 4 | "encoding/binary" 5 | 6 | "github.com/coreos/pkg/capnslog" 7 | "github.com/coreos/torus/models" 8 | "github.com/prometheus/client_golang/prometheus" 9 | "golang.org/x/net/context" 10 | ) 11 | 12 | var ( 13 | // INodes 14 | promINodeRequests = prometheus.NewCounter(prometheus.CounterOpts{ 15 | Name: "torus_distributor_inode_requests_total", 16 | Help: "Total number of inodes requested of the distributor layer", 17 | }) 18 | promINodeFailures = prometheus.NewCounter(prometheus.CounterOpts{ 19 | Name: "torus_distributor_inode_request_failures", 20 | Help: "Number of failed inode requests", 21 | }) 22 | ) 23 | 24 | func init() { 25 | prometheus.MustRegister(promINodeRequests) 26 | prometheus.MustRegister(promINodeFailures) 27 | } 28 | 29 | type INodeStore struct { 30 | bs BlockStore 31 | name string 32 | } 33 | 34 | func NewINodeStore(bs BlockStore) *INodeStore { 35 | return &INodeStore{ 36 | bs: bs, 37 | } 38 | } 39 | 40 | func (b *INodeStore) Flush() error { return b.bs.Flush() } 41 | func (b *INodeStore) Close() error { 42 | return b.bs.Close() 43 | } 44 | 45 | func (b *INodeStore) WriteINode(ctx context.Context, i INodeRef, inode *models.INode) error { 46 | if i.INode == 0 { 47 | panic("Writing zero inode") 48 | } 49 | inodedata, err := inode.Marshal() 50 | if err != nil { 51 | return err 52 | } 53 | buf := make([]byte, b.bs.BlockSize()) 54 | binary.LittleEndian.PutUint32(buf[0:4], uint32(len(inodedata))) 55 | bufoffset := 4 56 | inodeoffset := 0 57 | index := 1 58 | for inodeoffset != len(inodedata) { 59 | if bufoffset == 0 { 60 | buf = make([]byte, b.bs.BlockSize()) 61 | } 62 | written := copy(buf[bufoffset:], inodedata[inodeoffset:]) 63 | inodeoffset += written 64 | ref := BlockRef{ 65 | INodeRef: i, 66 | Index: 
IndexID(index), 67 | } 68 | ref.SetBlockType(TypeINode) 69 | if BlockLog.LevelAt(capnslog.TRACE) { 70 | BlockLog.Tracef("writing inode block: %s", ref) 71 | } 72 | err := b.bs.WriteBlock(ctx, ref, buf) 73 | if err != nil { 74 | return err 75 | } 76 | bufoffset = 0 77 | index++ 78 | } 79 | clog.Tracef("Wrote INode %s", i) 80 | return nil 81 | } 82 | 83 | func (b *INodeStore) GetINode(ctx context.Context, i INodeRef) (*models.INode, error) { 84 | if i.INode == 0 { 85 | panic("Fetching zero inode") 86 | } 87 | promINodeRequests.Inc() 88 | index := 1 89 | ref := BlockRef{ 90 | INodeRef: i, 91 | Index: IndexID(index), 92 | } 93 | ref.SetBlockType(TypeINode) 94 | data, err := b.bs.GetBlock(ctx, ref) 95 | if err != nil { 96 | promINodeFailures.Inc() 97 | return nil, err 98 | } 99 | dlen := binary.LittleEndian.Uint32(data[0:4]) 100 | buf := make([]byte, dlen) 101 | bufoffset := 0 102 | dataoffset := 4 103 | for bufoffset != int(dlen) { 104 | if dataoffset == 0 { 105 | index++ 106 | ref := BlockRef{ 107 | INodeRef: i, 108 | Index: IndexID(index), 109 | } 110 | ref.SetBlockType(TypeINode) 111 | data, err = b.bs.GetBlock(ctx, ref) 112 | if err != nil { 113 | promINodeFailures.Inc() 114 | clog.Errorf("inode: couldn't get inode block: %s -- %s", err, ref) 115 | return nil, err 116 | } 117 | } 118 | written := copy(buf[bufoffset:], data[dataoffset:]) 119 | dataoffset = 0 120 | bufoffset += written 121 | } 122 | out := &models.INode{} 123 | err = out.Unmarshal(buf) 124 | if err != nil { 125 | promINodeFailures.Inc() 126 | clog.Errorf("inode: couldn't unmarshal: %s", err) 127 | return nil, err 128 | } 129 | return out, nil 130 | } 131 | 132 | func (b *INodeStore) DeleteINode(ctx context.Context, i INodeRef) error { 133 | if i.INode == 0 { 134 | panic("Deleting zero inode") 135 | } 136 | ref := BlockRef{ 137 | INodeRef: i, 138 | Index: IndexID(1), 139 | } 140 | ref.SetBlockType(TypeINode) 141 | data, err := b.bs.GetBlock(ctx, ref) 142 | if err != nil { 143 | return err 144 | } 145 
| dlen := binary.LittleEndian.Uint32(data[0:4]) 146 | nblocks := (uint64(dlen) / b.bs.BlockSize()) + 1 147 | for j := uint64(1); j <= nblocks; j++ { 148 | ref := BlockRef{ 149 | INodeRef: i, 150 | Index: IndexID(j), 151 | } 152 | ref.SetBlockType(TypeINode) 153 | err := b.bs.DeleteBlock(ctx, ref) 154 | if err != nil { 155 | return err 156 | } 157 | } 158 | return nil 159 | } 160 | 161 | func (b *INodeStore) INodeIterator() *INodeIterator { 162 | it := b.bs.BlockIterator() 163 | return &INodeIterator{it} 164 | } 165 | 166 | type INodeIterator struct { 167 | it BlockIterator 168 | } 169 | 170 | func (i *INodeIterator) Err() error { return i.it.Err() } 171 | func (i *INodeIterator) Next() bool { 172 | for i.it.Next() { 173 | ref := i.it.BlockRef() 174 | if ref.BlockType() == TypeINode && ref.Index == 1 { 175 | return true 176 | } 177 | } 178 | return false 179 | } 180 | 181 | func (i *INodeIterator) INodeRef() INodeRef { 182 | return i.it.BlockRef().INodeRef 183 | } 184 | 185 | func (i *INodeIterator) Close() error { 186 | return i.it.Close() 187 | } 188 | -------------------------------------------------------------------------------- /internal/http/api.go: -------------------------------------------------------------------------------- 1 | package http 2 | 3 | import ( 4 | "net/http" 5 | 6 | "github.com/DeanThompson/ginpprof" 7 | "github.com/coreos/torus" 8 | "github.com/gin-gonic/gin" 9 | "github.com/prometheus/client_golang/prometheus" 10 | ) 11 | 12 | type Server struct { 13 | router *gin.Engine 14 | dfs *torus.Server 15 | promHandler http.Handler 16 | } 17 | 18 | func NewServer(dfs *torus.Server) *Server { 19 | engine := gin.New() 20 | engine.Use(gin.Recovery()) 21 | s := &Server{ 22 | router: engine, 23 | dfs: dfs, 24 | promHandler: prometheus.Handler(), 25 | } 26 | s.setupRoutes() 27 | return s 28 | } 29 | 30 | func (s *Server) setupRoutes() { 31 | s.router.GET("/metrics", s.prometheus) 32 | ginpprof.Wrapper(s.router) 33 | } 34 | 35 | func (s *Server) 
prometheus(c *gin.Context) { 36 | s.promHandler.ServeHTTP(c.Writer, c.Request) 37 | } 38 | 39 | func ServeHTTP(addr string, srv *torus.Server) error { 40 | return NewServer(srv).router.Run(addr) 41 | } 42 | 43 | func (s *Server) Run(addr string) error { 44 | return s.router.Run(addr) 45 | } 46 | -------------------------------------------------------------------------------- /internal/nbd/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Andreas Klauer 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /internal/nbd/README.md: -------------------------------------------------------------------------------- 1 | nbd 2 | === 3 | 4 | Golang Linux Network Block Device as a local block device in user space. 
5 | -------------------------------------------------------------------------------- /internal/tcmu/commands.go: -------------------------------------------------------------------------------- 1 | package torustcmu 2 | 3 | import ( 4 | "github.com/coreos/go-tcmu" 5 | "github.com/coreos/go-tcmu/scsi" 6 | ) 7 | 8 | func (h *torusHandler) handleSyncCommand(cmd *tcmu.SCSICmd) (tcmu.SCSIResponse, error) { 9 | clog.Debugf("syncing") 10 | err := h.file.Sync() 11 | if err != nil { 12 | clog.Errorf("sync failed: %v", err.Error()) 13 | return cmd.MediumError(), nil 14 | } 15 | return cmd.Ok(), nil 16 | } 17 | 18 | func (h *torusHandler) handleReportDeviceID(cmd *tcmu.SCSICmd) (tcmu.SCSIResponse, error) { 19 | v := h.name 20 | // The SCSI spec only allows lengths representable in one byte (byte 3). We 21 | // also need some overhead; reporting the device came from Torus Therefore, we 22 | // need to truncate the length of the name to something less than 255. Let's 23 | // truncate it to 240. 24 | if len(h.name) > 240 { 25 | v = v[:240] 26 | } 27 | name := []byte("torus:" + v) 28 | data := make([]byte, 4+len(name)) 29 | data[3] = byte(len(name)) 30 | copy(data[4:], name) 31 | n, err := cmd.Write(data) 32 | if err != nil { 33 | clog.Errorf("reportDeviceID failed: %v", err) 34 | return cmd.MediumError(), nil 35 | } 36 | if n < len(data) { 37 | clog.Error("reportDeviceID failed: unable to copy enough data") 38 | return cmd.MediumError(), nil 39 | } 40 | return cmd.Ok(), nil 41 | } 42 | 43 | func (h *torusHandler) handleWrite(cmd *tcmu.SCSICmd) (tcmu.SCSIResponse, error) { 44 | offset := cmd.LBA() * uint64(cmd.Device().Sizes().BlockSize) 45 | length := int(cmd.XferLen() * uint32(cmd.Device().Sizes().BlockSize)) 46 | if cmd.Buf == nil { 47 | cmd.Buf = make([]byte, length) 48 | } 49 | if len(cmd.Buf) < int(length) { 50 | // Realloc; the buffer can be reused. See io.Copy() in the 51 | // stdlib for precedent. 
52 | cmd.Buf = make([]byte, length) 53 | } 54 | n, err := cmd.Read(cmd.Buf[:int(length)]) 55 | if n < length { 56 | clog.Error("write/read failed: unable to copy enough") 57 | return cmd.MediumError(), nil 58 | } 59 | if err != nil { 60 | clog.Errorf("write/read failed: error: %v", err) 61 | return cmd.MediumError(), nil 62 | } 63 | n, err = h.file.WriteAt(cmd.Buf[:length], int64(offset)) 64 | if n < length { 65 | clog.Error("write/write failed: unable to copy enough") 66 | return cmd.MediumError(), nil 67 | } 68 | if err != nil { 69 | clog.Errorf("write/write failed: error: %v", err) 70 | return cmd.MediumError(), nil 71 | } 72 | if cmd.Command() != scsi.Write6 { 73 | cdbinfo := cmd.GetCDB(1) 74 | // Write10/Write12 CDB 1 is defined as follows: 75 | // Bits ---> 76 | // | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | 77 | // | WRPROTECT | DPO | FUA | resvd | FUANV | resvd | 78 | // So 0x08 represents the Force Unit Access bit being set, which, by 79 | // spec, requires a sync and not to return until it's been written. 
80 | if cdbinfo&0x08 != 0 { 81 | // FUA is set 82 | err = h.file.Sync() 83 | if err != nil { 84 | clog.Errorf("sync failed: %v", err) 85 | return cmd.MediumError(), nil 86 | } 87 | } 88 | } 89 | return cmd.Ok(), nil 90 | } 91 | -------------------------------------------------------------------------------- /internal/tcmu/connect.go: -------------------------------------------------------------------------------- 1 | package torustcmu 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/coreos/go-tcmu" 7 | "github.com/coreos/go-tcmu/scsi" 8 | "github.com/coreos/pkg/capnslog" 9 | "github.com/coreos/torus/block" 10 | ) 11 | 12 | const ( 13 | defaultBlockSize = 4 * 1024 14 | devPath = "/dev/torus" 15 | ) 16 | 17 | var clog = capnslog.NewPackageLogger("github.com/coreos/torus", "tcmu") 18 | 19 | func ConnectAndServe(f *block.BlockFile, name string, closer chan bool) error { 20 | wwn := tcmu.NaaWWN{ 21 | // TODO(barakmich): CoreOS OUI here 22 | OUI: "000000", 23 | VendorID: tcmu.GenerateSerial(name), 24 | } 25 | h := &tcmu.SCSIHandler{ 26 | HBA: 30, 27 | LUN: 0, 28 | WWN: wwn, 29 | VolumeName: name, 30 | // 1GiB, 1K 31 | DataSizes: tcmu.DataSizes{ 32 | VolumeSize: int64(f.Size()), 33 | BlockSize: defaultBlockSize, 34 | }, 35 | DevReady: tcmu.MultiThreadedDevReady( 36 | &torusHandler{ 37 | file: f, 38 | name: name, 39 | inq: &tcmu.InquiryInfo{ 40 | VendorID: "CoreOS", 41 | ProductID: "TorusBlk", 42 | ProductRev: "0001", 43 | }, 44 | }, 1), 45 | } 46 | d, err := tcmu.OpenTCMUDevice(devPath, h) 47 | if err != nil { 48 | return err 49 | } 50 | defer d.Close() 51 | fmt.Printf("Attached to %s/%s. Server loop begins ... 
\n", devPath, name) 52 | <-closer 53 | return nil 54 | } 55 | 56 | type torusHandler struct { 57 | file *block.BlockFile 58 | name string 59 | inq *tcmu.InquiryInfo 60 | } 61 | 62 | func (h *torusHandler) HandleCommand(cmd *tcmu.SCSICmd) (tcmu.SCSIResponse, error) { 63 | switch cmd.Command() { 64 | case scsi.Inquiry: 65 | return tcmu.EmulateInquiry(cmd, h.inq) 66 | case scsi.TestUnitReady: 67 | return tcmu.EmulateTestUnitReady(cmd) 68 | case scsi.ServiceActionIn16: 69 | return tcmu.EmulateServiceActionIn(cmd) 70 | case scsi.ModeSense, scsi.ModeSense10: 71 | return tcmu.EmulateModeSense(cmd, true) 72 | case scsi.ModeSelect, scsi.ModeSelect10: 73 | return tcmu.EmulateModeSelect(cmd, true) 74 | case scsi.Read6, scsi.Read10, scsi.Read12, scsi.Read16: 75 | return tcmu.EmulateRead(cmd, h.file) 76 | case scsi.Write6, scsi.Write10, scsi.Write12, scsi.Write16: 77 | return h.handleWrite(cmd) 78 | case scsi.SynchronizeCache, scsi.SynchronizeCache16: 79 | return h.handleSyncCommand(cmd) 80 | case scsi.MaintenanceIn: 81 | return h.handleReportDeviceID(cmd) 82 | default: 83 | clog.Debugf("Ignore unknown SCSI command 0x%x\n", cmd.Command()) 84 | } 85 | return cmd.NotHandled(), nil 86 | } 87 | -------------------------------------------------------------------------------- /local_server.go: -------------------------------------------------------------------------------- 1 | package torus 2 | 3 | import ( 4 | "os" 5 | "path/filepath" 6 | 7 | "github.com/coreos/torus/models" 8 | ) 9 | 10 | func MkdirsFor(dir string) error { 11 | if dir == "" { 12 | return nil 13 | } 14 | err := os.MkdirAll(dir, 0700) 15 | if err != nil { 16 | return err 17 | } 18 | err = os.MkdirAll(filepath.Join(dir, "metadata"), 0700) 19 | if err != nil { 20 | return err 21 | } 22 | err = os.MkdirAll(filepath.Join(dir, "block"), 0700) 23 | if err != nil { 24 | return err 25 | } 26 | return nil 27 | } 28 | 29 | func NewServer(cfg Config, metadataServiceKind, blockStoreKind string) (*Server, error) { 30 | err := 
MkdirsFor(cfg.DataDir) 31 | if err != nil { 32 | return nil, err 33 | } 34 | 35 | mds, err := CreateMetadataService(metadataServiceKind, cfg) 36 | if err != nil { 37 | return nil, err 38 | } 39 | 40 | global := mds.GlobalMetadata() 41 | 42 | blocks, err := CreateBlockStore(blockStoreKind, "current", cfg, global) 43 | if err != nil { 44 | return nil, err 45 | } 46 | return NewServerByImpl(cfg, mds, blocks) 47 | } 48 | 49 | func NewMemoryServer() *Server { 50 | cfg := Config{ 51 | StorageSize: 100 * 1024 * 1024, 52 | } 53 | x, err := NewServer(cfg, "temp", "temp") 54 | if err != nil { 55 | panic(err) 56 | } 57 | return x 58 | } 59 | 60 | func NewServerByImpl(cfg Config, mds MetadataService, blocks BlockStore) (*Server, error) { 61 | return &Server{ 62 | Blocks: blocks, 63 | MDS: mds, 64 | INodes: NewINodeStore(blocks), 65 | peersMap: make(map[string]*models.PeerInfo), 66 | Cfg: cfg, 67 | peerInfo: &models.PeerInfo{ 68 | UUID: mds.UUID(), 69 | }, 70 | }, nil 71 | } 72 | -------------------------------------------------------------------------------- /metadata/common.go: -------------------------------------------------------------------------------- 1 | // metadata is the metapackage for the implementations of the metadata 2 | // interface, for each potential backend. 3 | package metadata 4 | 5 | import ( 6 | "errors" 7 | "io/ioutil" 8 | "os" 9 | "path/filepath" 10 | 11 | "github.com/pborman/uuid" 12 | ) 13 | 14 | func MakeUUID() string { 15 | return uuid.NewUUID().String() 16 | } 17 | 18 | // TODO(barakmich): Make into a JSON file? 19 | // This all should be moved to storage/ because that's where it's really owned. 
20 | func GetUUID(datadir string) (string, error) { 21 | if datadir == "" { 22 | return "", errors.New("given a empty datadir and asked to get it's UUID") 23 | } 24 | 25 | filename := filepath.Join(datadir, "metadata", "uuid") 26 | if _, err := os.Stat(filename); os.IsNotExist(err) { 27 | id := uuid.NewUUID() 28 | fnew, ferr := os.Create(filename) 29 | if ferr != nil { 30 | return "", ferr 31 | } 32 | defer fnew.Close() 33 | _, werr := fnew.WriteString(id.String()) 34 | return id.String(), werr 35 | } 36 | bytes, err := ioutil.ReadFile(filename) 37 | if err != nil { 38 | return "", err 39 | } 40 | return string(bytes), nil 41 | } 42 | -------------------------------------------------------------------------------- /metadata/etcd/debug.go: -------------------------------------------------------------------------------- 1 | package etcd 2 | 3 | import ( 4 | "io" 5 | 6 | etcdv3 "github.com/coreos/etcd/clientv3" 7 | 8 | "github.com/coreos/torus/models" 9 | ) 10 | 11 | func (c *etcdCtx) DumpMetadata(w io.Writer) error { 12 | io.WriteString(w, "## Volumes\n") 13 | resp, err := c.etcd.Client.Get(c.getContext(), MkKey("volumeid"), etcdv3.WithPrefix()) 14 | if err != nil { 15 | return err 16 | } 17 | for _, x := range resp.Kvs { 18 | io.WriteString(w, string(x.Key)+":\n") 19 | v := &models.Volume{} 20 | v.Unmarshal(x.Value) 21 | io.WriteString(w, v.String()) 22 | io.WriteString(w, "\n") 23 | } 24 | io.WriteString(w, "## INodes\n") 25 | resp, err = c.etcd.Client.Get(c.getContext(), MkKey("volumemeta", "inode"), etcdv3.WithPrefix()) 26 | if err != nil { 27 | return err 28 | } 29 | for _, x := range resp.Kvs { 30 | io.WriteString(w, string(x.Key)+":\n") 31 | v := BytesToUint64(x.Value) 32 | io.WriteString(w, Uint64ToHex(v)) 33 | io.WriteString(w, "\n") 34 | } 35 | io.WriteString(w, "## BlockLocks\n") 36 | resp, err = c.etcd.Client.Get(c.getContext(), MkKey("volumemeta", "blocklock"), etcdv3.WithPrefix()) 37 | if err != nil { 38 | return err 39 | } 40 | for _, x := range 
resp.Kvs { 41 | io.WriteString(w, string(x.Key)+":\n") 42 | io.WriteString(w, string(x.Value)) 43 | io.WriteString(w, "\n") 44 | } 45 | return nil 46 | } 47 | -------------------------------------------------------------------------------- /metadata/etcd/global_funcs.go: -------------------------------------------------------------------------------- 1 | package etcd 2 | 3 | import ( 4 | "encoding/json" 5 | 6 | "github.com/coreos/torus" 7 | "github.com/coreos/torus/models" 8 | "github.com/coreos/torus/ring" 9 | 10 | etcdv3 "github.com/coreos/etcd/clientv3" 11 | "golang.org/x/net/context" 12 | ) 13 | 14 | func initEtcdMetadata(cfg torus.Config, gmd torus.GlobalMetadata, ringType torus.RingType) error { 15 | gmdbytes, err := json.Marshal(gmd) 16 | if err != nil { 17 | return err 18 | } 19 | emptyRing, err := ring.CreateRing(&models.Ring{ 20 | Type: uint32(ringType), 21 | Version: 1, 22 | ReplicationFactor: 2, 23 | }) 24 | if err != nil { 25 | return err 26 | } 27 | ringb, err := emptyRing.Marshal() 28 | if err != nil { 29 | return err 30 | } 31 | 32 | client, err := etcdv3.New(etcdv3.Config{Endpoints: []string{cfg.MetadataAddress}, TLS: cfg.TLS}) 33 | if err != nil { 34 | return err 35 | } 36 | defer client.Close() 37 | 38 | txn := client.Txn(context.Background()) 39 | resp, err := txn.If( 40 | etcdv3.Compare(etcdv3.Version(MkKey("meta", "globalmetadata")), "=", 0), 41 | ).Then( 42 | etcdv3.OpPut(MkKey("meta", "volumeminter"), string(Uint64ToBytes(1))), 43 | etcdv3.OpPut(MkKey("meta", "globalmetadata"), string(gmdbytes)), 44 | ).Commit() 45 | if err != nil { 46 | return err 47 | } 48 | if !resp.Succeeded { 49 | return torus.ErrExists 50 | } 51 | _, err = client.Put(context.Background(), MkKey("meta", "the-one-ring"), string(ringb)) 52 | if err != nil { 53 | return err 54 | } 55 | return nil 56 | } 57 | 58 | func wipeEtcdMetadata(cfg torus.Config) error { 59 | client, err := etcdv3.New(etcdv3.Config{Endpoints: []string{cfg.MetadataAddress}, TLS: cfg.TLS}) 60 | if err 
!= nil { 61 | return err 62 | } 63 | defer client.Close() 64 | _, err = client.Delete(context.Background(), MkKey(), etcdv3.WithPrefix()) 65 | if err != nil { 66 | return err 67 | } 68 | return nil 69 | } 70 | 71 | func setRing(cfg torus.Config, r torus.Ring) error { 72 | client, err := etcdv3.New(etcdv3.Config{Endpoints: []string{cfg.MetadataAddress}, TLS: cfg.TLS}) 73 | if err != nil { 74 | return err 75 | } 76 | defer client.Close() 77 | 78 | resp, err := client.Get(context.Background(), MkKey("meta", "the-one-ring")) 79 | if err != nil { 80 | return err 81 | } 82 | if len(resp.Kvs) == 0 { 83 | return torus.ErrNoGlobalMetadata 84 | } 85 | oldr, err := ring.Unmarshal(resp.Kvs[0].Value) 86 | if err != nil { 87 | return err 88 | } 89 | if oldr.Version() != r.Version()-1 { 90 | return torus.ErrNonSequentialRing 91 | } 92 | b, err := r.Marshal() 93 | if err != nil { 94 | return err 95 | } 96 | _, err = client.Put(context.Background(), MkKey("meta", "the-one-ring"), string(b)) 97 | return err 98 | } 99 | -------------------------------------------------------------------------------- /metadata/etcd/helpers.go: -------------------------------------------------------------------------------- 1 | package etcd 2 | 3 | import ( 4 | "bytes" 5 | "encoding/binary" 6 | "fmt" 7 | "path" 8 | ) 9 | 10 | func MkKey(s ...string) string { 11 | s = append([]string{KeyPrefix}, s...) 12 | return path.Join(s...) 
// Uint64ToBytes encodes x as 8 little-endian bytes.
func Uint64ToBytes(x uint64) []byte {
	// Direct byte-order encoding; the previous bytes.Buffer + binary.Write
	// version allocated a buffer and went through reflection for a plain
	// uint64 on a metadata hot path.
	buf := make([]byte, 8)
	binary.LittleEndian.PutUint64(buf, x)
	return buf
}

// BytesToUint64 decodes the first 8 bytes of b as a little-endian uint64,
// panicking (as before) if b is shorter than 8 bytes.
func BytesToUint64(b []byte) uint64 {
	r := bytes.NewReader(b)
	var out uint64
	err := binary.Read(r, binary.LittleEndian, &out)
	if err != nil {
		panic(err)
	}
	return out
}

// Uint64ToHex formats x in lowercase hexadecimal without a 0x prefix.
func Uint64ToHex(x uint64) string {
	return fmt.Sprintf("%x", x)
}
Continuing with current ring") 36 | continue 37 | } 38 | 39 | clog.Infof("got new ring") 40 | if r.Version() == newRing.Version() { 41 | clog.Warningf("Same ring version: %d", r.Version()) 42 | } 43 | e.mut.RLock() 44 | for _, x := range e.ringListeners { 45 | x <- newRing 46 | } 47 | r = newRing 48 | e.mut.RUnlock() 49 | } 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /models/doc.go: -------------------------------------------------------------------------------- 1 | // models is the package containing all the protos used for serializing data for Torus. 2 | package models 3 | -------------------------------------------------------------------------------- /models/extensions.go: -------------------------------------------------------------------------------- 1 | package models 2 | 3 | func NewEmptyINode() *INode { 4 | return &INode{ 5 | Attrs: make(map[string]string), 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /models/generate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | GOGOPROTO_ROOT="${GOPATH}/src" 4 | GOGOPROTO_PATH="${GOGOPROTO_ROOT}:${GOGOPROTO_ROOT}/protobuf" 5 | protoc --gogofaster_out=plugins=grpc:. 
-I=.:"${GOGOPROTO_PATH}" *.proto 6 | -------------------------------------------------------------------------------- /models/rpc.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package models; 4 | 5 | import "github.com/gogo/protobuf/gogoproto/gogo.proto"; 6 | import "torus.proto"; 7 | 8 | option (gogoproto.equal_all) = true; 9 | option (gogoproto.verbose_equal_all) = true; 10 | 11 | 12 | option (gogoproto.unmarshaler_all) = true; 13 | option (gogoproto.marshaler_all) = true; 14 | option (gogoproto.sizer_all) = true; 15 | option (gogoproto.testgen_all) = true; 16 | option (gogoproto.benchgen_all) = true; 17 | option (gogoproto.populate_all) = true; 18 | 19 | service TorusStorage { 20 | rpc Block (BlockRequest) returns (BlockResponse); 21 | rpc PutBlock (PutBlockRequest) returns (PutResponse); 22 | rpc RebalanceCheck (RebalanceCheckRequest) returns (RebalanceCheckResponse); 23 | } 24 | 25 | message BlockRequest { 26 | BlockRef block_ref = 1; 27 | } 28 | 29 | message BlockResponse { 30 | bool ok = 1; 31 | bytes data = 2; 32 | } 33 | 34 | message PutBlockRequest { 35 | repeated BlockRef refs = 1; 36 | repeated bytes blocks = 2; 37 | } 38 | 39 | message PutResponse { 40 | bool ok = 1; 41 | string err = 2; 42 | } 43 | 44 | message RebalanceCheckRequest { 45 | repeated BlockRef block_refs = 1; 46 | } 47 | 48 | message RebalanceCheckResponse { 49 | repeated bool valid = 1; 50 | int32 status = 2; 51 | } 52 | -------------------------------------------------------------------------------- /models/torus.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package models; 4 | 5 | import "github.com/gogo/protobuf/gogoproto/gogo.proto"; 6 | 7 | option (gogoproto.equal_all) = true; 8 | option (gogoproto.verbose_equal_all) = true; 9 | 10 | option (gogoproto.unmarshaler_all) = true; 11 | option (gogoproto.marshaler_all) = true; 12 | option 
(gogoproto.sizer_all) = true; 13 | option (gogoproto.testgen_all) = true; 14 | option (gogoproto.benchgen_all) = true; 15 | option (gogoproto.populate_all) = true; 16 | 17 | message INode { 18 | uint64 volume = 1; 19 | uint64 inode = 2 [(gogoproto.customname) = "INode"]; 20 | uint64 filesize = 4; 21 | map<string, string> attrs = 7; 22 | repeated BlockLayer blocks = 8; 23 | } 24 | 25 | message BlockLayer { 26 | uint32 type = 1; 27 | bytes content = 2; 28 | } 29 | 30 | message Volume { 31 | string name = 1; 32 | uint64 id = 2; 33 | string type = 3; 34 | 35 | // TODO(barakmich): Respect sizes for FILE volumes. 36 | uint64 max_bytes = 4; 37 | } 38 | 39 | message PeerInfo { 40 | string uuid = 1 [(gogoproto.customname) = "UUID"]; 41 | string address = 2; 42 | 43 | int64 last_seen = 3; // In Unix nanoseconds. 44 | uint64 total_blocks = 4; 45 | uint64 used_blocks = 5; 46 | bool timed_out = 6; 47 | 48 | RebalanceInfo rebalance_info = 7; 49 | 50 | // ProtocolVersion is set by each peer to know if we're out of date or if a 51 | // protocol migration has occurred. 52 | uint64 protocol_version = 8; 53 | } 54 | 55 | message RebalanceInfo { 56 | int64 last_rebalance_finish = 1; // In Unix nanoseconds. 
57 | uint64 last_rebalance_blocks = 2; 58 | bool rebalancing = 3; 59 | } 60 | 61 | message Ring { 62 | uint32 type = 1; 63 | uint32 version = 2; 64 | uint32 replication_factor = 3; 65 | repeated PeerInfo peers = 4; 66 | map<string, bytes> attrs = 5; 67 | } 68 | 69 | message BlockRef { 70 | uint64 volume = 1; 71 | uint64 inode = 2 [(gogoproto.customname) = "INode"]; 72 | uint64 block = 3; 73 | } 74 | 75 | message INodeRef { 76 | uint64 volume = 1; 77 | uint64 inode = 2 [(gogoproto.customname) = "INode"]; 78 | } 79 | -------------------------------------------------------------------------------- /ring.go: -------------------------------------------------------------------------------- 1 | package torus 2 | 3 | import ( 4 | "math/big" 5 | 6 | "github.com/coreos/torus/models" 7 | ) 8 | 9 | type RingType int 10 | 11 | type Ring interface { 12 | GetPeers(key BlockRef) (PeerPermutation, error) 13 | Members() PeerList 14 | 15 | Describe() string 16 | Type() RingType 17 | Version() int 18 | 19 | Marshal() ([]byte, error) 20 | } 21 | 22 | type ModifyableRing interface { 23 | ChangeReplication(r int) (Ring, error) 24 | } 25 | 26 | type RingAdder interface { 27 | ModifyableRing 28 | AddPeers(PeerInfoList) (Ring, error) 29 | } 30 | 31 | type RingRemover interface { 32 | ModifyableRing 33 | RemovePeers(PeerList) (Ring, error) 34 | } 35 | 36 | type PeerPermutation struct { 37 | Replication int 38 | Peers PeerList 39 | } 40 | 41 | type PeerList []string 42 | 43 | func (pl PeerList) IndexAt(uuid string) int { 44 | for i, x := range pl { 45 | if x == uuid { 46 | return i 47 | } 48 | } 49 | return -1 50 | } 51 | 52 | func (pl PeerList) Has(uuid string) bool { 53 | return pl.IndexAt(uuid) != -1 54 | } 55 | 56 | func (pl PeerList) AndNot(b PeerList) PeerList { 57 | var out PeerList 58 | for _, x := range pl { 59 | if !b.Has(x) { 60 | out = append(out, x) 61 | } 62 | } 63 | return out 64 | } 65 | 66 | func (pl PeerList) Union(b PeerList) PeerList { 67 | var out PeerList 68 | for _, x := range pl { 69
| out = append(out, x) 70 | } 71 | for _, x := range b { 72 | if !pl.Has(x) { 73 | out = append(out, x) 74 | } 75 | } 76 | return out 77 | } 78 | 79 | func (pl PeerList) Intersect(b PeerList) PeerList { 80 | var out PeerList 81 | for _, x := range pl { 82 | if b.Has(x) { 83 | out = append(out, x) 84 | } 85 | } 86 | return out 87 | } 88 | 89 | // Applicative! Applicative! My kingdom for Applicative! 90 | 91 | type PeerInfoList []*models.PeerInfo 92 | 93 | func (pi PeerInfoList) UUIDAt(uuid string) int { 94 | for i, x := range pi { 95 | if x.UUID == uuid { 96 | return i 97 | } 98 | } 99 | return -1 100 | } 101 | 102 | func (pi PeerInfoList) HasUUID(uuid string) bool { 103 | return pi.UUIDAt(uuid) != -1 104 | } 105 | 106 | func (pi PeerInfoList) AndNot(b PeerList) PeerInfoList { 107 | var out PeerInfoList 108 | for _, x := range pi { 109 | if !b.Has(x.UUID) { 110 | out = append(out, x) 111 | } 112 | } 113 | return out 114 | } 115 | 116 | func (pi PeerInfoList) Union(b PeerInfoList) PeerInfoList { 117 | var out PeerInfoList 118 | for _, x := range pi { 119 | out = append(out, x) 120 | } 121 | for _, x := range b { 122 | if !pi.HasUUID(x.UUID) { 123 | out = append(out, x) 124 | } 125 | } 126 | return out 127 | } 128 | 129 | func (pi PeerInfoList) Intersect(b PeerInfoList) PeerInfoList { 130 | var out PeerInfoList 131 | for _, x := range pi { 132 | if b.HasUUID(x.UUID) { 133 | out = append(out, x) 134 | } 135 | } 136 | return out 137 | } 138 | 139 | func (pi PeerInfoList) PeerList() PeerList { 140 | out := make([]string, len(pi)) 141 | for i, x := range pi { 142 | out[i] = x.UUID 143 | } 144 | return PeerList(out) 145 | } 146 | 147 | func (pi PeerInfoList) GetWeights() map[string]int { 148 | out := make(map[string]int) 149 | if len(pi) == 0 { 150 | return out 151 | } 152 | gcd := big.NewInt(int64(pi[0].TotalBlocks)) 153 | for _, p := range pi[1:] { 154 | gcd.GCD(nil, nil, gcd, big.NewInt(int64(p.TotalBlocks))) 155 | } 156 | for _, p := range pi { 157 | out[p.UUID] = 
int(p.TotalBlocks / uint64(gcd.Int64())) 158 | clog.Infof("%s: weight %d", p.UUID, out[p.UUID]) 159 | } 160 | return out 161 | } 162 | -------------------------------------------------------------------------------- /ring/empty.go: -------------------------------------------------------------------------------- 1 | package ring 2 | 3 | import ( 4 | "github.com/coreos/torus" 5 | "github.com/coreos/torus/models" 6 | ) 7 | 8 | type empty struct { 9 | version int 10 | } 11 | 12 | func init() { 13 | registerRing(Empty, "empty", makeEmpty) 14 | } 15 | 16 | func makeEmpty(r *models.Ring) (torus.Ring, error) { 17 | return &empty{ 18 | version: int(r.Version), 19 | }, nil 20 | } 21 | 22 | func (e *empty) GetPeers(key torus.BlockRef) (torus.PeerPermutation, error) { 23 | return torus.PeerPermutation{ 24 | Peers: []string{}, 25 | Replication: 0, 26 | }, nil 27 | } 28 | 29 | func (e *empty) Members() torus.PeerList { return []string{} } 30 | 31 | func (e *empty) Describe() string { 32 | return "Ring: Empty" 33 | } 34 | func (e *empty) Type() torus.RingType { return Empty } 35 | func (e *empty) Version() int { return e.version } 36 | 37 | func (e *empty) Marshal() ([]byte, error) { 38 | var out models.Ring 39 | 40 | out.Version = uint32(e.version) 41 | out.Type = uint32(e.Type()) 42 | return out.Marshal() 43 | } 44 | -------------------------------------------------------------------------------- /ring/ketama.go: -------------------------------------------------------------------------------- 1 | package ring 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "reflect" 7 | 8 | "github.com/coreos/torus" 9 | "github.com/coreos/torus/models" 10 | 11 | "github.com/serialx/hashring" 12 | ) 13 | 14 | type ketama struct { 15 | version int 16 | rep int 17 | peers torus.PeerInfoList 18 | ring *hashring.HashRing 19 | } 20 | 21 | func init() { 22 | registerRing(Ketama, "ketama", makeKetama) 23 | } 24 | 25 | func makeKetama(r *models.Ring) (torus.Ring, error) { 26 | rep := 
int(r.ReplicationFactor) 27 | if rep == 0 { 28 | rep = 1 29 | } 30 | pi := torus.PeerInfoList(r.Peers) 31 | if rep > len(pi) { 32 | clog.Noticef("Using ring that requests replication level %d, but has only %d peers. Add nodes to match replication.", rep, len(pi)) 33 | } 34 | return &ketama{ 35 | version: int(r.Version), 36 | peers: pi, 37 | rep: rep, 38 | ring: hashring.NewWithWeights(pi.GetWeights()), 39 | }, nil 40 | } 41 | 42 | func (k *ketama) GetPeers(key torus.BlockRef) (torus.PeerPermutation, error) { 43 | s, ok := k.ring.GetNodes(string(key.ToBytes()), len(k.peers)) 44 | if !ok { 45 | if len(s) == 0 { 46 | return torus.PeerPermutation{}, errors.New("couldn't get any nodes") 47 | } 48 | for _, x := range k.peers { 49 | has := false 50 | for _, y := range s { 51 | if y == x.UUID { 52 | has = true 53 | break 54 | } 55 | } 56 | if !has { 57 | s = append(s, x.UUID) 58 | } 59 | } 60 | } 61 | 62 | if len(s) != len(k.peers) { 63 | return torus.PeerPermutation{}, errors.New("couldn't get sufficient nodes") 64 | } 65 | 66 | rep := k.rep 67 | if len(k.peers) < k.rep { 68 | rep = len(k.peers) 69 | } 70 | 71 | return torus.PeerPermutation{ 72 | Peers: s, 73 | Replication: rep, 74 | }, nil 75 | } 76 | 77 | func (k *ketama) Members() torus.PeerList { return k.peers.PeerList() } 78 | 79 | func (k *ketama) Describe() string { 80 | s := fmt.Sprintf("Ring: Ketama\nReplication:%d\nPeers:", k.rep) 81 | for _, x := range k.peers { 82 | s += fmt.Sprintf("\n\t%s", x) 83 | } 84 | return s 85 | } 86 | func (k *ketama) Type() torus.RingType { return Ketama } 87 | func (k *ketama) Version() int { return k.version } 88 | 89 | func (k *ketama) Marshal() ([]byte, error) { 90 | var out models.Ring 91 | 92 | out.Version = uint32(k.version) 93 | out.ReplicationFactor = uint32(k.rep) 94 | out.Type = uint32(k.Type()) 95 | out.Peers = k.peers 96 | return out.Marshal() 97 | } 98 | 99 | func (k *ketama) AddPeers(peers torus.PeerInfoList) (torus.Ring, error) { 100 | newPeers := 
k.peers.Union(peers) 101 | if reflect.DeepEqual(newPeers.PeerList(), k.peers.PeerList()) { 102 | return nil, torus.ErrExists 103 | } 104 | newk := &ketama{ 105 | version: k.version + 1, 106 | rep: k.rep, 107 | peers: newPeers, 108 | ring: hashring.NewWithWeights(newPeers.GetWeights()), 109 | } 110 | return newk, nil 111 | } 112 | 113 | func (k *ketama) RemovePeers(pl torus.PeerList) (torus.Ring, error) { 114 | newPeers := k.peers.AndNot(pl) 115 | if len(newPeers) == len(k.Members()) { 116 | return nil, torus.ErrNotExist 117 | } 118 | 119 | newk := &ketama{ 120 | version: k.version + 1, 121 | rep: k.rep, 122 | peers: newPeers, 123 | ring: hashring.NewWithWeights(newPeers.GetWeights()), 124 | } 125 | return newk, nil 126 | } 127 | 128 | func (k *ketama) ChangeReplication(r int) (torus.Ring, error) { 129 | newk := &ketama{ 130 | version: k.version + 1, 131 | rep: r, 132 | peers: k.peers, 133 | ring: k.ring, 134 | } 135 | return newk, nil 136 | } 137 | -------------------------------------------------------------------------------- /ring/ketama_test.go: -------------------------------------------------------------------------------- 1 | package ring 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/coreos/torus" 7 | "github.com/coreos/torus/models" 8 | "github.com/serialx/hashring" 9 | ) 10 | 11 | func TestTinyPeer(t *testing.T) { 12 | pi := torus.PeerInfoList{ 13 | &models.PeerInfo{ 14 | UUID: "a", 15 | TotalBlocks: 20 * 1024 * 1024 * 2, 16 | }, 17 | &models.PeerInfo{ 18 | UUID: "b", 19 | TotalBlocks: 20 * 1024 * 1024 * 2, 20 | }, 21 | &models.PeerInfo{ 22 | UUID: "c", 23 | TotalBlocks: 100 * 1024 * 2, 24 | }, 25 | } 26 | k := &ketama{ 27 | version: 1, 28 | peers: pi, 29 | rep: 2, 30 | ring: hashring.NewWithWeights(pi.GetWeights()), 31 | } 32 | l, err := k.GetPeers(torus.BlockRef{ 33 | INodeRef: torus.NewINodeRef(3, 4), 34 | Index: 5, 35 | }) 36 | if err != nil { 37 | t.Fatal(err) 38 | } 39 | t.Log(l.Peers) 40 | } 41 | 
-------------------------------------------------------------------------------- /ring/mod.go: -------------------------------------------------------------------------------- 1 | package ring 2 | 3 | import ( 4 | "fmt" 5 | "hash/crc32" 6 | "reflect" 7 | "sort" 8 | 9 | "github.com/coreos/torus" 10 | "github.com/coreos/torus/models" 11 | ) 12 | 13 | type mod struct { 14 | version int 15 | rep int 16 | peers torus.PeerInfoList 17 | } 18 | 19 | func init() { 20 | registerRing(Mod, "mod", makeMod) 21 | } 22 | 23 | func makeMod(r *models.Ring) (torus.Ring, error) { 24 | rep := int(r.ReplicationFactor) 25 | if rep == 0 { 26 | rep = 1 27 | } 28 | pil := torus.PeerInfoList(r.Peers) 29 | if rep > len(pil) { 30 | clog.Noticef("Requested replication level %d, but has only %d peers. Add nodes to match replication.", rep, len(pil)) 31 | } 32 | return &mod{ 33 | version: int(r.Version), 34 | peers: pil, 35 | rep: rep, 36 | }, nil 37 | } 38 | 39 | func (m *mod) GetPeers(key torus.BlockRef) (torus.PeerPermutation, error) { 40 | peerlist := sort.StringSlice([]string(m.peers.PeerList())) 41 | if len(peerlist) == 0 { 42 | return torus.PeerPermutation{}, fmt.Errorf("couldn't get any nodes") 43 | } 44 | if len(peerlist) != len(m.peers) { 45 | return torus.PeerPermutation{}, fmt.Errorf("couldn't get sufficient nodes") 46 | } 47 | permute := make([]string, len(peerlist)) 48 | crc := crc32.ChecksumIEEE(key.ToBytes()) 49 | sum := int(crc) % len(m.peers) 50 | copy(permute, peerlist[sum:]) 51 | copy(permute[len(peerlist)-sum:], peerlist[:sum]) 52 | rep := m.rep 53 | if len(m.peers) < m.rep { 54 | rep = len(m.peers) 55 | } 56 | return torus.PeerPermutation{ 57 | Peers: permute, 58 | Replication: rep, 59 | }, nil 60 | } 61 | 62 | func (m *mod) Members() torus.PeerList { return m.peers.PeerList() } 63 | 64 | func (m *mod) Describe() string { 65 | s := fmt.Sprintf("Ring: Mod\nReplication:%d\nPeers:", m.rep) 66 | for _, x := range m.peers { 67 | s += fmt.Sprintf("\n\t%s", x) 68 | } 69 | return s 
70 | } 71 | func (m *mod) Type() torus.RingType { return Mod } 72 | func (m *mod) Version() int { return m.version } 73 | 74 | func (m *mod) Marshal() ([]byte, error) { 75 | var out models.Ring 76 | 77 | out.Version = uint32(m.version) 78 | out.ReplicationFactor = uint32(m.rep) 79 | out.Type = uint32(m.Type()) 80 | out.Peers = m.peers 81 | return out.Marshal() 82 | } 83 | 84 | func (m *mod) AddPeers(peers torus.PeerInfoList) (torus.Ring, error) { 85 | newPeers := m.peers.Union(peers) 86 | if reflect.DeepEqual(newPeers.PeerList(), m.peers.PeerList()) { 87 | return nil, torus.ErrExists 88 | } 89 | newm := &mod{ 90 | version: m.version + 1, 91 | rep: m.rep, 92 | peers: newPeers, 93 | } 94 | return newm, nil 95 | } 96 | 97 | func (m *mod) RemovePeers(pl torus.PeerList) (torus.Ring, error) { 98 | newPeers := m.peers.AndNot(pl) 99 | if len(newPeers) == len(m.peers) { 100 | return nil, torus.ErrNotExist 101 | } 102 | 103 | newm := &mod{ 104 | version: m.version + 1, 105 | rep: m.rep, 106 | peers: newPeers, 107 | } 108 | return newm, nil 109 | } 110 | 111 | func (m *mod) ChangeReplication(r int) (torus.Ring, error) { 112 | newm := &mod{ 113 | version: m.version + 1, 114 | rep: r, 115 | peers: m.peers, 116 | } 117 | return newm, nil 118 | } 119 | -------------------------------------------------------------------------------- /ring/ring_main.go: -------------------------------------------------------------------------------- 1 | // ring is the package containing implementations of the consistent hash ring, a 2 | // pure function which provides a permutation of peers where a block can live, 3 | // known by all members of the cluster. 
4 | package ring 5 | 6 | import ( 7 | "github.com/coreos/pkg/capnslog" 8 | "github.com/coreos/torus" 9 | "github.com/coreos/torus/models" 10 | ) 11 | 12 | var clog = capnslog.NewPackageLogger("github.com/coreos/torus", "ring") 13 | 14 | const ( 15 | Empty torus.RingType = iota 16 | Single 17 | Mod 18 | Union 19 | Ketama 20 | ) 21 | 22 | func Unmarshal(b []byte) (torus.Ring, error) { 23 | var a models.Ring 24 | err := a.Unmarshal(b) 25 | if err != nil { 26 | return nil, err 27 | } 28 | return CreateRing(&a) 29 | } 30 | 31 | type createRingFunc func(r *models.Ring) (torus.Ring, error) 32 | 33 | var ringRegistry map[torus.RingType]createRingFunc 34 | var ringNames map[string]torus.RingType 35 | 36 | func registerRing(t torus.RingType, name string, newFunc createRingFunc) { 37 | if ringRegistry == nil { 38 | ringRegistry = make(map[torus.RingType]createRingFunc) 39 | } 40 | 41 | if _, ok := ringRegistry[t]; ok { 42 | panic("torus: attempted to register ring type " + string(t) + " twice") 43 | } 44 | 45 | ringRegistry[t] = newFunc 46 | 47 | if ringNames == nil { 48 | ringNames = make(map[string]torus.RingType) 49 | } 50 | 51 | if _, ok := ringNames[name]; ok { 52 | panic("torus: attempted to register ring name " + name + " twice") 53 | } 54 | 55 | ringNames[name] = t 56 | } 57 | 58 | func CreateRing(r *models.Ring) (torus.Ring, error) { 59 | return ringRegistry[torus.RingType(r.Type)](r) 60 | } 61 | 62 | func RingTypeFromString(s string) (torus.RingType, bool) { 63 | v, ok := ringNames[s] 64 | return v, ok 65 | } 66 | -------------------------------------------------------------------------------- /ring/single.go: -------------------------------------------------------------------------------- 1 | package ring 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/coreos/torus" 7 | "github.com/coreos/torus/models" 8 | ) 9 | 10 | type single struct { 11 | version int 12 | peer *models.PeerInfo 13 | permutation torus.PeerPermutation 14 | } 15 | 16 | func init() { 17 | 
registerRing(Single, "single", makeSingle) 18 | } 19 | 20 | func makeSingle(r *models.Ring) (torus.Ring, error) { 21 | if len(r.Peers) != 1 { 22 | return nil, torus.ErrInvalid 23 | } 24 | return &single{ 25 | version: int(r.Version), 26 | peer: r.Peers[0], 27 | permutation: torus.PeerPermutation{ 28 | Peers: []string{r.Peers[0].UUID}, 29 | Replication: 1, 30 | }, 31 | }, nil 32 | } 33 | 34 | func (s *single) GetPeers(key torus.BlockRef) (torus.PeerPermutation, error) { 35 | return s.permutation, nil 36 | } 37 | 38 | func (s *single) Members() torus.PeerList { return []string{s.peer.UUID} } 39 | 40 | func (s *single) Describe() string { 41 | return fmt.Sprintf("Ring: Single\nUUID: %s", s.peer.UUID) 42 | } 43 | func (s *single) Type() torus.RingType { return Single } 44 | func (s *single) Version() int { return s.version } 45 | 46 | func (s *single) Marshal() ([]byte, error) { 47 | var out models.Ring 48 | 49 | out.Version = uint32(s.version) 50 | out.Type = uint32(s.Type()) 51 | out.Peers = []*models.PeerInfo{s.peer} 52 | return out.Marshal() 53 | } 54 | -------------------------------------------------------------------------------- /ring/union.go: -------------------------------------------------------------------------------- 1 | package ring 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | 7 | "github.com/coreos/torus" 8 | "github.com/coreos/torus/models" 9 | ) 10 | 11 | type unionRing struct { 12 | oldRing torus.Ring 13 | newRing torus.Ring 14 | } 15 | 16 | func init() { 17 | registerRing(Union, "union", makeUnion) 18 | } 19 | 20 | func makeUnion(r *models.Ring) (torus.Ring, error) { 21 | var err error 22 | out := &unionRing{} 23 | oldb, ok := r.Attrs["old"] 24 | if !ok { 25 | return nil, errors.New("no old ring in union ring data") 26 | } 27 | out.oldRing, err = Unmarshal(oldb) 28 | if err != nil { 29 | return nil, err 30 | } 31 | newb, ok := r.Attrs["new"] 32 | if !ok { 33 | return nil, errors.New("no new ring in union ring data") 34 | } 35 | out.newRing, err = 
Unmarshal(newb) 36 | if err != nil { 37 | return nil, err 38 | } 39 | return out, nil 40 | } 41 | 42 | func NewUnionRing(oldRing torus.Ring, newRing torus.Ring) torus.Ring { 43 | return &unionRing{ 44 | oldRing: oldRing, 45 | newRing: newRing, 46 | } 47 | } 48 | 49 | func (u *unionRing) GetPeers(key torus.BlockRef) (torus.PeerPermutation, error) { 50 | n, err := u.newRing.GetPeers(key) 51 | if err != nil { 52 | return torus.PeerPermutation{}, err 53 | } 54 | o, err := u.oldRing.GetPeers(key) 55 | if err != nil { 56 | return torus.PeerPermutation{}, err 57 | } 58 | return torus.PeerPermutation{ 59 | Peers: o.Peers.Union(n.Peers), 60 | Replication: n.Replication, 61 | }, nil 62 | } 63 | 64 | func (u *unionRing) Members() torus.PeerList { 65 | return u.newRing.Members().Union(u.oldRing.Members()) 66 | } 67 | 68 | func (u *unionRing) Describe() string { 69 | return fmt.Sprintf( 70 | "Union Ring:\nOld:\n%s\nNew:\n%s", 71 | u.oldRing.Describe(), 72 | u.newRing.Describe(), 73 | ) 74 | } 75 | func (u *unionRing) Type() torus.RingType { 76 | return Union 77 | } 78 | func (u *unionRing) Version() int { 79 | return u.newRing.Version() 80 | } 81 | 82 | func (u *unionRing) Marshal() ([]byte, error) { 83 | var out models.Ring 84 | 85 | out.Version = uint32(u.Version()) 86 | out.Type = uint32(u.Type()) 87 | out.Attrs = make(map[string][]byte) 88 | b, err := u.oldRing.Marshal() 89 | if err != nil { 90 | return nil, err 91 | } 92 | out.Attrs["old"] = b 93 | b, err = u.newRing.Marshal() 94 | if err != nil { 95 | return nil, err 96 | } 97 | out.Attrs["new"] = b 98 | return out.Marshal() 99 | } 100 | -------------------------------------------------------------------------------- /server.go: -------------------------------------------------------------------------------- 1 | package torus 2 | 3 | import ( 4 | "io" 5 | "sync" 6 | 7 | "golang.org/x/net/context" 8 | 9 | "github.com/coreos/torus/models" 10 | "github.com/prometheus/client_golang/prometheus" 11 | ) 12 | 13 | var ( 14 | 
promOps = prometheus.NewCounterVec(prometheus.CounterOpts{ 15 | Name: "torus_server_ops_total", 16 | Help: "Number of operations processed by the server, by kind", 17 | }, []string{"kind"}) 18 | ) 19 | 20 | func init() { 21 | prometheus.MustRegister(promOps) 22 | } 23 | 24 | const ( 25 | CtxWriteLevel int = iota 26 | CtxReadLevel 27 | ) 28 | 29 | // Server is the type representing the generic distributed block store. 30 | type Server struct { 31 | mut sync.RWMutex 32 | infoMut sync.Mutex 33 | Blocks BlockStore 34 | MDS MetadataService 35 | INodes *INodeStore 36 | peersMap map[string]*models.PeerInfo 37 | closeChans []chan interface{} 38 | Cfg Config 39 | peerInfo *models.PeerInfo 40 | ctx context.Context 41 | 42 | lease int64 43 | leaseMut sync.RWMutex 44 | 45 | heartbeating bool 46 | ReplicationOpen bool 47 | timeoutCallbacks []func(string) 48 | } 49 | 50 | func (s *Server) createOrRenewLease(ctx context.Context) error { 51 | s.leaseMut.Lock() 52 | defer s.leaseMut.Unlock() 53 | if s.lease != 0 { 54 | err := s.MDS.WithContext(ctx).RenewLease(s.lease) 55 | if err == nil { 56 | return nil 57 | } 58 | clog.Errorf("Failed to renew, grant new lease for %d: %s", s.lease, err) 59 | } 60 | var err error 61 | s.lease, err = s.MDS.WithContext(ctx).GetLease() 62 | return err 63 | } 64 | 65 | func (s *Server) Lease() int64 { 66 | s.leaseMut.RLock() 67 | defer s.leaseMut.RUnlock() 68 | return s.lease 69 | } 70 | 71 | func (s *Server) Close() error { 72 | for _, c := range s.closeChans { 73 | close(c) 74 | } 75 | err := s.MDS.Close() 76 | if err != nil { 77 | clog.Errorf("couldn't close mds: %s", err) 78 | return err 79 | } 80 | err = s.INodes.Close() 81 | if err != nil { 82 | clog.Errorf("couldn't close inodes: %s", err) 83 | return err 84 | } 85 | err = s.Blocks.Close() 86 | if err != nil { 87 | clog.Errorf("couldn't close blocks: %s", err) 88 | return err 89 | } 90 | return nil 91 | } 92 | 93 | // Debug writes a bunch of debug output to the io.Writer. 
94 | func (s *Server) Debug(w io.Writer) error { 95 | if v, ok := s.MDS.(DebugMetadataService); ok { 96 | io.WriteString(w, "# MDS\n") 97 | return v.DumpMetadata(w) 98 | } 99 | return nil 100 | } 101 | 102 | func (s *Server) getContext() context.Context { 103 | if s.ctx == nil { 104 | s.ctx = s.ExtendContext(context.TODO()) 105 | } 106 | return s.ctx 107 | } 108 | 109 | func (s *Server) ExtendContext(ctx context.Context) context.Context { 110 | wl := context.WithValue(ctx, CtxWriteLevel, s.Cfg.WriteLevel) 111 | rl := context.WithValue(wl, CtxReadLevel, s.Cfg.ReadLevel) 112 | return rl 113 | } 114 | 115 | func (s *Server) GetPeerMap() map[string]*models.PeerInfo { 116 | s.infoMut.Lock() 117 | defer s.infoMut.Unlock() 118 | out := make(map[string]*models.PeerInfo) 119 | for k, v := range s.peersMap { 120 | out[k] = v 121 | } 122 | return out 123 | } 124 | -------------------------------------------------------------------------------- /storage/common.go: -------------------------------------------------------------------------------- 1 | // storage is the package which implements the underlying, on-disk storage 2 | // API for Torus servers. A single node can be tested with just a storage 3 | // implementation, but the distributor replaces it as a virtual implementation 4 | // of a much larger storage pool, provided by the cluster. This is storage 5 | // underneath a single node. 
6 | package storage 7 | 8 | import ( 9 | "github.com/coreos/pkg/capnslog" 10 | "github.com/prometheus/client_golang/prometheus" 11 | ) 12 | 13 | var clog = capnslog.NewPackageLogger("github.com/coreos/torus", "storage") 14 | 15 | var ( 16 | promBlocks = prometheus.NewGaugeVec(prometheus.GaugeOpts{ 17 | Name: "torus_storage_blocks", 18 | Help: "Gauge of number of blocks in local storage", 19 | }, []string{"storage"}) 20 | promBlocksAvail = prometheus.NewGaugeVec(prometheus.GaugeOpts{ 21 | Name: "torus_storage_blocks_total", 22 | Help: "Gauge of number of blocks available in local storage", 23 | }, []string{"storage"}) 24 | promBlocksRetrieved = prometheus.NewCounterVec(prometheus.CounterOpts{ 25 | Name: "torus_storage_read_blocks", 26 | Help: "Number of blocks returned from local block storage", 27 | }, []string{"storage"}) 28 | promBlocksFailed = prometheus.NewCounterVec(prometheus.CounterOpts{ 29 | Name: "torus_storage_failed_blocks", 30 | Help: "Number of blocks failed to be returned from local block storage", 31 | }, []string{"storage"}) 32 | promBlocksWritten = prometheus.NewCounterVec(prometheus.CounterOpts{ 33 | Name: "torus_storage_written_blocks", 34 | Help: "Number of blocks written to local block storage", 35 | }, []string{"storage"}) 36 | promBlockWritesFailed = prometheus.NewCounterVec(prometheus.CounterOpts{ 37 | Name: "torus_storage_failed_written_blocks", 38 | Help: "Number of blocks failed to be written to local block storage", 39 | }, []string{"storage"}) 40 | promBlocksDeleted = prometheus.NewCounterVec(prometheus.CounterOpts{ 41 | Name: "torus_storage_deleted_blocks", 42 | Help: "Number of blocks deleted from local block storage", 43 | }, []string{"storage"}) 44 | promBlockDeletesFailed = prometheus.NewCounterVec(prometheus.CounterOpts{ 45 | Name: "torus_storage_failed_deleted_blocks", 46 | Help: "Number of blocks failed to be deleted from local block storage", 47 | }, []string{"storage"}) 48 | promStorageFlushes = 
prometheus.NewCounterVec(prometheus.CounterOpts{ 49 | Name: "torus_storage_flushes", 50 | Help: "Number of times the storage layer is synced to disk", 51 | }, []string{"storage"}) 52 | promBytesPerBlock = prometheus.NewGauge(prometheus.GaugeOpts{ 53 | Name: "torus_storage_block_bytes", 54 | Help: "Number of bytes per block in the storage layer", 55 | }) 56 | ) 57 | 58 | func init() { 59 | prometheus.MustRegister(promBlocks) 60 | prometheus.MustRegister(promBlocksAvail) 61 | prometheus.MustRegister(promBlocksRetrieved) 62 | prometheus.MustRegister(promBlocksFailed) 63 | prometheus.MustRegister(promBlocksWritten) 64 | prometheus.MustRegister(promBlockWritesFailed) 65 | prometheus.MustRegister(promBlocksDeleted) 66 | prometheus.MustRegister(promBlockDeletesFailed) 67 | prometheus.MustRegister(promStorageFlushes) 68 | prometheus.MustRegister(promBytesPerBlock) 69 | } 70 | -------------------------------------------------------------------------------- /storage/mmap_file.go: -------------------------------------------------------------------------------- 1 | package storage 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "os" 7 | 8 | "github.com/barakmich/mmap-go" 9 | ) 10 | 11 | type MFile struct { 12 | mmap mmap.MMap 13 | blkSize uint64 14 | size uint64 15 | } 16 | 17 | func CreateOrOpenMFile(path string, size uint64, blkSize uint64) (*MFile, error) { 18 | finfo, err := os.Stat(path) 19 | if os.IsNotExist(err) { 20 | err := CreateMFile(path, size) 21 | if err != nil { 22 | return nil, err 23 | } 24 | } 25 | if finfo != nil && finfo.Size() != int64(size) { 26 | if finfo.Size() > int64(size) { 27 | return nil, fmt.Errorf("Specified size %d is smaller than current size %d.", size, finfo.Size()) 28 | } 29 | clog.Debugf("mfile: expand %s %d to %d", path, finfo.Size(), size) 30 | if err := os.Truncate(path, int64(size)); err != nil { return nil, err } 31 | } 32 | return OpenMFile(path, blkSize) 33 | } 34 | 35 | func CreateMFile(path string, size uint64) error { 36 | f, err := os.Create(path) 37 | if err != nil { 38 |
return err 39 | } 40 | defer f.Close() 41 | return f.Truncate(int64(size)) 42 | } 43 | 44 | func OpenMFile(path string, blkSize uint64) (*MFile, error) { 45 | var mf MFile 46 | 47 | f, err := os.OpenFile(path, os.O_RDWR, 0644) 48 | if err != nil { 49 | return nil, err 50 | } 51 | // We don't need the file handle after we mmap it. 52 | // See http://stackoverflow.com/questions/17490033/do-i-need-to-keep-a-file-open-after-calling-mmap-on-it. 53 | defer f.Close() 54 | 55 | st, err := f.Stat() 56 | if err != nil { 57 | return nil, err 58 | } 59 | mf.size = uint64(st.Size()) 60 | if mf.size%blkSize != 0 { 61 | return nil, fmt.Errorf("File size is not a multiple of the block size: %d size, %d blksize", mf.size, blkSize) 62 | } 63 | mf.mmap, err = mmap.Map(f, mmap.RDWR, 0) 64 | if err != nil { 65 | return nil, err 66 | } 67 | mf.blkSize = blkSize 68 | return &mf, nil 69 | } 70 | 71 | // GetBlock returns the n-th block as a byte slice, including any trailing zero padding. 72 | // The returned bytes are from the underlying mmap'd buffer and will be invalid after a call to Close(). 73 | func (m *MFile) GetBlock(n uint64) []byte { 74 | offset := n * m.blkSize 75 | if offset >= m.size { 76 | return nil 77 | } 78 | return m.mmap[offset : offset+m.blkSize] 79 | } 80 | 81 | // NumBlocks returns the total capacity of the file in blocks. 82 | func (m *MFile) NumBlocks() uint64 { 83 | return m.size / m.blkSize 84 | } 85 | 86 | // WriteBlock writes data to the n-th block in the file. 87 | func (m *MFile) WriteBlock(n uint64, data []byte) error { 88 | size := uint64(len(data)) 89 | if size > m.blkSize { 90 | return errors.New("Data block too large") 91 | } 92 | if size == m.blkSize { 93 | offset := n * m.blkSize 94 | copy(m.mmap[offset:], data) 95 | return nil 96 | } 97 | 98 | blk := m.GetBlock(n) 99 | if blk == nil { 100 | return errors.New("Offset too large") 101 | } 102 | 103 | // Copy the data and fill the rest of the block with zeros. 
104 | zero(blk[copy(blk, data):]) 105 | return nil 106 | } 107 |
// Flush schedules an asynchronous writeback of the mapping (FlushAsync does
// not wait for the data to reach disk); Close below does a synchronous Flush.
108 | func (m *MFile) Flush() error { 109 | return m.mmap.FlushAsync() 110 | } 111 | 112 | func (m *MFile) Close() error { 113 | if err := m.mmap.Flush(); err != nil { 114 | return err 115 | } 116 | return m.mmap.Unmap() 117 | } 118 |
// zero fills b with zero bytes, copying from an 8 KiB scratch buffer in
// chunks.
119 | func zero(b []byte) { 120 | zeros := make([]byte, 8*1024) 121 | for len(b) > 0 { 122 | b = b[copy(b, zeros):] 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /storage/mmap_file_test.go: -------------------------------------------------------------------------------- 1 | package storage 2 | 3 | import ( 4 | "io/ioutil" 5 | "os" 6 | "reflect" 7 | "testing" 8 | ) 9 |
// makeTempFilename reserves a unique temp-file name by creating and removing
// the file. NOTE(review): this leaves a small window in which another process
// could claim the name; acceptable for tests.
10 | func makeTempFilename() (string, error) { 11 | f, err := ioutil.TempFile("", "mfiletest") 12 | if err != nil { 13 | return "", err 14 | } 15 | n := f.Name() 16 | f.Close() 17 | os.Remove(n) 18 | return n, nil 19 | } 20 |
// benchmarkWrites measures WriteBlock throughput over an 8 MB file of 1 KB
// blocks, optionally calling Flush after every write.
21 | func benchmarkWrites(b *testing.B, flush bool) { 22 | n, err := makeTempFilename() 23 | if err != nil { 24 | b.Fatal(err) 25 | } 26 | err = CreateMFile(n, 1<<23) // 8 MB 27 | if err != nil { 28 | b.Fatal(err) 29 | } 30 | defer os.Remove(n) 31 | m, err := OpenMFile(n, 1024) // Block Size: 1 KB 32 | if err != nil { 33 | b.Fatal(err) 34 | } 35 | defer m.Close() 36 | 37 | b.ResetTimer() 38 | for i := 0; i < b.N; i++ { 39 | // Cycle through all the blocks in the file.
40 | // (file size / block size = 8192 blocks = 1<<13) 41 | m.WriteBlock(uint64(i%(1<<13)), []byte("some data and more")) 42 | if flush { 43 | m.Flush() 44 | } 45 | } 46 | } 47 | 48 | func BenchmarkWrites(b *testing.B) { 49 | benchmarkWrites(b, true) 50 | } 51 | 52 | func BenchmarkWritesNoFlush(b *testing.B) { 53 | benchmarkWrites(b, false) 54 | } 55 |
// TestWrites round-trips a partial (shorter-than-block) write through the
// mmap and reads it back. NOTE(review): the WriteBlock/Flush error returns
// are unchecked here.
56 | func TestWrites(t *testing.T) { 57 | n, err := makeTempFilename() 58 | if err != nil { 59 | t.Fatal(err) 60 | } 61 | err = CreateMFile(n, 1<<23) // 8 MB 62 | if err != nil { 63 | t.Fatal(err) 64 | } 65 | defer os.Remove(n) 66 | m, err := OpenMFile(n, 1024) // Block Size: 1 KB 67 | if err != nil { 68 | t.Fatal(err) 69 | } 70 | defer m.Close() 71 | 72 | data := []byte("some data") 73 | m.WriteBlock(2, data) 74 | m.Flush() 75 | b := m.GetBlock(2) 76 | if !reflect.DeepEqual(b[:len(data)], data) { 77 | t.Fatal("Got useless data back") 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /storage/temp.go: -------------------------------------------------------------------------------- 1 | package storage 2 | 3 | import ( 4 | "sync" 5 | 6 | "golang.org/x/net/context" 7 | 8 | "github.com/coreos/torus" 9 | ) 10 | 11 | var _ torus.BlockStore = &tempBlockStore{} 12 | 13 | func init() { 14 | torus.RegisterBlockStore("temp", openTempBlockStore) 15 | } 16 |
// tempBlockStore is an in-memory torus.BlockStore backed by a plain map,
// guarded by mut. A nil store map marks the store as closed.
17 | type tempBlockStore struct { 18 | mut sync.RWMutex 19 | store map[torus.BlockRef][]byte 20 | nBlocks uint64 21 | name string 22 | blockSize uint64 23 | } 24 |
// openTempBlockStore is the factory registered under "temp"; capacity in
// blocks is derived from the configured storage size and global block size.
25 | func openTempBlockStore(name string, cfg torus.Config, gmd torus.GlobalMetadata) (torus.BlockStore, error) { 26 | nBlocks := cfg.StorageSize / gmd.BlockSize 27 | promBlocksAvail.WithLabelValues(name).Set(float64(nBlocks)) 28 | promBlocks.WithLabelValues(name).Set(0) 29 | promBytesPerBlock.Set(float64(gmd.BlockSize)) 30 | return &tempBlockStore{ 31 | store: make(map[torus.BlockRef][]byte), 32 | nBlocks: nBlocks, 33 | name: name, 34 | blockSize: gmd.BlockSize, 35
| }, nil 36 | } 37 | 38 | func (t *tempBlockStore) Kind() string { return "temp" } 39 | func (t *tempBlockStore) Flush() error { return nil } 40 | func (t *tempBlockStore) BlockSize() uint64 { return t.blockSize } 41 |
// Close drops the backing map; subsequent reads/writes observe store == nil
// and return torus.ErrClosed.
42 | func (t *tempBlockStore) Close() error { 43 | t.mut.Lock() 44 | defer t.mut.Unlock() 45 | if t.store != nil { 46 | t.store = nil 47 | } 48 | return nil 49 | } 50 |
// NOTE(review): NumBlocks, UsedBlocks and HasBlock take the exclusive Lock
// although they only read; RLock (as GetBlock uses) would suffice.
51 | func (t *tempBlockStore) NumBlocks() uint64 { 52 | t.mut.Lock() 53 | defer t.mut.Unlock() 54 | return t.nBlocks 55 | } 56 | 57 | func (t *tempBlockStore) UsedBlocks() uint64 { 58 | t.mut.Lock() 59 | defer t.mut.Unlock() 60 | return uint64(len(t.store)) 61 | } 62 | 63 | func (t *tempBlockStore) HasBlock(_ context.Context, s torus.BlockRef) (bool, error) { 64 | t.mut.Lock() 65 | defer t.mut.Unlock() 66 | _, ok := t.store[s] 67 | return ok, nil 68 | } 69 |
// GetBlock returns the stored bytes for s. NOTE(review): the internal slice
// is returned without copying, so callers must treat it as read-only.
70 | func (t *tempBlockStore) GetBlock(_ context.Context, s torus.BlockRef) ([]byte, error) { 71 | t.mut.RLock() 72 | defer t.mut.RUnlock() 73 | 74 | if t.store == nil { 75 | promBlocksFailed.WithLabelValues(t.name).Inc() 76 | return nil, torus.ErrClosed 77 | } 78 | 79 | x, ok := t.store[s] 80 | if !ok { 81 | promBlocksFailed.WithLabelValues(t.name).Inc() 82 | return nil, torus.ErrBlockNotExist 83 | } 84 | promBlocksRetrieved.WithLabelValues(t.name).Inc() 85 | return x, nil 86 | } 87 |
// WriteBlock stores a private copy of data under s. NOTE(review): the
// capacity check counts distinct keys, so overwriting an existing block
// while the store is full still returns ErrOutOfSpace.
88 | func (t *tempBlockStore) WriteBlock(_ context.Context, s torus.BlockRef, data []byte) error { 89 | t.mut.Lock() 90 | defer t.mut.Unlock() 91 | 92 | if t.store == nil { 93 | promBlockWritesFailed.WithLabelValues(t.name).Inc() 94 | return torus.ErrClosed 95 | } 96 | if int(t.nBlocks) <= len(t.store) { 97 | return torus.ErrOutOfSpace 98 | } 99 | buf := make([]byte, len(data)) 100 | copy(buf, data) 101 | t.store[s] = buf 102 | promBlocks.WithLabelValues(t.name).Set(float64(len(t.store))) 103 | promBlocksWritten.WithLabelValues(t.name).Inc() 104 | return nil 105 | } 106 |
// WriteBuf allocates a zeroed block-sized buffer under s and returns it for
// the caller to fill in place.
107 | func (t *tempBlockStore) WriteBuf(_ context.Context, s torus.BlockRef)
([]byte, error) { 108 | t.mut.Lock() 109 | defer t.mut.Unlock() 110 | 111 | if t.store == nil { 112 | promBlockWritesFailed.WithLabelValues(t.name).Inc() 113 | return nil, torus.ErrClosed 114 | } 115 | if int(t.nBlocks) <= len(t.store) { 116 | return nil, torus.ErrOutOfSpace 117 | } 118 | buf := make([]byte, t.blockSize) 119 | t.store[s] = buf 120 | promBlocks.WithLabelValues(t.name).Set(float64(len(t.store))) 121 | promBlocksWritten.WithLabelValues(t.name).Inc() 122 | return buf, nil 123 | } 124 |
// DeleteBlock removes s if present. NOTE(review): deleting a missing block
// is a silent no-op that still increments the deleted counter.
125 | func (t *tempBlockStore) DeleteBlock(_ context.Context, s torus.BlockRef) error { 126 | t.mut.Lock() 127 | defer t.mut.Unlock() 128 | 129 | if t.store == nil { 130 | promBlockDeletesFailed.WithLabelValues(t.name).Inc() 131 | return torus.ErrClosed 132 | } 133 | 134 | delete(t.store, s) 135 | promBlocks.WithLabelValues(t.name).Set(float64(len(t.store))) 136 | promBlocksDeleted.WithLabelValues(t.name).Inc() 137 | return nil 138 | } 139 |
// BlockIterator snapshots the current keys under RLock; the returned iterator
// is unaffected by later writes, and its order is nondeterministic (Go map
// iteration order).
140 | func (t *tempBlockStore) BlockIterator() torus.BlockIterator { 141 | t.mut.RLock() 142 | defer t.mut.RUnlock() 143 | blocks := make([]torus.BlockRef, 0, len(t.store)) 144 | for k := range t.store { 145 | blocks = append(blocks, k) 146 | } 147 | return &tempIterator{ 148 | blocks: blocks, 149 | index: -1, 150 | } 151 | } 152 |
// tempIterator walks a fixed slice of refs; index starts at -1 so the first
// Next() lands on element 0. BlockRef is only valid after Next() returns true.
153 | type tempIterator struct { 154 | blocks []torus.BlockRef 155 | index int 156 | } 157 | 158 | func (i *tempIterator) Err() error { return nil } 159 | 160 | func (i *tempIterator) Next() bool { 161 | i.index++ 162 | if i.index >= len(i.blocks) { 163 | return false 164 | } 165 | return true 166 | } 167 | 168 | func (i *tempIterator) BlockRef() torus.BlockRef { return i.blocks[i.index] } 169 | 170 | func (i *tempIterator) Close() error { return nil } 171 | -------------------------------------------------------------------------------- /version.go: -------------------------------------------------------------------------------- 1 | // Torus is a distributed storage system, allowing for the creation and 2
| // coordination of sharded, replicated files. For more details, see the README 3 | // at https://github.com/coreos/torus 4 | package torus 5 |
// Version holds the release string stamped at build time (presumably via
// go build -ldflags "-X ...Version=..."; confirm against the build scripts).
// The zero value (empty string) means an unstamped development build.
6 | // Version is set by build scripts, do not touch. 7 | var Version string 8 | --------------------------------------------------------------------------------