├── Makefile ├── README.md ├── bs3.png ├── config.toml ├── contrib └── systemd │ ├── bs3-gc.service │ ├── bs3-gc.timer │ └── bs3.service ├── go.mod ├── go.sum ├── internal ├── bs3 │ ├── bs3.go │ ├── doc.go │ ├── gc.go │ ├── key │ │ └── key.go │ ├── mapproxy │ │ ├── mapproxy.go │ │ └── sectormap │ │ │ └── sectormap.go │ └── objproxy │ │ ├── objproxy.go │ │ └── s3 │ │ └── s3.go ├── config │ └── config.go └── null │ └── null.go └── main.go /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY = install fmt tidy clean 2 | 3 | SOURCES := $(shell find . -name "*.go") 4 | SYSTEMD_UNITS := $(wildcard contrib/systemd/*) 5 | SYSTEMD_PATH := /etc/systemd/system 6 | SYSTEMD_CONTRIB_PATH := contrib/systemd 7 | 8 | bs3: $(SOURCES) 9 | go build 10 | 11 | install: bs3 $(SYSTEMD_UNITS) 12 | install -D bs3 /usr/local/bin/bs3 13 | install -D -m 600 config.toml /etc/bs3/config.toml 14 | install -D -m 644 $(SYSTEMD_CONTRIB_PATH)/bs3.service $(SYSTEMD_PATH)/bs3.service 15 | install -D -m 644 $(SYSTEMD_CONTRIB_PATH)/bs3-gc.service $(SYSTEMD_PATH)/bs3-gc.service 16 | install -D -m 644 $(SYSTEMD_CONTRIB_PATH)/bs3-gc.timer $(SYSTEMD_PATH)/bs3-gc.timer 17 | 18 | fmt: 19 | go fmt ./... 20 | 21 | tidy: 22 | go mod tidy 23 | 24 | clean: 25 | rm -f bs3 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BS3: Block Device in S3 2 | 3 | ## Write Performance Comparison 4 | 5 |

6 | 7 | ## Requirements 8 | 9 | * BUSE 10 | * GNU Make 11 | * Go 1.16 or newer 12 | 13 | ## Installation 14 | 15 | ``` 16 | make 17 | sudo make install 18 | ``` 19 | 20 | ## Usage 21 | 22 | ``` 23 | # Edit /etc/bs3/config.toml first 24 | 25 | systemctl start bs3 26 | systemctl status bs3 27 | systemctl stop bs3 28 | ``` 29 | -------------------------------------------------------------------------------- /bs3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asch/bs3/f1a0c34ff3511eadbdb57c9e7c94d743d24366e7/bs3.png -------------------------------------------------------------------------------- /config.toml: -------------------------------------------------------------------------------- 1 | # Specify the major of the corresponding buse device you want to configure and 2 | # connect to. E.g. 0 if you want to work with /dev/buse0. 3 | major = 0 4 | 5 | # Number of user-space daemon threads which is also a maximal number of queues 6 | # storage stack uses. This is limited to the number of CPUs. I.e. minimal value 7 | # is 1 and maximal is number of CPUs. Optimally it should be set to the number 8 | # of CPUs. 0 means optimal value. 9 | threads = 0 10 | 11 | # Size of the created block device in GB. 12 | size = 8 #GB 13 | 14 | # Block size of created device. 512 or 4096. It is forbidden to change 15 | # block_size on the existent block device. In B. 16 | block_size = 4096 #B 17 | 18 | # Minimal IO size in Bytes. Useful for userspace raid configuration working 19 | # with chunks larger than block size. E.g. mdraid uses 512KB chunks size. 20 | io_min = 0 21 | 22 | # Optimal IO size in Bytes. Useful for userspace raid configuration working 23 | # with chunks larger than block size. E.g. mdraid uses 512KB chunks size. 24 | io_opt = 0 25 | 26 | # Whether IOs should be scheduled by linux kernel stack. 27 | scheduler = false 28 | 29 | # IO queue depth for created block device. 
30 | queue_depth = 256 31 | 32 | # Use null backend, i.e. just immediately acknowledge reads and writes and drop 33 | # them. Useful for testing raw BUSE performance. Otherwise useless because all 34 | # data are lost. 35 | null = false 36 | 37 | # Enable web-based go pprof profiler for performance profiling. 38 | profiler = false 39 | 40 | # Profiler port. 41 | profiler_port = 6060 42 | 43 | # Configuration related to AWS S3 44 | [s3] 45 | # AWS Access Key 46 | access_key = "Server-Access-Key" 47 | 48 | # AWS Secret Key 49 | secret_key = "Server-Secret-Key" 50 | 51 | # Bucket where to store objects. 52 | bucket = "bs3" 53 | 54 | # ://: of the S3 backend. AWS S3 endpoint is used when empty string. 55 | remote = "" 56 | 57 | # Region to use. 58 | region = "us-east-1" 59 | 60 | # Max number of threads to spawn for uploads and downloads. 61 | uploaders = 384 62 | downloaders = 384 63 | 64 | # Configuration specific to write path. 65 | [write] 66 | # Semantics of the flush request. True means durable device, i.e. flush request 67 | # gets acknowledge when data are persisted on the backend. False means 68 | # eventually durable, i.e. flush request just a barrier. 69 | durable = false 70 | 71 | # Size of the shared memory between kernel and user space for data being 72 | # written. The size is per one thread. In MB. 73 | shared_buffer_size = 32 #MB 74 | 75 | # Size of the chunk. Chunk is the smallest piece which can be sent to the user 76 | # space and where all writes are stored. In MB. 77 | chunk_size = 4 #MB 78 | 79 | # The whole address space is divided into collision domains. Every collision 80 | # domain has its own counter for writes' sequential numbers. This is useful 81 | # when we don't want to have one shared counter for writes. Instead we split it 82 | # into parts and save the cache coherency protocol traffic. In MB. 83 | collision_chunk_size = 1 #MB 84 | 85 | # Configuration specific to read path. 
86 | [read] 87 | 88 | # Size of the shared memory between kernel and user space for data being read. 89 | # The size is per one thread. In MB. 90 | shared_buffer_size = 32 #MB 91 | 92 | # Garbage Collection related configuration 93 | [gc] 94 | # Step when scanning the extent map. In blocks. 95 | step = 1024 96 | 97 | # Threshold for live data in the object. Objects under this threshold are 98 | # garbage collected by the "threshold GC" which is triggered by SIGUSR1. This 99 | # type of GC is heavy on resources and should be planned by the timer for 100 | # off-peak times. 101 | live_data = 0.3 102 | 103 | # Timeout to wait before any of the requests from the GC thread will be served by the 104 | # extent map and object manager. In ms. 105 | idle_timeout = 200 106 | 107 | # How many seconds to wait before the next periodic GC round. This is related to 108 | # "dead GC" cleaning just dead objects. It is very light on resources and does not 109 | # contend for the extent map like the "threshold GC". 110 | wait = 600 111 | 112 | # Configuration specific to the logger. 113 | [log] 114 | # Minimal level of logged messages. Following levels are provided: 115 | # panic 5, fatal 4, error 3, warn 2, info 1, debug 0, trace -1 116 | level = -1 117 | 118 | # Pretty print means nicer log output for humans but much slower than non-pretty 119 | # json output. 
120 | pretty = true 121 | -------------------------------------------------------------------------------- /contrib/systemd/bs3-gc.service: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2021 Vojtech Aschenbrenner 2 | 3 | [Unit] 4 | Description=Run threshold GC for block device in s3 5 | 6 | [Service] 7 | Type=simple 8 | ExecStart=pkill -USR1 -f 'bs3 -c /etc/bs3/config.toml' 9 | -------------------------------------------------------------------------------- /contrib/systemd/bs3-gc.timer: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2021 Vojtech Aschenbrenner 2 | 3 | [Unit] 4 | Description=Run threshold GC for block device in s3 every week 5 | 6 | [Timer] 7 | OnCalendar=weekly 8 | 9 | [Install] 10 | WantedBy=multi-user.target 11 | -------------------------------------------------------------------------------- /contrib/systemd/bs3.service: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2021 Vojtech Aschenbrenner 2 | 3 | [Unit] 4 | Description=Block device in s3 5 | After=local-fs.target network-online.target 6 | 7 | [Service] 8 | Type=simple 9 | ExecStart=bs3 -c /etc/bs3/config.toml 10 | KillMode=mixed 11 | 12 | [Install] 13 | WantedBy=multi-user.target 14 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/asch/bs3 2 | 3 | go 1.16 4 | 5 | require ( 6 | github.com/asch/buse/lib/go/buse v0.0.0-20220419090641-f12ccb1d15a9 7 | github.com/aws/aws-sdk-go v1.38.60 8 | github.com/ilyakaznacheev/cleanenv v1.2.5 9 | github.com/rs/zerolog v1.22.0 10 | golang.org/x/net v0.0.0-20210610132358-84b48f89b13b 11 | golang.org/x/sys v0.0.0-20220330033206-e17cdc41300f // indirect 12 | ) 13 | -------------------------------------------------------------------------------- /go.sum: 
-------------------------------------------------------------------------------- 1 | github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ= 2 | github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= 3 | github.com/asch/buse/lib/go/buse v0.0.0-20220419090641-f12ccb1d15a9 h1:suBdWCu2BxNxC6YETUuFnYanAnffGtBBQ060v3rO4/A= 4 | github.com/asch/buse/lib/go/buse v0.0.0-20220419090641-f12ccb1d15a9/go.mod h1:dxWl+7wjthiJ2JB8vNGTMy1FW7W8o2khYiGnYitxgms= 5 | github.com/aws/aws-sdk-go v1.38.60 h1:MgyEsX0IMwivwth1VwEnesBpH0vxbjp5a0w1lurMOXY= 6 | github.com/aws/aws-sdk-go v1.38.60/go.mod h1:hcU610XS61/+aQV88ixoOzUoG7v3b31pl2zKMmprdro= 7 | github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4= 8 | github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= 9 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 10 | github.com/ilyakaznacheev/cleanenv v1.2.5 h1:/SlcF9GaIvefWqFJzsccGG/NJdoaAwb7Mm7ImzhO3DM= 11 | github.com/ilyakaznacheev/cleanenv v1.2.5/go.mod h1:/i3yhzwZ3s7hacNERGFwvlhwXMDcaqwIzmayEhbRplk= 12 | github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg= 13 | github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo= 14 | github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8= 15 | github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U= 16 | github.com/joho/godotenv v1.3.0 h1:Zjp+RcGpHhGlrMbJzXTrZZPrWj+1vfm90La1wgB6Bhc= 17 | github.com/joho/godotenv v1.3.0/go.mod h1:7hK45KPybAkOC6peb+G5yklZfMxEjkZhHbwpqxOKXbg= 18 | github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= 19 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 20 | github.com/pmezard/go-difflib 
v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 21 | github.com/rs/xid v1.2.1/go.mod h1:+uKXf+4Djp6Md1KODXJxgGQPKngRmWyn10oCKFzNHOQ= 22 | github.com/rs/zerolog v1.22.0 h1:XrVUjV4K+izZpKXZHlPrYQiDtmdGiCylnT4i43AAWxg= 23 | github.com/rs/zerolog v1.22.0/go.mod h1:ZPhntP/xmq1nnND05hhpAh2QMhSsA4UN3MGZ6O2J3hM= 24 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 25 | github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= 26 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 27 | golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= 28 | golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= 29 | golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= 30 | golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= 31 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 32 | golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= 33 | golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= 34 | golang.org/x/net v0.0.0-20210610132358-84b48f89b13b h1:k+E048sYJHyVnsr1GDrRZWQ32D2C7lWs9JRc0bel53A= 35 | golang.org/x/net v0.0.0-20210610132358-84b48f89b13b/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= 36 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 37 | golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 38 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 39 | golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod 
h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 40 | golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 41 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 42 | golang.org/x/sys v0.0.0-20210119212857-b64e53b001e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 43 | golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 44 | golang.org/x/sys v0.0.0-20220330033206-e17cdc41300f h1:rlezHXNlxYWvBCzNses9Dlc7nGFaNMJeqLolcmQSSZY= 45 | golang.org/x/sys v0.0.0-20220330033206-e17cdc41300f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 46 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= 47 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 48 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 49 | golang.org/x/text v0.3.6 h1:aRYxNxv6iGQlyVaZmk6ZgYEDa+Jg18DxebPSrd6bg1M= 50 | golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 51 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 52 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= 53 | golang.org/x/tools v0.1.0/go.mod h1:xkSsbof2nBLbhDlRMhhhyNLN/zl3eTqcnHD5viDpcZ0= 54 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 55 | golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 56 | golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 57 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= 58 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod 
h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 59 | gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 60 | gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10= 61 | gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 62 | olympos.io/encoding/edn v0.0.0-20200308123125-93e3b8dd0e24 h1:sreVOrDp0/ezb0CHKVek/l7YwpxPJqv+jT3izfSphA4= 63 | olympos.io/encoding/edn v0.0.0-20200308123125-93e3b8dd0e24/go.mod h1:oVgVk4OWVDi43qWBEyGhXgYxt7+ED4iYNpTngSLX2Iw= 64 | -------------------------------------------------------------------------------- /internal/bs3/bs3.go: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2021 Vojtech Aschenbrenner 2 | 3 | package bs3 4 | 5 | import ( 6 | "encoding/binary" 7 | "sync" 8 | "time" 9 | 10 | "github.com/rs/zerolog/log" 11 | 12 | "github.com/asch/bs3/internal/bs3/key" 13 | "github.com/asch/bs3/internal/bs3/mapproxy" 14 | "github.com/asch/bs3/internal/bs3/mapproxy/sectormap" 15 | "github.com/asch/bs3/internal/bs3/objproxy" 16 | "github.com/asch/bs3/internal/bs3/objproxy/s3" 17 | "github.com/asch/bs3/internal/config" 18 | ) 19 | 20 | const ( 21 | // Size of the metadata for one write in the write chunk read from the 22 | // kernel. 23 | WRITE_ITEM_SIZE = 32 24 | 25 | // Key representing the object where serialized version of map is 26 | // stored. 27 | checkpointKey = -1 28 | 29 | // Typical number of extents per object for precise memory allocation 30 | // for return values. In the worst case reallocation happens. 31 | typicalExtentsPerObject = 128 32 | 33 | // Sector is a linux constant, which is always 512, no matter how big your sectors or blocks 34 | // are. Please be careful since the terminology is ambiguous. 35 | sectorUnit = 512 36 | ) 37 | 38 | // bs3 implements BuseReadWriter interface which can be passed to the buse 39 | // package. 
Buse package wraps the communication with the BUSE kernel module 40 | // and does all the necessary configuration and low level operations. 41 | // 42 | // bs3 uses s3 protocol to communicate with the storage backend (most probably 43 | // aws s3) but it can be anything else. It manages the mapping between local 44 | // device and remote backend and performs all the operations for correct 45 | // functionality. The default structure is sectormap but it can be changed 46 | // trivially. 47 | type bs3 struct { 48 | // Proxy struct for the operations on objects like uploads, downloads 49 | // etc. Proxy structs are used for serialization and prioritization of 50 | // requests. 51 | objectStoreProxy objproxy.ObjectProxy 52 | 53 | // Proxy struct for the operations on extent map like updates, lookups 54 | // etc. Proxy structs are used for serialization and prioritization of 55 | // requests. 56 | extentMapProxy mapproxy.ExtentMapProxy 57 | 58 | // Data private to the garbage collection process. 59 | gcData struct { 60 | // Reference counter of objects which are actually downloaded 61 | // and hence cannot be deleted from the storage backend. 62 | refcounter map[int64]int64 63 | 64 | // Lock guarding the refcounter. 65 | reflock sync.Mutex 66 | } 67 | 68 | // Size of the metadata for one write in the write chunk read from the 69 | // kernel. 70 | write_item_size int 71 | 72 | // Size of the object portion which contains all writes metadata in the 73 | // chunk from the kernel. After this metadata_size offset real data are 74 | // stored. 75 | metadata_size int 76 | } 77 | 78 | // Returns bs3 with default configuration, i.e. with s3 as a communication 79 | // protocol and sectormap as an extent map. 
80 | func NewWithDefaults() (*bs3, error) { 81 | s3Handler, err := s3.New(s3.Options{ 82 | Remote: config.Cfg.S3.Remote, 83 | Region: config.Cfg.S3.Region, 84 | AccessKey: config.Cfg.S3.AccessKey, 85 | SecretKey: config.Cfg.S3.SecretKey, 86 | Bucket: config.Cfg.S3.Bucket, 87 | }) 88 | 89 | if err != nil { 90 | return nil, err 91 | } 92 | 93 | mapSize := config.Cfg.Size / int64(config.Cfg.BlockSize) 94 | bs3 := New(s3Handler, sectormap.New(mapSize)) 95 | 96 | return bs3, nil 97 | } 98 | 99 | // Returns bs3 with provided protocol for communication with backend storage 100 | // and extentMap for keeping the mapping between local device and remote 101 | // backend. 102 | func New(objectStore objproxy.ObjectUploadDownloaderAt, extentMap mapproxy.ExtentMapper) *bs3 { 103 | bs3 := bs3{ 104 | objectStoreProxy: objproxy.New( 105 | objectStore, config.Cfg.S3.Uploaders, config.Cfg.S3.Downloaders, 106 | time.Duration(config.Cfg.GC.IdleTimeoutMs)*time.Millisecond), 107 | 108 | extentMapProxy: mapproxy.New( 109 | extentMap, time.Duration(config.Cfg.GC.IdleTimeoutMs)*time.Millisecond), 110 | 111 | metadata_size: config.Cfg.Write.ChunkSize / config.Cfg.BlockSize * WRITE_ITEM_SIZE, 112 | 113 | write_item_size: WRITE_ITEM_SIZE, 114 | } 115 | 116 | bs3.gcData.refcounter = make(map[int64]int64) 117 | 118 | return &bs3 119 | } 120 | 121 | // Handle writes comming from the buse library. writes contain number write 122 | // commands in this call and chunk contains memory where these commands are 123 | // stored together with their data. First part of the chunk are metadata, until 124 | // metadata_size and the rest are data of all writes in the same order. 125 | // 126 | // We read all the writes metadata, create a list and pass it to the extent map 127 | // to update the mapping. Before we actually do that, we wait until the whole 128 | // chunk us uploaded with generated key, which is just one more than the 129 | // previous one. 
130 | func (b *bs3) BuseWrite(writes int64, chunk []byte) error { 131 | key := key.Next() 132 | 133 | metadata := chunk[:b.metadata_size] 134 | extents := make([]mapproxy.Extent, writes) 135 | 136 | var writtenTotalBlocks uint64 137 | for i := int64(0); i < writes; i++ { 138 | e := parseExtent(metadata[:b.write_item_size]) 139 | extents[i] = e 140 | metadata = metadata[b.write_item_size:] 141 | writtenTotalBlocks += uint64(e.Length) 142 | } 143 | 144 | // Zero out the rest of the space reserved for writes. This is because 145 | // of recovery process, where we lose information about size of the 146 | // metadata. 147 | for i := 0; i < len(metadata); i++ { 148 | metadata[i] = 0 149 | } 150 | 151 | dataSize := writtenTotalBlocks * uint64(config.Cfg.BlockSize) 152 | object := chunk[:uint64(b.metadata_size)+dataSize] 153 | 154 | // Some s3 backends, like minio just drops connection when they are 155 | // under load. Hence the loop with exponential backoff till the 156 | // operation succeeds. There is no point to return error, since the 157 | // best thing we can do is to try infinitely and print a message to 158 | // log. 159 | for i := 1; ; i *= 2 { 160 | err := b.objectStoreProxy.Upload(key, object, true) 161 | if err == nil { 162 | break 163 | } 164 | log.Info().Err(err).Send() 165 | time.Sleep(time.Duration(i) * time.Second) 166 | } 167 | 168 | b.extentMapProxy.Update(extents, int64(b.metadata_size/config.Cfg.BlockSize), key) 169 | 170 | return nil 171 | } 172 | 173 | // Download part of the object to the memory buffer chunk. The part is 174 | // specified by part and it is necessary to call wg.Done() when the upload is 175 | // finished. 176 | func (b *bs3) downloadObjectPart(part mapproxy.ObjectPart, chunk []byte, wg *sync.WaitGroup) { 177 | defer wg.Done() 178 | 179 | // Some s3 backends, like minio just drops connection when they are 180 | // under load. Hence the loop with exponential backoff till the 181 | // operation succeeds. 
There is no point in returning an error, since the 182 | 	// best thing we can do is to try infinitely and print a message to 183 | 	// log. 184 | 	for i := 1; ; i *= 2 { 185 | 		err := b.objectStoreProxy.Download(part.Key, chunk, part.Sector*int64(config.Cfg.BlockSize), true) 186 | 		if err == nil { 187 | 			break 188 | 		} 189 | 		log.Info().Err(err).Send() 190 | 		time.Sleep(time.Duration(i) * time.Second) 191 | 	} 192 | } 193 | 194 | // Read the extent starting at sector with the given length into the buffer chunk. 195 | // Length of the chunk is the same as the length variable. This function consults 196 | // the extent map and asynchronously downloads all needed pieces to reconstruct 197 | // the logical extent. 198 | func (b *bs3) BuseRead(sector, length int64, chunk []byte) error { 199 | 	objectPieces := b.getObjectPiecesRefCounterInc(sector, length) 200 | 201 | 	var wg sync.WaitGroup 202 | 	for _, op := range objectPieces { 203 | 		size := op.Length * int64(config.Cfg.BlockSize) 204 | 		if op.Key != mapproxy.NotMappedKey { 205 | 			wg.Add(1) 206 | 			go b.downloadObjectPart(op, chunk[:size], &wg) 207 | 		} 208 | 		chunk = chunk[size:] 209 | 	} 210 | 211 | 	wg.Wait() 212 | 213 | 	b.objectPiecesRefCounterDec(objectPieces) 214 | 215 | 	return nil 216 | } 217 | 218 | // Before the buse library communication with the kernel starts, we restore the map 219 | // stored on the backend and register a signal handler for SIGUSR1 which serves 220 | // for threshold garbage collection. Then we run an infinite loop with garbage 221 | // collection deleting just completely dead objects without any data. It is very 222 | // fast and efficient and has a huge impact on the backend space utilization. 223 | // Hence we run it continuously. 
224 | func (b *bs3) BusePreRun() { 225 | if !config.Cfg.SkipCheckpoint { 226 | b.restore() 227 | } 228 | 229 | b.registerSigUSR1Handler() 230 | 231 | go b.gcDead() 232 | } 233 | 234 | // After disconnecting from the kernel module and just before shuting the 235 | // daemon down we save the map to the backend so it can be restored during next 236 | // start and mapping is not lost. 237 | func (b *bs3) BusePostRemove() { 238 | if !config.Cfg.SkipCheckpoint { 239 | b.checkpoint() 240 | } 241 | } 242 | 243 | // Returns object pieces for reconstructing logical extent but before that 244 | // safely increments the refcounter for the objects. Objects in refcounter are 245 | // excluded from garbage collection. 246 | func (b *bs3) getObjectPiecesRefCounterInc(sector, length int64) []mapproxy.ObjectPart { 247 | b.gcData.reflock.Lock() 248 | defer b.gcData.reflock.Unlock() 249 | 250 | objectPieces := b.extentMapProxy.Lookup(int64(sector), int64(length)) 251 | 252 | for _, op := range objectPieces { 253 | b.gcData.refcounter[op.Key]++ 254 | } 255 | 256 | return objectPieces 257 | } 258 | 259 | // Decrements the refcounter for the object pieces. Objects in refcounter are 260 | // excluded from garbage collection. 261 | func (b *bs3) objectPiecesRefCounterDec(objectPieces []mapproxy.ObjectPart) { 262 | b.gcData.reflock.Lock() 263 | 264 | for _, op := range objectPieces { 265 | b.gcData.refcounter[op.Key]-- 266 | } 267 | 268 | b.gcData.reflock.Unlock() 269 | } 270 | 271 | // Restores the map from the checkpoint saved on the backend and updates the 272 | // current object key accordingly. If it exists. 273 | func (b *bs3) restoreFromCheckpoint() { 274 | mapSize, err := b.objectStoreProxy.Instance.GetObjectSize(checkpointKey) 275 | if err == nil { 276 | log.Info().Msg("->Checkpoint found. 
Checkpoint recovery started.") 277 | 278 | compressedMap := make([]byte, mapSize) 279 | b.objectStoreProxy.Download(checkpointKey, compressedMap, 0, false) 280 | newKey := b.extentMapProxy.Instance.DeserializeAndReturnNextKey(compressedMap) 281 | key.Replace(newKey) 282 | 283 | log.Info().Msgf("->Checkpoint recovery process finished. Last object from checkpoint is %d.", newKey) 284 | } 285 | } 286 | 287 | // Restores the map from individual objects. It reconstructs the map replaying 288 | // all the writes from metadata part of continuous sequence of objects until a 289 | // missing object is found. This is the point where prefix consistency is 290 | // corrupted and we cannot recover more. Any successive objects are deleted. 291 | func (b *bs3) restoreFromObjects() { 292 | log.Info().Msg("->Looking for objects to do roll forward recovery.") 293 | 294 | keyBefore := key.Current() 295 | for ; ; key.Next() { 296 | header := make([]byte, b.metadata_size) 297 | size, err := b.objectStoreProxy.Instance.GetObjectSize(key.Current()) 298 | if err != nil { 299 | // Prefix consistency broken. 300 | break 301 | } 302 | if size == 0 { 303 | // Garbage collected object, that is OK, prefix 304 | // consistency kept. 305 | continue 306 | } 307 | 308 | // Get writes metadata for object. 309 | err = b.objectStoreProxy.Instance.DownloadAt(key.Current(), header, 0) 310 | if err != nil { 311 | break 312 | } 313 | 314 | // Replay all writes from metadata part until extent with 315 | // length 0 is found. It is invalid value and it means that the 316 | // memory is zeroed, which means end of the metadata section of 317 | // the object. The memory is zeroed out in BuseWrite function 318 | // where the object is uploaded. 
319 | 		extents := make([]mapproxy.Extent, 0, typicalExtentsPerObject) 320 | 		for { 321 | 			e := parseExtent(header[:b.write_item_size]) 322 | 			if e.Length == 0 { 323 | 				break 324 | 			} 325 | 			extents = append(extents, e) 326 | 			header = header[b.write_item_size:] 327 | 		} 328 | 329 | 		dataBegin := int64(b.metadata_size / config.Cfg.BlockSize) 330 | 		b.extentMapProxy.Update(extents, dataBegin, key.Current()) 331 | 	} 332 | 333 | 	if keyBefore == key.Current() { 334 | 		log.Info().Msg("->No extra objects found for roll forward recovery.") 335 | 	} else { 336 | 		log.Info().Msgf("->Extra %d objects for roll forward recovery found.", key.Current()-keyBefore) 337 | 	} 338 | } 339 | 340 | // Restores the map from the saved checkpoint and then continues the restoration from 341 | // individual objects. E.g. when a crash happens, the checkpoint is not uploaded, 342 | // hence the old checkpoint is read. However, a new set of objects fulfilling 343 | // prefix consistency may already have been uploaded. 344 | func (b *bs3) restore() { 345 | 	log.Info().Msgf("Checking for old volume in bucket %s.", config.Cfg.S3.Bucket) 346 | 347 | 	b.restoreFromCheckpoint() 348 | 	b.restoreFromObjects() 349 | 	b.objectStoreProxy.Instance.DeleteKeyAndSuccessors(key.Current()) 350 | 351 | 	if key.Current() == 0 { 352 | 		log.Info().Msgf("No volume found. Bucket %s is used for new volume.", config.Cfg.S3.Bucket) 353 | 	} else { 354 | 		log.Info().Msgf("Volume found in bucket %s. The last object is %d.", config.Cfg.S3.Bucket, key.Current()) 355 | 	} 356 | } 357 | 358 | // Serializes the extent map and uploads it to the backend. 
359 | func (b *bs3) checkpoint() { 360 | log.Info().Msg("Checkpointing started.") 361 | 362 | log.Info().Msg("->Serialization of extent map started.") 363 | dump := b.extentMapProxy.Instance.Serialize() 364 | log.Info().Msg("->Serialization of extent map finished.") 365 | 366 | log.Info().Msg("->Upload of extent map started.") 367 | b.objectStoreProxy.Upload(checkpointKey, dump, false) 368 | log.Info().Msg("->Upload of extent map finished.") 369 | 370 | log.Info().Msgf("Checkpointing finished. Last checkpointed object is %d.", key.Current()) 371 | } 372 | 373 | // Parses write extent information from 32 bytes of raw memory. The memory is 374 | // one write in metadata section of the object. 375 | func parseExtent(b []byte) mapproxy.Extent { 376 | return mapproxy.Extent{ 377 | Sector: int64(binary.LittleEndian.Uint64(b[:8]) * sectorUnit / uint64(config.Cfg.BlockSize)), 378 | Length: int64(binary.LittleEndian.Uint64(b[8:16]) * sectorUnit / uint64(config.Cfg.BlockSize)), 379 | SeqNo: int64(binary.LittleEndian.Uint64(b[16:24])), 380 | Flag: int64(binary.LittleEndian.Uint64(b[24:32])), 381 | } 382 | } 383 | -------------------------------------------------------------------------------- /internal/bs3/doc.go: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2021 Vojtech Aschenbrenner 2 | 3 | // bs3 is a userspace daemon using golang buse library to create a block 4 | // device. All operations on the block device are handled by the daemon. bs3 5 | // stores data in object storage via s3 protocol and maintains the mapping 6 | // between logical block device space and the backend. 7 | // 8 | // bs3 defines two interfaces. One for the extent map and one for the storage 9 | // backend operations. These two parts can be trivially changed just by 10 | // implementing corresponding interface. 
11 | package bs3 12 | -------------------------------------------------------------------------------- /internal/bs3/gc.go: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2021 Vojtech Aschenbrenner 2 | 3 | package bs3 4 | 5 | import ( 6 | "encoding/binary" 7 | "os" 8 | "os/signal" 9 | "sync" 10 | "syscall" 11 | "time" 12 | 13 | "github.com/asch/bs3/internal/bs3/key" 14 | "github.com/asch/bs3/internal/bs3/mapproxy" 15 | "github.com/asch/bs3/internal/config" 16 | 17 | "github.com/rs/zerolog/log" 18 | ) 19 | 20 | const ( 21 | // Typical number of newly created objects during one threshold GC run. 22 | // Just an optimization of memory allocation, in the worst case 23 | // reallocation occurs. 24 | typicalNewObjectsPerGC = 64 25 | 26 | // Typical number of extents per one garbage collected object. Just an 27 | // optimization of memory allocation, in the worst case reallocation 28 | // occurs. 29 | typicalExtentsPerGCObject = 64 30 | ) 31 | 32 | // Select objects viable for threshold GC. When an object utilization is under 33 | // the threshold it is selected for GC. The object with the highest key is 34 | // never collected because of oscilation. 35 | func (b *bs3) filterKeysToCollect(utilization map[int64]int64, ratio float64) map[int64]struct{} { 36 | var maxKey int64 37 | collect := make(map[int64]struct{}) 38 | 39 | for k, v := range utilization { 40 | used := v * int64(config.Cfg.BlockSize) 41 | r := float64(used) / float64(config.Cfg.Write.ChunkSize) 42 | if r < ratio { 43 | collect[k] = struct{}{} 44 | } 45 | 46 | if k > maxKey { 47 | maxKey = k 48 | } 49 | } 50 | 51 | if _, ok := collect[maxKey]; ok { 52 | delete(collect, maxKey) 53 | } 54 | 55 | return collect 56 | } 57 | 58 | // Constructs the list of life extents to be saved from objects subjected to the GC. 
59 | func (b *bs3) getCompleteWriteList(keys map[int64]struct{}, stepSize int64) []mapproxy.ExtentWithObjectPart { 60 | completeWriteList := make([]mapproxy.ExtentWithObjectPart, 0, 128) 61 | 62 | sectors := config.Cfg.Size / int64(config.Cfg.BlockSize) 63 | 64 | for i := int64(0); i < sectors; i += stepSize { 65 | ci := b.extentMapProxy.ExtentsInObjects(int64(i), stepSize, keys) 66 | 67 | if len(ci) == 0 { 68 | continue 69 | } 70 | 71 | completeWriteList = append(completeWriteList, ci...) 72 | 73 | } 74 | 75 | return completeWriteList 76 | } 77 | 78 | // Removes currently downloaded objects from the list of dead objects. 79 | func (b *bs3) filterDownloadingObjects(deadObjects map[int64]struct{}) { 80 | b.gcData.reflock.Lock() 81 | defer b.gcData.reflock.Unlock() 82 | 83 | for k, v := range b.gcData.refcounter { 84 | if v == 0 { 85 | delete(b.gcData.refcounter, k) 86 | } else { 87 | _, ok := deadObjects[k] 88 | if ok { 89 | delete(deadObjects, k) 90 | } 91 | } 92 | } 93 | } 94 | 95 | // Runs threshold GC. It makes all objects with live data ratio under the 96 | // threshold dead by copying their live data into new object. These objects are 97 | // deleted during the regular dead GC run. 98 | func (b *bs3) gcThreshold(stepSize int64, threshHold float64) { 99 | liveObjects := b.extentMapProxy.ObjectsUtilization() 100 | keysToCollect := b.filterKeysToCollect(liveObjects, threshHold) 101 | completeWritelist := b.getCompleteWriteList(keysToCollect, stepSize) 102 | objects, extents := b.composeObjects(completeWritelist) 103 | 104 | for i := range objects { 105 | key := key.Next() 106 | 107 | err := b.objectStoreProxy.Upload(key, objects[i], false) 108 | if err != nil { 109 | log.Info().Err(err).Send() 110 | } 111 | 112 | b.extentMapProxy.Update(extents[i], int64(b.metadata_size/config.Cfg.BlockSize), key) 113 | } 114 | } 115 | 116 | // Removes unneeded dead objects from the map and upload empty object instead. 
117 | // The object cannot be deleted on the backend, because the sequence number 118 | // would be missing in the recovery process where we need continuous range of 119 | // keys. 120 | func (b *bs3) removeNonReferencedDeadObjects() { 121 | deadObjects := b.extentMapProxy.DeadObjects() 122 | b.filterDownloadingObjects(deadObjects) 123 | for k := range deadObjects { 124 | err := b.objectStoreProxy.Upload(k, []byte{}, false) 125 | if err != nil { 126 | log.Info().Err(err).Send() 127 | } 128 | } 129 | b.extentMapProxy.DeleteDeadObjects(deadObjects) 130 | } 131 | 132 | // Register SIGUSR1 as a trigger for threshold GC. 133 | func (b *bs3) registerSigUSR1Handler() { 134 | gcChan := make(chan os.Signal, 1) 135 | signal.Notify(gcChan, syscall.SIGUSR1) 136 | 137 | go func() { 138 | for range gcChan { 139 | log.Info().Msgf("Threshold GC started with threshold %1.2f.", config.Cfg.GC.LiveData) 140 | b.gcThreshold(config.Cfg.GC.Step, config.Cfg.GC.LiveData) 141 | log.Info().Msg("Threshold GC finished.") 142 | } 143 | }() 144 | } 145 | 146 | // Dead GC infinite loop. Highly efficient hence running regularly. 147 | func (b *bs3) gcDead() { 148 | for { 149 | time.Sleep(time.Duration(config.Cfg.GC.Wait) * time.Second) 150 | 151 | log.Trace().Msg("Dead GC started.") 152 | b.removeNonReferencedDeadObjects() 153 | log.Trace().Msg("Dead GC finished.") 154 | } 155 | } 156 | 157 | // Stores raw values of individual write into metadata part of the object. 
158 | func writeHeader(metadataFrontier int, g mapproxy.ExtentWithObjectPart, object []byte) { 159 | binary.LittleEndian.PutUint64(object[metadataFrontier:], uint64(g.ObjectPart.Sector)) 160 | metadataFrontier += 8 161 | 162 | binary.LittleEndian.PutUint64(object[metadataFrontier:], uint64(g.Extent.Length)) 163 | metadataFrontier += 8 164 | 165 | binary.LittleEndian.PutUint64(object[metadataFrontier:], uint64(g.Extent.SeqNo)) 166 | metadataFrontier += 8 167 | 168 | binary.LittleEndian.PutUint64(object[metadataFrontier:], uint64(g.Extent.Flag)) 169 | metadataFrontier += 8 170 | } 171 | 172 | // Traverse the list of all extents which are going to be copied into new fresh 173 | // object(s). It downloads necessary parts and constructs new objects for the 174 | // complete list. All objects are then uploaded and map updated. 175 | func (b *bs3) composeObjects(writeList []mapproxy.ExtentWithObjectPart) ([][]byte, [][]mapproxy.Extent) { 176 | var wg sync.WaitGroup 177 | 178 | metadataFrontier := 0 179 | dataFrontier := b.metadata_size 180 | 181 | objects := make([][]byte, 0, typicalNewObjectsPerGC) 182 | extents := make([][]mapproxy.Extent, 0, typicalNewObjectsPerGC) 183 | 184 | object := make([]byte, config.Cfg.Write.ChunkSize) 185 | currentObjectExtents := make([]mapproxy.Extent, 0, typicalExtentsPerGCObject) 186 | 187 | for _, g := range writeList { 188 | if uint64(dataFrontier)+uint64(g.Extent.Length)*uint64(config.Cfg.BlockSize) > uint64(config.Cfg.Write.ChunkSize) { 189 | objects = append(objects, object) 190 | extents = append(extents, currentObjectExtents) 191 | 192 | object = make([]byte, config.Cfg.Write.ChunkSize) 193 | currentObjectExtents = make([]mapproxy.Extent, 0, typicalExtentsPerGCObject) 194 | 195 | metadataFrontier = 0 196 | dataFrontier = b.metadata_size 197 | } 198 | 199 | writeHeader(metadataFrontier, g, object) 200 | metadataFrontier += b.write_item_size 201 | 202 | data := object[dataFrontier : 
int64(dataFrontier)+g.Extent.Length*int64(config.Cfg.BlockSize)] 203 | wg.Add(1) 204 | go func(g mapproxy.ExtentWithObjectPart) { 205 | defer wg.Done() 206 | err := b.objectStoreProxy.Download(g.ObjectPart.Key, data, g.Extent.Sector*int64(config.Cfg.BlockSize), true) 207 | if err != nil { 208 | log.Info().Err(err).Send() 209 | } 210 | }(g) 211 | 212 | extent := mapproxy.Extent{ 213 | Sector: g.ObjectPart.Sector, 214 | Length: g.Extent.Length, 215 | SeqNo: g.Extent.SeqNo, 216 | Flag: g.Extent.Flag, 217 | } 218 | 219 | currentObjectExtents = append(currentObjectExtents, extent) 220 | dataFrontier += int(g.Extent.Length) * config.Cfg.BlockSize 221 | } 222 | 223 | if len(currentObjectExtents) > 0 { 224 | objects = append(objects, object) 225 | extents = append(extents, currentObjectExtents) 226 | } 227 | 228 | wg.Wait() 229 | 230 | return objects, extents 231 | } 232 | -------------------------------------------------------------------------------- /internal/bs3/key/key.go: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2021 Vojtech Aschenbrenner 2 | 3 | // Package for synchronized access to the object key counter. 4 | package key 5 | 6 | import ( 7 | "sync" 8 | ) 9 | 10 | var ( 11 | key int64 12 | mutex sync.Mutex 13 | ) 14 | 15 | // Returns value of currently unassigned key. It is forbidden to use this key 16 | // for creating a new object withou calling Next() function. I.e. this key can 17 | // be used for the next object. 18 | func Current() int64 { 19 | mutex.Lock() 20 | defer mutex.Unlock() 21 | 22 | return key 23 | } 24 | 25 | // Returns value of currently unassigned key and increments, hence the key 26 | // variable contains unassigned key again.. I.e. this key can be used for the 27 | // next object. 28 | func Next() int64 { 29 | mutex.Lock() 30 | defer mutex.Unlock() 31 | 32 | tmp := key 33 | key++ 34 | 35 | return tmp 36 | } 37 | 38 | // Replaces the value of the next unassigned key. 
39 | func Replace(newKey int64) { 40 | mutex.Lock() 41 | defer mutex.Unlock() 42 | 43 | key = newKey 44 | } 45 | -------------------------------------------------------------------------------- /internal/bs3/mapproxy/mapproxy.go: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2021 Vojtech Aschenbrenner 2 | 3 | // Mapproxy package is a proxy for structs with ExtentMapper interface. It 4 | // serializes and prioritizes requests coming to the ExtentMapper and also 5 | // improves cache locality since all operations are done by the same go 6 | // routine. 7 | package mapproxy 8 | 9 | import ( 10 | "time" 11 | ) 12 | 13 | const ( 14 | NotMappedKey = -1 15 | ) 16 | 17 | // Provides mapping from logical extents presented in the system to the 18 | // potentionaly mutliple extents in the backend storage. Furthermore it has to 19 | // be provide multiple operations related to garbage collection and map 20 | // restoration. 21 | type ExtentMapper interface { 22 | Update(extents []Extent, startOfDataSectors, key int64) 23 | Lookup(sector, length int64) []ObjectPart 24 | FindExtentsWithKeys(sector, length int64, keys map[int64]struct{}) []ExtentWithObjectPart 25 | DeleteFromDeadObjects(deadObjects map[int64]struct{}) 26 | DeleteFromUtilization(keys map[int64]struct{}) 27 | GetMaxKey() int64 28 | ObjectsUtilization() map[int64]int64 29 | DeadObjects() map[int64]struct{} 30 | DeserializeAndReturnNextKey(buf []byte) int64 31 | Serialize() []byte 32 | } 33 | 34 | // Proxy to the ExtentMapper. It serializes and prioritizes requests comming to 35 | // the extent map and also improves cache locality since the map is always 36 | // traversed by the same thread. 37 | type ExtentMapProxy struct { 38 | Instance ExtentMapper 39 | 40 | // Timeout after which low priority request can be handled. 41 | idleTimeout time.Duration 42 | 43 | // Channels for internal communication specific to one type of request. 
44 | updateChan chan updateRequest 45 | lookupChan chan lookupRequest 46 | keyedExtentsChan chan keyedExtentsRequest 47 | 48 | // General low priority channel used for multiple types of requests. 49 | lockChan chan lockRequest 50 | } 51 | 52 | // Mapping from the logical extent to the extent in the object. 53 | type ExtentWithObjectPart struct { 54 | Extent Extent 55 | ObjectPart ObjectPart 56 | } 57 | 58 | // Logical extent representation representing the system view. 59 | type Extent struct { 60 | // Beginnig of the extent. 61 | Sector int64 62 | 63 | // Length of the extent. Extent is continuous. 64 | Length int64 65 | 66 | // Sequential number of write which wrote this extent 67 | SeqNo int64 68 | 69 | // Reserved for future usage. 70 | Flag int64 71 | } 72 | 73 | // Object part is extent in the object. 74 | type ObjectPart struct { 75 | // First sector of the extent. 76 | Sector int64 77 | 78 | // Length of the extent. Extent is continuous. 79 | Length int64 80 | 81 | // Object where the extent is located. 82 | Key int64 83 | } 84 | 85 | // Returns proxy which can be directly used. It spawns one worker which handles 86 | // all serialized and prioritized requests. 87 | func New(instance ExtentMapper, idleTimeout time.Duration) ExtentMapProxy { 88 | updateChan := make(chan updateRequest) 89 | lookupChan := make(chan lookupRequest) 90 | keyedExtentsChan := make(chan keyedExtentsRequest) 91 | lockChan := make(chan lockRequest) 92 | 93 | m := ExtentMapProxy{ 94 | Instance: instance, 95 | idleTimeout: idleTimeout, 96 | updateChan: updateChan, 97 | lookupChan: lookupChan, 98 | keyedExtentsChan: keyedExtentsChan, 99 | lockChan: lockChan, 100 | } 101 | 102 | go m.worker() 103 | 104 | return m 105 | } 106 | 107 | // Updates all extents specified in extents. startOfDataSectors is the first 108 | // sector in the object with real data and key is the key of the object. 
109 | func (p *ExtentMapProxy) Update(extents []Extent, startOfDataSectors, key int64) { 110 | done := make(chan struct{}) 111 | p.updateChan <- updateRequest{extents, startOfDataSectors, key, done} 112 | <-done 113 | } 114 | 115 | // Finds all pieces from which the logical extent starting from sector with 116 | // length length can be reconstructed. 117 | func (p *ExtentMapProxy) Lookup(sector, length int64) []ObjectPart { 118 | reply := make(chan []ObjectPart) 119 | p.lookupChan <- lookupRequest{sector, length, reply} 120 | return <-reply 121 | } 122 | 123 | // Finds all extents which are stored in any of the objects with keys in keys. 124 | // Sector and length is the range of interest. 125 | func (p *ExtentMapProxy) ExtentsInObjects(sector, length int64, keys map[int64]struct{}) []ExtentWithObjectPart { 126 | reply := make(chan []ExtentWithObjectPart) 127 | p.keyedExtentsChan <- keyedExtentsRequest{sector, length, keys, reply} 128 | return <-reply 129 | } 130 | 131 | // Returns all dead objects. I.e. objects without any live data. 132 | func (p *ExtentMapProxy) DeadObjects() map[int64]struct{} { 133 | done := make(chan struct{}) 134 | p.lockChan <- lockRequest{done} 135 | tmp := p.Instance.DeadObjects() 136 | <-done 137 | 138 | return tmp 139 | } 140 | 141 | // Returns all objects utilization. I.e. number of non-dead sectors in each 142 | // non-dead object. 143 | func (p *ExtentMapProxy) ObjectsUtilization() map[int64]int64 { 144 | done := make(chan struct{}) 145 | p.lockChan <- lockRequest{done} 146 | tmp := p.Instance.ObjectsUtilization() 147 | <-done 148 | 149 | return tmp 150 | } 151 | 152 | // Returns highest object key contained in the map. 153 | func (p *ExtentMapProxy) GetMaxKey() int64 { 154 | done := make(chan struct{}) 155 | p.lockChan <- lockRequest{done} 156 | tmp := p.Instance.GetMaxKey() 157 | <-done 158 | 159 | return tmp 160 | 161 | } 162 | 163 | // Deletes all provided keys from object utilization list. 
164 | func (p *ExtentMapProxy) DeleteFromUtilization(keys map[int64]struct{}) { 165 | done := make(chan struct{}) 166 | p.lockChan <- lockRequest{done} 167 | defer func() { 168 | <-done 169 | }() 170 | 171 | p.Instance.DeleteFromUtilization(keys) 172 | } 173 | 174 | // Deletes all dead objects from dead objects list. 175 | func (p *ExtentMapProxy) DeleteDeadObjects(deadObjects map[int64]struct{}) { 176 | done := make(chan struct{}) 177 | p.lockChan <- lockRequest{done} 178 | defer func() { 179 | <-done 180 | }() 181 | 182 | p.Instance.DeleteFromDeadObjects(deadObjects) 183 | } 184 | 185 | type updateRequest struct { 186 | extents []Extent 187 | startOfDataSectors int64 188 | key int64 189 | done chan struct{} 190 | } 191 | 192 | // Internal request structures just for wrapping the function calls into the 193 | // channel communication. 194 | 195 | type lookupRequest struct { 196 | sector int64 197 | length int64 198 | reply chan []ObjectPart 199 | } 200 | 201 | type keyedExtentsRequest struct { 202 | sector int64 203 | length int64 204 | keys map[int64]struct{} 205 | reply chan<- []ExtentWithObjectPart 206 | } 207 | 208 | type lockRequest struct { 209 | done chan struct{} 210 | } 211 | 212 | // Worker is doing prioritization and serialization of the requests. Updates 213 | // and lookups into the map have highest priority. All other request are low 214 | // priority. 
215 | func (p *ExtentMapProxy) worker() { 216 | for { 217 | select { 218 | case u := <-p.updateChan: 219 | p.update(u) 220 | 221 | case l := <-p.lookupChan: 222 | p.lookup(l) 223 | 224 | //case <-time.NewTicker(m.idleTimeout).C: 225 | default: 226 | select { 227 | case u := <-p.updateChan: 228 | p.update(u) 229 | 230 | case l := <-p.lookupChan: 231 | p.lookup(l) 232 | 233 | case e := <-p.keyedExtentsChan: 234 | p.findExtensWithKeys(e) 235 | 236 | case l := <-p.lockChan: 237 | l.done <- struct{}{} 238 | } 239 | } 240 | } 241 | } 242 | 243 | func (p *ExtentMapProxy) update(r updateRequest) { 244 | p.Instance.Update(r.extents, r.startOfDataSectors, r.key) 245 | r.done <- struct{}{} 246 | } 247 | 248 | func (p *ExtentMapProxy) lookup(r lookupRequest) { 249 | r.reply <- p.Instance.Lookup(r.sector, r.length) 250 | } 251 | 252 | func (p *ExtentMapProxy) findExtensWithKeys(r keyedExtentsRequest) { 253 | r.reply <- p.Instance.FindExtentsWithKeys(r.sector, r.length, r.keys) 254 | } 255 | -------------------------------------------------------------------------------- /internal/bs3/mapproxy/sectormap/sectormap.go: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2021 Vojtech Aschenbrenner 2 | 3 | // Sectormap package provides implementation of ExtentMapper interface. It 4 | // implements high efficient mapping with sector granularity. More details are 5 | // in the SectorMap struct description. 6 | package sectormap 7 | 8 | import ( 9 | "bytes" 10 | "encoding/gob" 11 | 12 | "github.com/asch/bs3/internal/bs3/mapproxy" 13 | ) 14 | 15 | const ( 16 | // How many objects parts is the typical result for one extent lookup. 17 | // This is just for initial allocation of the returned array. In the 18 | // worst case reallocation happens. 19 | typicalObjectPartsPerLookup = 64 20 | 21 | notMappedKey = -1 22 | ) 23 | 24 | // Description of the sector. 
It provides information about corresponding 25 | // sector in the object and object identification. 26 | type SectorMetadata struct { 27 | // Sector in the object. 28 | Sector int64 29 | 30 | // Key of the object. 31 | Key int64 32 | 33 | // Sequential number of the last write to this sector. 34 | SeqNo int64 35 | 36 | // Reserved for future usage. 37 | Flag int64 38 | } 39 | 40 | // Implementation of the ExtentMapper interface hence serving as and extent map. This is high 41 | // efficient mapping of individual sectors stored in an continuous array. The obvious advantage is 42 | // speed, where linear scanning of array is incredibly fast operation on modern CPUs. The second 43 | // advantage is simplicity. The disadvantage can be that it consumes still the same amount of 44 | // memory, no matter how used the device is. However the worst case memory usage is the best 45 | // possible because we don't store any additional data like in some more complex data structures 46 | // like trees. 47 | // 48 | // Nevertheless if the memory usage is a problem, slightly raising the sector size helps 49 | // tremendously. 4k sectors are norm today and if we have 1TB block device the map consumes 50 | // 1TB/4k*32 = 8GB. With 8k sectors it is just 4GB. This can be further reduced by shrinking data 51 | // types in SectorMetadata structure from int64 which is an overkill for most of them. 52 | // 53 | // This structure is serialized by gobs hence it has to be exported and all its attributes as well. 54 | type SectorMap struct { 55 | Sectors []SectorMetadata 56 | ObjUtilizations map[int64]int64 57 | DeadObjs map[int64]struct{} 58 | } 59 | 60 | // Returns new instance of the sector map. The map should not be used directly because it does not 61 | // support concurrent access. 
62 | func New(length int64) *SectorMap { 63 | sectors := make([]SectorMetadata, length) 64 | objectUtilization := make(map[int64]int64) 65 | deadObjects := make(map[int64]struct{}) 66 | 67 | for i := range sectors { 68 | sectors[i].Key = notMappedKey 69 | } 70 | 71 | s := SectorMap{ 72 | Sectors: sectors, 73 | ObjUtilizations: objectUtilization, 74 | DeadObjs: deadObjects, 75 | } 76 | 77 | return &s 78 | } 79 | 80 | // Updates sectors in the map with new values from extents. startOfDataSectors 81 | // is the first sector with data in the object and key is the key of the 82 | // object. 83 | func (m *SectorMap) Update(extents []mapproxy.Extent, startOfDataSectors, key int64) { 84 | m.ObjUtilizations[key] = 0 85 | 86 | for _, e := range extents { 87 | m.updateExtent(e, startOfDataSectors, key) 88 | startOfDataSectors += e.Length 89 | } 90 | 91 | // Because of GC we can add object which will never update the map 92 | // because all write records are old 93 | if m.ObjUtilizations[key] == 0 { 94 | delete(m.ObjUtilizations, key) 95 | m.DeadObjs[key] = struct{}{} 96 | } 97 | } 98 | 99 | // Updates the information about objects utilizations for given sector. 100 | func (m *SectorMap) updateUtilization(key int64, s *SectorMetadata) { 101 | // Increment cannot be done at once because GC can 102 | // introduce object with writes with lower seqNo 103 | m.ObjUtilizations[key]++ 104 | if s.Key != notMappedKey { 105 | m.ObjUtilizations[s.Key]-- 106 | if m.ObjUtilizations[s.Key] == 0 { 107 | delete(m.ObjUtilizations, s.Key) 108 | m.DeadObjs[s.Key] = struct{}{} 109 | } 110 | } 111 | } 112 | 113 | // Update one sector. 114 | func (m *SectorMap) updateSector(key int64, s *SectorMetadata, targetSector int64, e mapproxy.Extent) { 115 | m.updateUtilization(key, s) 116 | 117 | s.Sector = targetSector 118 | s.Key = key 119 | s.SeqNo = e.SeqNo 120 | s.Flag = e.Flag 121 | } 122 | 123 | // Updates an extent. 
It checks whether the write is actually newer than write 124 | // already in the map. Like this we always keep the map consistent. 125 | func (m *SectorMap) updateExtent(e mapproxy.Extent, startOfDataSectors, key int64) { 126 | targetSector := startOfDataSectors 127 | for i := e.Sector; i < e.Sector+e.Length; i++ { 128 | s := &m.Sectors[i] 129 | if s.SeqNo <= e.SeqNo { // Equality because of GC 130 | m.updateSector(key, s, targetSector, e) 131 | } 132 | targetSector++ 133 | } 134 | } 135 | 136 | // Returns longest possible extent in the object starting at startSector with 137 | // maximal length length. This means that the extent has the same key and 138 | // sequential number. 139 | func (m *SectorMap) getExtent(startSector, length uint64) mapproxy.Extent { 140 | s := m.Sectors[startSector] 141 | e := mapproxy.Extent{ 142 | Sector: s.Sector, 143 | Length: 1, 144 | SeqNo: s.SeqNo, 145 | Flag: s.Flag, 146 | } 147 | 148 | for i := startSector + 1; ; i++ { 149 | if i >= uint64(len(m.Sectors)) || 150 | i >= startSector+length || 151 | m.Sectors[i].Key != m.Sectors[i-1].Key || 152 | m.Sectors[i].SeqNo != e.SeqNo || 153 | m.Sectors[i-1].Sector != m.Sectors[i].Sector-1 { 154 | 155 | break 156 | } 157 | 158 | e.Length++ 159 | } 160 | 161 | return e 162 | } 163 | 164 | // Returns all ObjectParts from which extent starting at sector with length 165 | // length can be reconstructed. 166 | func (m *SectorMap) Lookup(sector, length int64) []mapproxy.ObjectPart { 167 | parts := make([]mapproxy.ObjectPart, 0, typicalObjectPartsPerLookup) 168 | s := m.Sectors[sector].Sector 169 | l := int64(1) 170 | for i := int64(1); i < length; i++ { 171 | id := sector + i 172 | // The next sector is not from the same extent. Store part into 173 | // the returned value and begin new extent. 
174 | if (m.Sectors[id].Key != m.Sectors[id-1].Key || 175 | m.Sectors[id].Sector != m.Sectors[id-1].Sector+1) && 176 | (m.Sectors[id].Key != -1 || m.Sectors[id-1].Key != notMappedKey) { 177 | 178 | parts = append(parts, mapproxy.ObjectPart{ 179 | Sector: s, 180 | Length: l, 181 | Key: m.Sectors[id-1].Key, 182 | }) 183 | s = m.Sectors[id].Sector 184 | l = 1 185 | } else { 186 | l++ 187 | } 188 | } 189 | parts = append(parts, mapproxy.ObjectPart{ 190 | Sector: s, 191 | Length: l, 192 | Key: m.Sectors[sector+length-1].Key, 193 | }) 194 | return parts 195 | } 196 | 197 | // Returns all extents and objectparts starting from sector with length length 198 | // that are stored in any of keys in keys. 199 | func (m *SectorMap) FindExtentsWithKeys(sector, length int64, keys map[int64]struct{}) []mapproxy.ExtentWithObjectPart { 200 | ci := make([]mapproxy.ExtentWithObjectPart, 0, typicalObjectPartsPerLookup) 201 | 202 | for i := sector; i < sector+length && i < int64(len(m.Sectors)); { 203 | key := m.Sectors[i].Key 204 | _, ok := keys[key] 205 | extent := m.getExtent(uint64(i), uint64(sector+length-i)) 206 | if ok { 207 | op := mapproxy.ObjectPart{ 208 | Sector: i, 209 | Length: 0, 210 | Key: key, 211 | } 212 | ci = append(ci, mapproxy.ExtentWithObjectPart{ 213 | Extent: extent, 214 | ObjectPart: op, 215 | }) 216 | } 217 | i += extent.Length 218 | } 219 | 220 | return ci 221 | } 222 | 223 | // Returns copy of deadObjects. These are objects with no valid data which can 224 | // be deleted. 225 | func (m *SectorMap) DeadObjects() map[int64]struct{} { 226 | deadObjects := make(map[int64]struct{}) 227 | 228 | for k := range m.DeadObjs { 229 | deadObjects[k] = struct{}{} 230 | } 231 | 232 | return deadObjects 233 | } 234 | 235 | // Returns the highest key from the map. 
236 | func (m *SectorMap) GetMaxKey() int64 { 237 | var maxKey int64 238 | for k := range m.ObjUtilizations { 239 | if k > maxKey { 240 | maxKey = k 241 | } 242 | } 243 | 244 | return maxKey 245 | } 246 | 247 | // Return copy of the structure representing the object utilization. 248 | // Utilization is number of non-dead sectors. 249 | func (m *SectorMap) ObjectsUtilization() map[int64]int64 { 250 | objectUtilization := make(map[int64]int64) 251 | 252 | for k, v := range m.ObjUtilizations { 253 | objectUtilization[k] = v 254 | } 255 | 256 | return objectUtilization 257 | } 258 | 259 | // Returns serialized version of the map with go gobs. 260 | func (m *SectorMap) Serialize() []byte { 261 | var buf bytes.Buffer 262 | 263 | encoder := gob.NewEncoder(&buf) 264 | encoder.Encode(m) 265 | 266 | return buf.Bytes() 267 | } 268 | 269 | // Deserialized map from buf which was previously serialized by Serialize(). It 270 | // restored map and structures representing object utilization and dead 271 | // objects. During deserialization all sequential numbers are zeroed because 272 | // most they are not needed and most probably BUSE starts from 0 since it was 273 | // restarted. The map supports device size change. 274 | func (m *SectorMap) DeserializeAndReturnNextKey(buf []byte) int64 { 275 | // Size of the allocated map 276 | intendedSize := len(m.Sectors) 277 | 278 | // 1) In case of smaller checkpointed map, i.e. we enlarged the device, 279 | // the map would be shrinked and we need to resize it to its 280 | // intended size. 281 | // 2) In case of larger checkpointed map, i.e. we shrinked the device, 282 | // the map would be enlarged and we need to resize it to its inteded size. 283 | decoder := gob.NewDecoder(bytes.NewReader(buf)) 284 | decoder.Decode(m) 285 | 286 | if intendedSize < len(m.Sectors) { 287 | // Create new map with smaller size and copy the intended range 288 | // to it. Then replace the the map. 
We could just change the 289 | // len of the map, but then the memory would be still occupied 290 | // like in the case of larger map. 291 | sectors := make([]SectorMetadata, intendedSize) 292 | copy(sectors, m.Sectors) 293 | m.Sectors = sectors 294 | } else { 295 | // We already have allocated large map, but we decoded smaller 296 | // one and it the len was set according to the decoded 297 | // (smaller) map. We just change len to its full size. 298 | m.Sectors = m.Sectors[:cap(m.Sectors)] 299 | } 300 | 301 | var maxKey int64 = notMappedKey 302 | for _, s := range m.Sectors { 303 | if s.Key > maxKey { 304 | maxKey = s.Key 305 | } 306 | } 307 | 308 | for i := range m.Sectors { 309 | m.Sectors[i].SeqNo = 0 310 | } 311 | 312 | return maxKey + 1 313 | } 314 | 315 | // Deletes objects with keys from object utilizations. 316 | func (m *SectorMap) DeleteFromUtilization(keys map[int64]struct{}) { 317 | for k := range keys { 318 | delete(m.ObjUtilizations, k) 319 | } 320 | } 321 | 322 | // Deletes objects with keys from deadObjects from dead objects. 323 | func (m *SectorMap) DeleteFromDeadObjects(deadObjects map[int64]struct{}) { 324 | for k := range deadObjects { 325 | _, ok := m.DeadObjs[k] 326 | if ok { 327 | delete(m.DeadObjs, k) 328 | } 329 | } 330 | } 331 | -------------------------------------------------------------------------------- /internal/bs3/objproxy/objproxy.go: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2021 Vojtech Aschenbrenner 2 | 3 | // Package objproxy is a proxy for ObjectUploadDownloaderAt which performs 4 | // prioritization of various requests. 5 | package objproxy 6 | 7 | import ( 8 | "time" 9 | ) 10 | 11 | // Interface for s3 backend storage. Anything implementing this interface can 12 | // be used as a storage backend. 13 | type ObjectUploadDownloaderAt interface { 14 | // Uploads data in buf under the key identifier. 
15 | Upload(key int64, buf []byte) error 16 | 17 | // Downloads data into buf starting from offset in the object 18 | // identified by key. The length of buf is the legth of requested data. 19 | DownloadAt(key int64, buf []byte, offset int64) error 20 | 21 | // Returns size in bytes of object identified by key. Needed only for 22 | // garbage collection and extent map recovery. Otherwise can have empty 23 | // implementation. 24 | GetObjectSize(key int64) (int64, error) 25 | 26 | // Deletes object identified by key and all successive objects. Needed 27 | // only for extent map restoration. Otherwise can have empty 28 | // implementation. 29 | DeleteKeyAndSuccessors(key int64) error 30 | } 31 | 32 | // Proxy for the backend storage which prioritizes requests. Requests coming to 33 | // the priority channels are handled first. Like this requests from low 34 | // priority operations like garbage collection do not slow down normal 35 | // operation. 36 | type ObjectProxy struct { 37 | Instance ObjectUploadDownloaderAt 38 | 39 | // Number of go routines to spawn for handling upload requests and 40 | // download requests. 41 | uploaders int 42 | downloaders int 43 | 44 | // Timeout after which low priority request can be served. 45 | idleTimeout time.Duration 46 | 47 | // Internal channels. 48 | uploads chan request 49 | downloads chan request 50 | uploadsPrio chan request 51 | downloadsPrio chan request 52 | } 53 | 54 | // Request is internal structure for wrapping the communication into channels. 55 | type request struct { 56 | key int64 57 | data []byte 58 | offset int64 59 | done chan error 60 | } 61 | 62 | // Return new instance of the proxy which can be directly used. It immediately 63 | // spawns go routines for upload and download workers. 
64 | func New(storeInstance ObjectUploadDownloaderAt, uploaders, downloaders int, 65 | idleTimeout time.Duration) ObjectProxy { 66 | 67 | uploads := make(chan request) 68 | downloads := make(chan request) 69 | uploadsPrio := make(chan request) 70 | downloadsPrio := make(chan request) 71 | 72 | s := ObjectProxy{ 73 | Instance: storeInstance, 74 | uploaders: uploaders, 75 | downloaders: downloaders, 76 | idleTimeout: idleTimeout, 77 | uploads: uploads, 78 | downloads: downloads, 79 | uploadsPrio: uploadsPrio, 80 | downloadsPrio: downloadsPrio, 81 | } 82 | 83 | for i := 0; i < s.uploaders; i++ { 84 | go s.uploadWorker() 85 | } 86 | 87 | for i := 0; i < s.downloaders; i++ { 88 | go s.downloadWorker() 89 | } 90 | 91 | return s 92 | } 93 | 94 | // Proxy function for uploading the object with key. It selects the right 95 | // channel according to prio and waits for reply. 96 | func (p *ObjectProxy) Upload(key int64, body []byte, prio bool) error { 97 | c := p.uploads 98 | if prio { 99 | c = p.uploadsPrio 100 | } 101 | 102 | done := make(chan error) 103 | c <- request{key: key, data: body, done: done} 104 | return <-done 105 | } 106 | 107 | // Proxy function for downloading the object with key. It selects the right 108 | // channel according to prio and waits for reply. 109 | func (p *ObjectProxy) Download(key int64, chunk []byte, offset int64, prio bool) error { 110 | c := p.downloads 111 | if prio { 112 | c = p.downloadsPrio 113 | } 114 | 115 | done := make(chan error) 116 | c <- request{key, chunk, offset, done} 117 | return <-done 118 | } 119 | 120 | // Generic function for prioritization used by both, uploader and downloader workers. 
121 | func (p *ObjectProxy) receiveRequest(prio chan request, normal chan request) request { 122 | var r request 123 | 124 | select { 125 | case r = <-prio: 126 | //case <-time.NewTicker(p.idleTimeout).C: 127 | default: 128 | select { 129 | case r = <-prio: 130 | case r = <-normal: 131 | } 132 | } 133 | 134 | return r 135 | } 136 | 137 | // Upload worker just calls Upload() on the instance provided in New(). 138 | func (p *ObjectProxy) uploadWorker() { 139 | for { 140 | r := p.receiveRequest(p.uploadsPrio, p.uploads) 141 | err := p.Instance.Upload(r.key, r.data) 142 | r.done <- err 143 | } 144 | } 145 | 146 | // Upload worker just calls Download() on the instance provided in New(). 147 | func (p *ObjectProxy) downloadWorker() { 148 | for { 149 | r := p.receiveRequest(p.downloadsPrio, p.downloads) 150 | err := p.Instance.DownloadAt(r.key, r.data, r.offset) 151 | r.done <- err 152 | } 153 | } 154 | -------------------------------------------------------------------------------- /internal/bs3/objproxy/s3/s3.go: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2021 Vojtech Aschenbrenner 2 | 3 | // Package s3 implements wrapping functions to satisfy ObjectUploadDownloaderAt 4 | // interface. It uses aws api v1. 5 | package s3 6 | 7 | import ( 8 | "bytes" 9 | "fmt" 10 | "net" 11 | "net/http" 12 | "time" 13 | 14 | "github.com/aws/aws-sdk-go/aws" 15 | "github.com/aws/aws-sdk-go/aws/credentials" 16 | "github.com/aws/aws-sdk-go/aws/request" 17 | "github.com/aws/aws-sdk-go/aws/session" 18 | "github.com/aws/aws-sdk-go/service/s3" 19 | "github.com/aws/aws-sdk-go/service/s3/s3manager" 20 | "golang.org/x/net/http2" 21 | ) 22 | 23 | const ( 24 | // Format string for the object key. There should be no need to set 25 | // this differently, hence the constant. If you want to change it, keep 26 | // in mind that we rely on the continuous space of keys for prefix 27 | // consistecy as well as in the GC process. 
28 | // 29 | // Furthermore we split the key into halves and use the lower half of 30 | // bits as s3 prefix and upper half for the object key. This is to 31 | // prevent s3 rate limiting which is applied to objects with the same 32 | // prefix. 33 | keyFmt = "%08x/%08x" 34 | ) 35 | 36 | // Implementation of ObjectUploadDownloaderAt using AWS S3 as a backend. 37 | // Parameters of http connection are carefully tuned for the best performance 38 | // in the AWS environment. 39 | type S3 struct { 40 | uploader *s3manager.Uploader 41 | downloader *s3manager.Downloader 42 | client *s3.S3 43 | bucket string 44 | } 45 | 46 | // Options to use in New() function due to high number of parameters. There is 47 | // lower chance of ordering mistake with named parameters. 48 | type Options struct { 49 | Remote string 50 | Region string 51 | Bucket string 52 | AccessKey string 53 | SecretKey string 54 | PartSize int64 55 | } 56 | 57 | // Helper struct used for tuning the http connection. 58 | type httpClientSettings struct { 59 | connect time.Duration 60 | connKeepAlive time.Duration 61 | expectContinue time.Duration 62 | idleConn time.Duration 63 | maxAllIdleConns int 64 | maxHostIdleConns int 65 | responseHeader time.Duration 66 | tlsHandshake time.Duration 67 | } 68 | 69 | // Returns http client with configured parameters and added https2 support. 
70 | func newHTTPClientWithSettings(httpSettings httpClientSettings) *http.Client { 71 | tr := &http.Transport{ 72 | ResponseHeaderTimeout: httpSettings.responseHeader, 73 | Proxy: http.ProxyFromEnvironment, 74 | DialContext: (&net.Dialer{ 75 | KeepAlive: httpSettings.connKeepAlive, 76 | DualStack: true, 77 | Timeout: httpSettings.connect, 78 | }).DialContext, 79 | MaxIdleConns: httpSettings.maxAllIdleConns, 80 | IdleConnTimeout: httpSettings.idleConn, 81 | TLSHandshakeTimeout: httpSettings.tlsHandshake, 82 | MaxIdleConnsPerHost: httpSettings.maxHostIdleConns, 83 | ExpectContinueTimeout: httpSettings.expectContinue, 84 | } 85 | 86 | http2.ConfigureTransport(tr) 87 | 88 | return &http.Client{ 89 | Transport: tr, 90 | } 91 | } 92 | 93 | // Upload function implemented through s3 api. 94 | func (s *S3) Upload(key int64, buf []byte) error { 95 | _, err := s.uploader.Upload(&s3manager.UploadInput{ 96 | Bucket: aws.String(s.bucket), 97 | Key: aws.String(encode(key)), 98 | Body: bytes.NewReader(buf), 99 | }) 100 | 101 | return err 102 | } 103 | 104 | // GetObjectSize function implemented through s3 api. 105 | func (s *S3) GetObjectSize(key int64) (int64, error) { 106 | head, err := s.client.HeadObject(&s3.HeadObjectInput{ 107 | Bucket: aws.String(s.bucket), 108 | Key: aws.String(encode(key)), 109 | }) 110 | 111 | var size int64 112 | if err == nil { 113 | size = *head.ContentLength 114 | } 115 | 116 | return size, err 117 | } 118 | 119 | // DownloadAt function implemented through s3 api. 120 | func (s *S3) DownloadAt(key int64, buf []byte, offset int64) error { 121 | to := offset + int64(len(buf)) - 1 122 | rng := fmt.Sprintf("bytes=%d-%d", offset, to) 123 | b := aws.NewWriteAtBuffer(buf) 124 | 125 | _, err := s.downloader.Download(b, &s3.GetObjectInput{ 126 | Bucket: aws.String(s.bucket), 127 | Key: aws.String(encode(key)), 128 | Range: &rng, 129 | }) 130 | 131 | return err 132 | } 133 | 134 | // Delete function implemented through s3 api. 
135 | func (s *S3) Delete(key int64) error { 136 | _, err := s.client.DeleteObject(&s3.DeleteObjectInput{ 137 | Bucket: aws.String(s.bucket), 138 | Key: aws.String(encode(key)), 139 | }) 140 | 141 | return err 142 | } 143 | 144 | func New(o Options) (*S3, error) { 145 | s := new(S3) 146 | s.bucket = o.Bucket 147 | 148 | // For the best possible performance (throughput close to 10GB/s) it 149 | // should be tuned according to the object backend. 150 | // Following settings are recommended by AWS for usage in their 151 | // network. 152 | httpClient := newHTTPClientWithSettings(httpClientSettings{ 153 | connect: 5 * time.Second, 154 | expectContinue: 1 * time.Second, 155 | idleConn: 90 * time.Second, 156 | connKeepAlive: 30 * time.Second, 157 | maxAllIdleConns: 100, 158 | maxHostIdleConns: 10, 159 | responseHeader: 5 * time.Second, 160 | tlsHandshake: 5 * time.Second, 161 | }) 162 | 163 | sess, err := session.NewSession(&aws.Config{ 164 | Endpoint: aws.String(o.Remote), 165 | Region: aws.String(o.Region), 166 | Credentials: credentials.NewStaticCredentials(o.AccessKey, o.SecretKey, ""), 167 | S3ForcePathStyle: aws.Bool(true), 168 | S3DisableContentMD5Validation: aws.Bool(true), 169 | HTTPClient: httpClient, 170 | }) 171 | 172 | if err != nil { 173 | return nil, err 174 | } 175 | 176 | s.client = s3.New(sess) 177 | s.uploader = s3manager.NewUploader(sess) 178 | s.downloader = s3manager.NewDownloader(sess) 179 | 180 | // Limiting the concurency of s3 library. We do not benefit from 181 | // multipart uploads/downloads because we have small objects. The only 182 | // exception is downloading/uploading the extent map during initial 183 | // recover or final map upload. This should be tuned if your map is 184 | // huge (= huge device) and you have fast network and don't want to 185 | // wait. 
186 | s.uploader.Concurrency = 1 187 | s3manager.WithUploaderRequestOptions(request.Option(func(r *request.Request) { 188 | r.HTTPRequest.Header.Add("X-Amz-Content-Sha256", "UNSIGNED-PAYLOAD") 189 | }))(s.uploader) 190 | s.downloader.Concurrency = 1 191 | 192 | err = s.makeBucketExist() 193 | 194 | return s, err 195 | } 196 | 197 | // Check whether bucket exist and if not, create it and wait until it appears. 198 | func (s *S3) makeBucketExist() error { 199 | _, err := s.client.HeadBucket(&s3.HeadBucketInput{Bucket: aws.String(s.bucket)}) 200 | 201 | if err != nil { 202 | _, err = s.client.CreateBucket(&s3.CreateBucketInput{ 203 | Bucket: aws.String(s.bucket)}) 204 | 205 | if err == nil { 206 | err = s.client.WaitUntilBucketExists(&s3.HeadBucketInput{ 207 | Bucket: aws.String(s.bucket)}) 208 | } 209 | } 210 | 211 | return err 212 | } 213 | 214 | // Delete object with key and all objects with higher keys. 215 | func (s *S3) DeleteKeyAndSuccessors(fromKey int64) error { 216 | err := s.client.ListObjectsV2Pages(&s3.ListObjectsV2Input{ 217 | Bucket: aws.String(s.bucket), 218 | }, func(page *s3.ListObjectsV2Output, last bool) bool { 219 | for _, o := range page.Contents { 220 | key := decode(*o.Key) 221 | if key >= fromKey { 222 | s.Delete(key) 223 | } 224 | } 225 | return true 226 | }) 227 | 228 | return err 229 | } 230 | 231 | // We split the key into halves and use the lower half of bits as s3 prefix and 232 | // upper half for the object key. This is to prevent s3 rate limiting which is 233 | // applied to objects with the same prefix. 
func encode(key int64) string {
	// Swap the halves: the low 32 bits become the s3 prefix (left part of
	// the formatted key) and the high 32 bits become the object part.
	left := (key >> 32) & 0xffffffff
	right := key & 0xffffffff

	return fmt.Sprintf(keyFmt, right, left)
}

// The inverse to encode().
//
// NOTE(review): the Sscanf error is deliberately ignored, so a malformed key
// decodes to 0 — confirm callers only pass keys produced by encode().
func decode(keyWithPrefix string) int64 {
	var prefix, key int64
	fmt.Sscanf(keyWithPrefix, keyFmt, &prefix, &key)

	k := (key << 32) + prefix

	return k
}
-------------------------------------------------------------------------------- /internal/config/config.go: --------------------------------------------------------------------------------
// Copyright (C) 2021 Vojtech Aschenbrenner

// Package config is a singleton and provides global access to the
// configuration values.
package config

import (
	"flag"
	"os"

	"github.com/ilyakaznacheev/cleanenv"
)

const (
	// Default config path. It does not need to exist, default values for all parameters will be
	// used instead.
	defaultConfig = "/etc/bs3/config.toml"
)

// Cfg is the global configuration singleton filled by Configure().
var Cfg Config

// Configuration structure for the program. We use toml format for file-based
// configuration and also all configuration options can be overridden by the
// environment variable specified in this structure.
type Config struct {
	// Path to the configuration file, set from the -c flag.
	ConfigPath string

	Null bool `toml:"null" env:"BS3_NULL" env-default:"false" env-description:"Use null backend, i.e. immediate acknowledge to read or write. For testing BUSE raw performance."`
	Major int `toml:"major" env:"BS3_MAJOR" env-default:"0" env-description:"Device major. Decimal part of /dev/buse%d."`
	Threads int `toml:"threads" env:"BS3_THREADS" env-default:"0" env-description:"Number of user-space threads for serving queues."`
	CPUsPerNode int `toml:"cpus_per_node" env:"BS3_CPUS_PER_NODE" env-default:"0" env-description:"Number of CPUs per one numa node."`
	Size int64 `toml:"size" env:"BS3_SIZE" env-default:"8" env-description:"Device size in GB."`
	BlockSize int `toml:"block_size" env:"BS3_BLOCKSIZE" env-default:"4096" env-description:"Block size."`
	IOMin int `toml:"io_min" env:"BS3_IO_MIN" env-default:"0" env-description:"Minimal IO."`
	IOOpt int `toml:"io_opt" env:"BS3_IO_OPT" env-default:"0" env-description:"Optimal IO."`
	Scheduler bool `toml:"scheduler" env:"BS3_SCHEDULER" env-default:"false" env-description:"Use block layer scheduler."`
	QueueDepth int `toml:"queue_depth" env:"BS3_QUEUEDEPTH" env-default:"128" env-description:"Device IO queue depth."`

	// S3 backend connection settings.
	S3 struct {
		Bucket string `toml:"bucket" env:"BS3_S3_BUCKET" env-description:"S3 Bucket name." env-default:"bs3"`
		Remote string `toml:"remote" env:"BS3_S3_REMOTE" env-description:"S3 Remote address. Empty string for AWS S3 endpoint." env-default:""`
		Region string `toml:"region" env:"BS3_S3_REGION" env-description:"S3 Region." env-default:"us-east-1"`
		AccessKey string `toml:"access_key" env:"BS3_S3_ACCESSKEY" env-description:"S3 Access Key." env-default:""`
		SecretKey string `toml:"secret_key" env:"BS3_S3_SECRETKEY" env-description:"S3 Secret Key." env-default:""`
		Uploaders int `toml:"uploaders" env:"BS3_S3_UPLOADERS" env-description:"S3 Max number of uploader threads." env-default:"16"`
		Downloaders int `toml:"downloaders" env:"BS3_S3_DOWNLOADERS" env-description:"S3 Max number of downloader threads." env-default:"16"`
	} `toml:"s3"`

	// Write path settings. Sizes are specified in MB and converted to
	// bytes by parse().
	Write struct {
		Durable bool `toml:"durable" env:"BS3_WRITE_DURABLE" env-description:"Flush semantics. True means durable, false means barrier only." env-default:"false"`
		BufSize int `toml:"shared_buffer_size" env:"BS3_WRITE_BUFSIZE" env-description:"Write shared memory size in MB." env-default:"32"`
		ChunkSize int `toml:"chunk_size" env:"BS3_WRITE_CHUNKSIZE" env-description:"Chunk size in MB." env-default:"4"`
		CollisionSize int `toml:"collision_chunk_size" env:"BS3_WRITE_COLSIZE" env-description:"Collision size in MB." env-default:"1"`
	} `toml:"write"`

	// Read path settings. Sizes are specified in MB and converted to
	// bytes by parse().
	Read struct {
		BufSize int `toml:"shared_buffer_size" env:"BS3_READ_BUFSIZE" env-description:"Read shared memory size in MB." env-default:"32"`
	} `toml:"read"`

	// Garbage collection settings.
	GC struct {
		Step int64 `toml:"step" env:"BS3_GC_STEP" env-description:"Step for traversing the extent map for living extents. In blocks." env-default:"1024"`
		LiveData float64 `toml:"live_data" env:"BS3_GC_LIVEDATA" env-description:"Live data ratio threshold for threshold GC. This is for the threshold GC which is triggered by the user or systemd timer." env-default:"0.3"`
		IdleTimeoutMs int64 `toml:"idle_timeout" env:"BS3_GC_IDLETIMEOUT" env-description:"Idle timeout for running GC requests. In ms." env-default:"200"`
		Wait int64 `toml:"wait" env:"BS3_GC_WAIT" env-description:"How many seconds wait before next dead GC round. This just for cleaning dead objects with minimal performance impact." env-default:"600"`
	} `toml:"gc"`

	// Logging settings.
	Log struct {
		Level int `toml:"level" env:"BS3_LOG_LEVEL" env-description:"Log level." env-default:"-1"`
		Pretty bool `toml:"pretty" env:"BS3_LOG_PRETTY" env-description:"Pretty logging." env-default:"true"`
	} `toml:"log"`

	SkipCheckpoint bool `toml:"skip_checkpoint" env:"BS3_SKIP" env-description:"Skip restoring from and creating checkpoint." env-default:"false"`
	Profiler bool `toml:"profiler" env:"BS3_PROFILER" env-description:"Enable golang web profiler." env-default:"false"`
	ProfilerPort int `toml:"profiler_port" env:"BS3_PROFILER_PORT" env-description:"Port to listen on." env-default:"6060"`
}

// Configure reads commandline flags and handles the configuration. The
// configuration file has the lower priority and the environment variables
// have the highest priority. It is perfectly fine to use just one of these
// or to combine them.
func Configure() error {
	flagSetup()
	err := parse()

	return err
}

// Parse the configuration file and read the environment variables. After
// that it does some values postprocessing and fills the Cfg structure.
func parse() error {
	// If the config file cannot be read, fall back to environment
	// variables only.
	if err := cleanenv.ReadConfig(Cfg.ConfigPath, &Cfg); err != nil {
		if err := cleanenv.ReadEnv(&Cfg); err != nil {
			return err
		}
	}

	// Convert user-friendly units to bytes: the device size is given in
	// GB, the buffer and chunk sizes in MB.
	Cfg.Size *= 1024 * 1024 * 1024
	Cfg.Write.BufSize *= 1024 * 1024
	Cfg.Write.ChunkSize *= 1024 * 1024
	Cfg.Write.CollisionSize *= 1024 * 1024
	Cfg.Read.BufSize *= 1024 * 1024

	// Only 512 and 4096 are valid block sizes; anything else silently
	// falls back to 4096.
	if Cfg.BlockSize != 512 {
		Cfg.BlockSize = 4096
	}

	// Zero means "use the block size" for both minimal and optimal IO.
	if Cfg.IOMin == 0 {
		Cfg.IOMin = Cfg.BlockSize
	}

	if Cfg.IOOpt == 0 {
		Cfg.IOOpt = Cfg.BlockSize
	}

	return nil
}

// Handle program flags.
func flagSetup() {
	f := flag.NewFlagSet("bs3", flag.ExitOnError)
	f.StringVar(&Cfg.ConfigPath, "c", defaultConfig, "Path to configuration file")
	f.Usage = cleanenv.FUsage(f.Output(), &Cfg, nil, f.Usage)
	f.Parse(os.Args[1:])
}
-------------------------------------------------------------------------------- /internal/null/null.go: --------------------------------------------------------------------------------
// Copyright (C) 2021 Vojtech Aschenbrenner

// Null package does nothing but correctly.
package null

// Null implementation of BuseReadWriter. Useful for measuring performance of
// underlying BUSE and buse library. Otherwise useless. Is contained in the
// same module to avoid duplication in BUSE code and configuration. It can also
// serve as a template for new BUSE device implementation since it is an
// implementation of BuseReadWriter interface.
type null struct {
}

// NewNull returns a new no-op BuseReadWriter implementation.
func NewNull() *null {
	return &null{}
}

// BuseWrite acknowledges the writes immediately without doing anything.
func (n *null) BuseWrite(writes int64, chunk []byte) error {
	return nil
}

// BuseRead acknowledges the read immediately without touching the chunk.
func (n *null) BuseRead(sector, length int64, chunk []byte) error {
	return nil
}

// BusePreRun is a no-op implementation of the BuseReadWriter hook.
func (n *null) BusePreRun() {
}

// BusePostRemove is a no-op implementation of the BuseReadWriter hook.
func (n *null) BusePostRemove() {
}
-------------------------------------------------------------------------------- /main.go: --------------------------------------------------------------------------------
// Copyright (C) 2021 Vojtech Aschenbrenner

// bs3 is a userspace daemon using BUSE for creating a block device and S3
// protocol to communicate with object backend. It is designed for easy
// extension of all the important parts. Hence the S3 protocol can be easily
// replaced by RADOS or any other protocol.
//
// Project structure is following:
//
// - internal contains all packages used by this program. The name "internal"
// is reserved by go compiler and disallows its imports from different
// projects. Since we don't provide any reusable packages, we use internal
// directory.
//
// - internal/bs3 contains all packages related only to the bs3 implementation.
// See the package descriptions in the source code for more details.
//
// - internal/null contains trivial implementation of block device which does
// nothing but correctly. It can be used for benchmarking underlying buse
// library and kernel module. The null implementation is part of bs3 because it
// shares configuration and makes benchmarking easier and without code
// duplication.
//
// - internal/config contains configuration package which is common for both,
// bs3 and null implementations.
package main

import (
	"fmt"
	"net/http"
	_ "net/http/pprof"
	"os"
	"os/signal"
	"syscall"

	"github.com/rs/zerolog"
	"github.com/rs/zerolog/log"

	"github.com/asch/bs3/internal/bs3"
	"github.com/asch/bs3/internal/config"
	"github.com/asch/bs3/internal/null"
	"github.com/asch/buse/lib/go/buse"
)

// Parse configuration from file and environment variables, create a
// BuseReadWriter and create a new buse device with it. The device runs until
// it is signaled by SIGINT or SIGTERM to gracefully finish.
func main() {
	err := config.Configure()
	if err != nil {
		log.Panic().Err(err).Send()
	}

	loggerSetup(config.Cfg.Log.Pretty, config.Cfg.Log.Level)

	log.Info().Msgf("Configuration for block device buse%d loaded from %s",
		config.Cfg.Major, config.Cfg.ConfigPath)

	if config.Cfg.Profiler {
		log.Info().Msg("Running profiler.")
		runProfiler(config.Cfg.ProfilerPort)
	}

	buseReadWriter, err := getBuseReadWriter(config.Cfg.Null)
	if err != nil {
		log.Panic().Err(err).Send()
	}

	buse, err := buse.New(buseReadWriter, buse.Options{
		Durable:        config.Cfg.Write.Durable,
		WriteChunkSize: int64(config.Cfg.Write.ChunkSize),
		BlockSize:      int64(config.Cfg.BlockSize),
		IOMin:          int64(config.Cfg.IOMin),
		Threads:        int(config.Cfg.Threads),
		Major:          int64(config.Cfg.Major),
		WriteShmSize:   int64(config.Cfg.Write.BufSize),
		ReadShmSize:    int64(config.Cfg.Read.BufSize),
		Size:           int64(config.Cfg.Size),
		CollisionArea:  int64(config.Cfg.Write.CollisionSize),
		QueueDepth:     int64(config.Cfg.QueueDepth),
		Scheduler:      config.Cfg.Scheduler,
		CPUsPerNode:    config.Cfg.CPUsPerNode,
	})

	if err != nil {
		log.Panic().Msg(err.Error())
	}
	log.Info().Msgf("Block device buse%d registered.", config.Cfg.Major)

	registerSigHandlers(buse)

	// Run blocks until the device is stopped (see registerSigHandlers).
	buse.Run()
	log.Info().Msgf("Block device buse%d stopped.", config.Cfg.Major)

	buse.RemoveDevice()
	log.Info().Msgf("Block device buse%d removed.", config.Cfg.Major)
}

// Return null device if user wants it, otherwise returns bs3 device, which is
// default.
func getBuseReadWriter(wantNullDevice bool) (buse.BuseReadWriter, error) {
	if wantNullDevice {
		return null.NewNull(), nil
	}

	bs3, err := bs3.NewWithDefaults()

	return bs3, err
}

// Register handler for graceful stop when SIGINT or SIGTERM came in.
func registerSigHandlers(buse buse.Buse) {
	stopChan := make(chan os.Signal, 1)
	signal.Notify(stopChan, os.Interrupt)
	signal.Notify(stopChan, syscall.SIGTERM)
	go func() {
		<-stopChan
		log.Info().Msg("Stopping bs3 device.")
		buse.StopDevice()
	}()
}

// loggerSetup configures the global zerolog logger: optional pretty console
// output and the global log level.
func loggerSetup(pretty bool, level int) {
	if pretty {
		log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr})
	}

	zerolog.SetGlobalLevel(zerolog.Level(level))
}

// Enables remote profiling support. Useful for performance debugging.
func runProfiler(port int) {
	go func() {
		log.Info().Err(http.ListenAndServe(fmt.Sprintf("localhost:%d", port), nil)).Send()
	}()
}
--------------------------------------------------------------------------------