├── Makefile
├── README.md
├── bs3.png
├── config.toml
├── contrib
└── systemd
│ ├── bs3-gc.service
│ ├── bs3-gc.timer
│ └── bs3.service
├── go.mod
├── go.sum
├── internal
├── bs3
│ ├── bs3.go
│ ├── doc.go
│ ├── gc.go
│ ├── key
│ │ └── key.go
│ ├── mapproxy
│ │ ├── mapproxy.go
│ │ └── sectormap
│ │ │ └── sectormap.go
│ └── objproxy
│ │ ├── objproxy.go
│ │ └── s3
│ │ └── s3.go
├── config
│ └── config.go
└── null
│ └── null.go
└── main.go
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: install fmt tidy clean
2 |
3 | SOURCES := $(shell find . -name "*.go")
4 | SYSTEMD_UNITS := $(wildcard contrib/systemd/*)
5 | SYSTEMD_PATH := /etc/systemd/system
6 | SYSTEMD_CONTRIB_PATH := contrib/systemd
7 |
8 | bs3: $(SOURCES)
9 | go build
10 |
11 | install: bs3 $(SYSTEMD_UNITS)
12 | install -D bs3 /usr/local/bin/bs3
13 | install -D -m 600 config.toml /etc/bs3/config.toml
14 | install -D -m 644 $(SYSTEMD_CONTRIB_PATH)/bs3.service $(SYSTEMD_PATH)/bs3.service
15 | install -D -m 644 $(SYSTEMD_CONTRIB_PATH)/bs3-gc.service $(SYSTEMD_PATH)/bs3-gc.service
16 | install -D -m 644 $(SYSTEMD_CONTRIB_PATH)/bs3-gc.timer $(SYSTEMD_PATH)/bs3-gc.timer
17 |
18 | fmt:
19 | go fmt ./...
20 |
21 | tidy:
22 | go mod tidy
23 |
24 | clean:
25 | rm -f bs3
26 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # BS3: Block Device in S3
2 |
3 | ## Write Performance Comparison
4 |
5 |
6 |
7 | ## Requirements
8 |
9 | * BUSE
10 | * GNU Make
11 | * Go 1.16 or newer
12 |
13 | ## Installation
14 |
15 | ```
16 | make
17 | sudo make install
18 | ```
19 |
20 | ## Usage
21 |
22 | ```
23 | # Edit /etc/bs3/config.toml first
24 |
25 | systemctl start bs3
26 | systemctl status bs3
27 | systemctl stop bs3
28 | ```
29 |
--------------------------------------------------------------------------------
/bs3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/asch/bs3/f1a0c34ff3511eadbdb57c9e7c94d743d24366e7/bs3.png
--------------------------------------------------------------------------------
/config.toml:
--------------------------------------------------------------------------------
1 | # Specify the major of the corresponding buse device you want to configure and
2 | # connect to. E.g. 0 if you want to work with /dev/buse0.
3 | major = 0
4 |
5 | # Number of user-space daemon threads which is also a maximal number of queues
6 | # storage stack uses. This is limited to the number of CPUs. I.e. minimal value
7 | # is 1 and maximal is number of CPUs. Optimally it should be set to the number
8 | # of CPUs. 0 means optimal value.
9 | threads = 0
10 |
11 | # Size of the created block device in GB.
12 | size = 8 #GB
13 |
14 | # Block size of created device. 512 or 4096. It is forbidden to change
15 | # block_size on an existing block device. In B.
16 | block_size = 4096 #B
17 |
18 | # Minimal IO size in Bytes. Useful for userspace raid configuration working
19 | # with chunks larger than block size. E.g. mdraid uses 512KB chunks size.
20 | io_min = 0
21 |
22 | # Optimal IO size in Bytes. Useful for userspace raid configuration working
23 | # with chunks larger than block size. E.g. mdraid uses 512KB chunks size.
24 | io_opt = 0
25 |
26 | # Whether IOs should be scheduled by linux kernel stack.
27 | scheduler = false
28 |
29 | # IO queue depth for created block device.
30 | queue_depth = 256
31 |
32 | # Use null backend, i.e. just immediately acknowledge reads and writes and drop
33 | # them. Useful for testing raw BUSE performance. Otherwise useless because all
34 | # data are lost.
35 | null = false
36 |
37 | # Enable web-based go pprof profiler for performance profiling.
38 | profiler = false
39 |
40 | # Profiler port.
41 | profiler_port = 6060
42 |
43 | # Configuration related to AWS S3
44 | [s3]
45 | # AWS Access Key
46 | access_key = "Server-Access-Key"
47 |
48 | # AWS Secret Key
49 | secret_key = "Server-Secret-Key"
50 |
51 | # Bucket where to store objects.
52 | bucket = "bs3"
53 |
54 | # scheme://host:port of the S3 backend. AWS S3 endpoint is used when empty string.
55 | remote = ""
56 |
57 | # Region to use.
58 | region = "us-east-1"
59 |
60 | # Max number of threads to spawn for uploads and downloads.
61 | uploaders = 384
62 | downloaders = 384
63 |
64 | # Configuration specific to write path.
65 | [write]
66 | # Semantics of the flush request. True means durable device, i.e. flush request
67 | # gets acknowledge when data are persisted on the backend. False means
68 | # eventually durable, i.e. flush request just a barrier.
69 | durable = false
70 |
71 | # Size of the shared memory between kernel and user space for data being
72 | # written. The size is per one thread. In MB.
73 | shared_buffer_size = 32 #MB
74 |
75 | # Size of the chunk. Chunk is the smallest piece which can be sent to the user
76 | # space and where all writes are stored. In MB.
77 | chunk_size = 4 #MB
78 |
79 | # The whole address space is divided into collision domains. Every collision
80 | # domain has its own counter for writes' sequential numbers. This is useful
81 | # when we don't want to have one shared counter for writes. Instead we split it
82 | # into parts and save the cache coherency protocol traffic. In MB.
83 | collision_chunk_size = 1 #MB
84 |
85 | # Configuration specific to read path.
86 | [read]
87 |
88 | # Size of the shared memory between kernel and user space for data being read.
89 | # The size is per one thread. In MB.
90 | shared_buffer_size = 32 #MB
91 |
92 | # Garbage Collection related configuration
93 | [gc]
94 | # Step when scanning the extent map. In blocks.
95 | step = 1024
96 |
97 | # Threshold for live data in the object. Objects under this threshold are
98 | # garbage collected by the "threshold GC" which is triggered by SIGUSR1. This
99 | # type of GC is heavy on resources and should be scheduled by the timer for
100 | # off-peak times.
101 | live_data = 0.3
102 |
103 | # Timeout to wait before any of requests from GC thread will be served by the
104 | # extent map and object manager. In ms.
105 | idle_timeout = 200
106 |
107 | # How many seconds to wait before the next periodic GC round. This is related
108 | # to the "dead GC" cleaning just dead objects. It is very light on resources
109 | # and does not contend for the extent map like the "threshold GC".
110 | wait = 600
111 |
112 | # Configuration specific to the logger.
113 | [log]
114 | # Minimal level of logged messages. Following levels are provided:
115 | # panic 5, fatal 4, error 3, warn 2, info 1, debug 0, trace -1
116 | level = -1
117 |
118 | # Pretty print means nicer log output for human but much slower than non-pretty
119 | # json output.
120 | pretty = true
121 |
--------------------------------------------------------------------------------
/contrib/systemd/bs3-gc.service:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2021 Vojtech Aschenbrenner
2 |
3 | [Unit]
4 | Description=Run threshold GC for block device in s3
5 |
6 | [Service]
7 | Type=simple
8 | ExecStart=pkill -USR1 -f 'bs3 -c /etc/bs3/config.toml'
9 |
--------------------------------------------------------------------------------
/contrib/systemd/bs3-gc.timer:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2021 Vojtech Aschenbrenner
2 |
3 | [Unit]
4 | Description=Run threshold GC for block device in s3 every week
5 |
6 | [Timer]
7 | OnCalendar=weekly
8 |
9 | [Install]
10 | WantedBy=timers.target
11 |
--------------------------------------------------------------------------------
/contrib/systemd/bs3.service:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2021 Vojtech Aschenbrenner
2 |
3 | [Unit]
4 | Description=Block device in s3
5 | After=local-fs.target network-online.target
6 |
7 | [Service]
8 | Type=simple
9 | ExecStart=bs3 -c /etc/bs3/config.toml
10 | KillMode=mixed
11 |
12 | [Install]
13 | WantedBy=multi-user.target
14 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/asch/bs3
2 |
3 | go 1.16
4 |
5 | require (
6 | github.com/asch/buse/lib/go/buse v0.0.0-20220419090641-f12ccb1d15a9
7 | github.com/aws/aws-sdk-go v1.38.60
8 | github.com/ilyakaznacheev/cleanenv v1.2.5
9 | github.com/rs/zerolog v1.22.0
10 | golang.org/x/net v0.0.0-20210610132358-84b48f89b13b
11 | golang.org/x/sys v0.0.0-20220330033206-e17cdc41300f // indirect
12 | )
13 |
--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ=
2 | github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
3 | github.com/asch/buse/lib/go/buse v0.0.0-20220419090641-f12ccb1d15a9 h1:suBdWCu2BxNxC6YETUuFnYanAnffGtBBQ060v3rO4/A=
4 | github.com/asch/buse/lib/go/buse v0.0.0-20220419090641-f12ccb1d15a9/go.mod h1:dxWl+7wjthiJ2JB8vNGTMy1FW7W8o2khYiGnYitxgms=
5 | github.com/aws/aws-sdk-go v1.38.60 h1:MgyEsX0IMwivwth1VwEnesBpH0vxbjp5a0w1lurMOXY=
6 | github.com/aws/aws-sdk-go v1.38.60/go.mod h1:hcU610XS61/+aQV88ixoOzUoG7v3b31pl2zKMmprdro=
7 | github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4=
8 | github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
9 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
10 | github.com/ilyakaznacheev/cleanenv v1.2.5 h1:/SlcF9GaIvefWqFJzsccGG/NJdoaAwb7Mm7ImzhO3DM=
11 | github.com/ilyakaznacheev/cleanenv v1.2.5/go.mod h1:/i3yhzwZ3s7hacNERGFwvlhwXMDcaqwIzmayEhbRplk=
12 | github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg=
13 | github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo=
14 | github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8=
15 | github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U=
16 | github.com/joho/godotenv v1.3.0 h1:Zjp+RcGpHhGlrMbJzXTrZZPrWj+1vfm90La1wgB6Bhc=
17 | github.com/joho/godotenv v1.3.0/go.mod h1:7hK45KPybAkOC6peb+G5yklZfMxEjkZhHbwpqxOKXbg=
18 | github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
19 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
20 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
21 | github.com/rs/xid v1.2.1/go.mod h1:+uKXf+4Djp6Md1KODXJxgGQPKngRmWyn10oCKFzNHOQ=
22 | github.com/rs/zerolog v1.22.0 h1:XrVUjV4K+izZpKXZHlPrYQiDtmdGiCylnT4i43AAWxg=
23 | github.com/rs/zerolog v1.22.0/go.mod h1:ZPhntP/xmq1nnND05hhpAh2QMhSsA4UN3MGZ6O2J3hM=
24 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
25 | github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
26 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
27 | golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
28 | golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
29 | golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
30 | golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
31 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
32 | golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
33 | golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
34 | golang.org/x/net v0.0.0-20210610132358-84b48f89b13b h1:k+E048sYJHyVnsr1GDrRZWQ32D2C7lWs9JRc0bel53A=
35 | golang.org/x/net v0.0.0-20210610132358-84b48f89b13b/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
36 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
37 | golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
38 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
39 | golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
40 | golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
41 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
42 | golang.org/x/sys v0.0.0-20210119212857-b64e53b001e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
43 | golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
44 | golang.org/x/sys v0.0.0-20220330033206-e17cdc41300f h1:rlezHXNlxYWvBCzNses9Dlc7nGFaNMJeqLolcmQSSZY=
45 | golang.org/x/sys v0.0.0-20220330033206-e17cdc41300f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
46 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
47 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
48 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
49 | golang.org/x/text v0.3.6 h1:aRYxNxv6iGQlyVaZmk6ZgYEDa+Jg18DxebPSrd6bg1M=
50 | golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
51 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
52 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
53 | golang.org/x/tools v0.1.0/go.mod h1:xkSsbof2nBLbhDlRMhhhyNLN/zl3eTqcnHD5viDpcZ0=
54 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
55 | golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
56 | golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
57 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
58 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
59 | gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
60 | gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10=
61 | gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
62 | olympos.io/encoding/edn v0.0.0-20200308123125-93e3b8dd0e24 h1:sreVOrDp0/ezb0CHKVek/l7YwpxPJqv+jT3izfSphA4=
63 | olympos.io/encoding/edn v0.0.0-20200308123125-93e3b8dd0e24/go.mod h1:oVgVk4OWVDi43qWBEyGhXgYxt7+ED4iYNpTngSLX2Iw=
64 |
--------------------------------------------------------------------------------
/internal/bs3/bs3.go:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2021 Vojtech Aschenbrenner
2 |
3 | package bs3
4 |
5 | import (
6 | "encoding/binary"
7 | "sync"
8 | "time"
9 |
10 | "github.com/rs/zerolog/log"
11 |
12 | "github.com/asch/bs3/internal/bs3/key"
13 | "github.com/asch/bs3/internal/bs3/mapproxy"
14 | "github.com/asch/bs3/internal/bs3/mapproxy/sectormap"
15 | "github.com/asch/bs3/internal/bs3/objproxy"
16 | "github.com/asch/bs3/internal/bs3/objproxy/s3"
17 | "github.com/asch/bs3/internal/config"
18 | )
19 |
20 | const (
21 | // Size of the metadata for one write in the write chunk read from the
22 | // kernel.
23 | WRITE_ITEM_SIZE = 32
24 |
25 | // Key representing the object where serialized version of map is
26 | // stored.
27 | checkpointKey = -1
28 |
29 | // Typical number of extents per object for precise memory allocation
30 | // for return values. In the worst case reallocation happens.
31 | typicalExtentsPerObject = 128
32 |
33 | // Sector is a linux constant, which is always 512, no matter how big your sectors or blocks
34 | // are. Please be careful since the terminology is ambiguous.
35 | sectorUnit = 512
36 | )
37 |
38 | // bs3 implements BuseReadWriter interface which can be passed to the buse
39 | // package. Buse package wraps the communication with the BUSE kernel module
40 | // and does all the necessary configuration and low level operations.
41 | //
42 | // bs3 uses s3 protocol to communicate with the storage backend (most probably
43 | // aws s3) but it can be anything else. It manages the mapping between local
44 | // device and remote backend and performs all the operations for correct
45 | // functionality. The default structure is sectormap but it can be changed
46 | // trivially.
47 | type bs3 struct {
48 | // Proxy struct for the operations on objects like uploads, downloads
49 | // etc. Proxy structs are used for serialization and prioritization of
50 | // requests.
51 | objectStoreProxy objproxy.ObjectProxy
52 |
53 | // Proxy struct for the operations on extent map like updates, lookups
54 | // etc. Proxy structs are used for serialization and prioritization of
55 | // requests.
56 | extentMapProxy mapproxy.ExtentMapProxy
57 |
58 | // Data private to the garbage collection process.
59 | gcData struct {
60 | // Reference counter of objects which are actually downloaded
61 | // and hence cannot be deleted from the storage backend.
62 | refcounter map[int64]int64
63 |
64 | // Lock guarding the refcounter.
65 | reflock sync.Mutex
66 | }
67 |
68 | // Size of the metadata for one write in the write chunk read from the
69 | // kernel.
70 | write_item_size int
71 |
72 | // Size of the object portion which contains all writes metadata in the
73 | // chunk from the kernel. After this metadata_size offset real data are
74 | // stored.
75 | metadata_size int
76 | }
77 |
78 | // Returns bs3 with default configuration, i.e. with s3 as a communication
79 | // protocol and sectormap as an extent map.
80 | func NewWithDefaults() (*bs3, error) {
81 | s3Handler, err := s3.New(s3.Options{
82 | Remote: config.Cfg.S3.Remote,
83 | Region: config.Cfg.S3.Region,
84 | AccessKey: config.Cfg.S3.AccessKey,
85 | SecretKey: config.Cfg.S3.SecretKey,
86 | Bucket: config.Cfg.S3.Bucket,
87 | })
88 |
89 | if err != nil {
90 | return nil, err
91 | }
92 |
93 | mapSize := config.Cfg.Size / int64(config.Cfg.BlockSize)
94 | bs3 := New(s3Handler, sectormap.New(mapSize))
95 |
96 | return bs3, nil
97 | }
98 |
99 | // Returns bs3 with provided protocol for communication with backend storage
100 | // and extentMap for keeping the mapping between local device and remote
101 | // backend.
102 | func New(objectStore objproxy.ObjectUploadDownloaderAt, extentMap mapproxy.ExtentMapper) *bs3 {
103 | bs3 := bs3{
104 | objectStoreProxy: objproxy.New(
105 | objectStore, config.Cfg.S3.Uploaders, config.Cfg.S3.Downloaders,
106 | time.Duration(config.Cfg.GC.IdleTimeoutMs)*time.Millisecond),
107 |
108 | extentMapProxy: mapproxy.New(
109 | extentMap, time.Duration(config.Cfg.GC.IdleTimeoutMs)*time.Millisecond),
110 |
111 | metadata_size: config.Cfg.Write.ChunkSize / config.Cfg.BlockSize * WRITE_ITEM_SIZE,
112 |
113 | write_item_size: WRITE_ITEM_SIZE,
114 | }
115 |
116 | bs3.gcData.refcounter = make(map[int64]int64)
117 |
118 | return &bs3
119 | }
120 |
121 | // Handle writes coming from the buse library. writes contains the number of
122 | // write commands in this call; chunk contains the memory where these commands
123 | // are stored together with their data. The first part of the chunk, up to
124 | // metadata_size, is metadata; the rest is the data of all writes in order.
125 | //
126 | // We read all the writes' metadata, create a list and pass it to the extent
127 | // map to update the mapping. Before we actually do that, we wait until the
128 | // whole chunk is uploaded under the generated key, which is just one more
129 | // than the previous one.
130 | func (b *bs3) BuseWrite(writes int64, chunk []byte) error {
131 | key := key.Next()
132 |
133 | metadata := chunk[:b.metadata_size]
134 | extents := make([]mapproxy.Extent, writes)
135 |
136 | var writtenTotalBlocks uint64
137 | for i := int64(0); i < writes; i++ {
138 | e := parseExtent(metadata[:b.write_item_size])
139 | extents[i] = e
140 | metadata = metadata[b.write_item_size:]
141 | writtenTotalBlocks += uint64(e.Length)
142 | }
143 |
144 | // Zero out the rest of the space reserved for write metadata. This is
145 | // needed by the recovery process, where we lose information about the
146 | // size of the metadata.
147 | for i := 0; i < len(metadata); i++ {
148 | metadata[i] = 0
149 | }
150 |
151 | dataSize := writtenTotalBlocks * uint64(config.Cfg.BlockSize)
152 | object := chunk[:uint64(b.metadata_size)+dataSize]
153 |
154 | // Some s3 backends, like minio, just drop the connection when they are
155 | // under load. Hence the loop with exponential backoff till the
156 | // operation succeeds. There is no point in returning an error, since
157 | // the best thing we can do is to retry indefinitely and print a
158 | // message to the log.
159 | for i := 1; ; i *= 2 {
160 | err := b.objectStoreProxy.Upload(key, object, true)
161 | if err == nil {
162 | break
163 | }
164 | log.Info().Err(err).Send()
165 | time.Sleep(time.Duration(i) * time.Second)
166 | }
167 |
168 | b.extentMapProxy.Update(extents, int64(b.metadata_size/config.Cfg.BlockSize), key)
169 |
170 | return nil
171 | }
172 |
173 | // Download part of the object to the memory buffer chunk. The part is
174 | // specified by part and wg.Done() is called when the download is
175 | // finished.
176 | func (b *bs3) downloadObjectPart(part mapproxy.ObjectPart, chunk []byte, wg *sync.WaitGroup) {
177 | defer wg.Done()
178 |
179 | // Some s3 backends, like minio, just drop the connection when they are
180 | // under load. Hence the loop with exponential backoff till the
181 | // operation succeeds. There is no point in returning an error, since
182 | // the best thing we can do is to retry indefinitely and print a
183 | // message to the log.
184 | for i := 1; ; i *= 2 {
185 | err := b.objectStoreProxy.Download(part.Key, chunk, part.Sector*int64(config.Cfg.BlockSize), true)
186 | if err == nil {
187 | break
188 | }
189 | log.Info().Err(err).Send()
190 | time.Sleep(time.Duration(i) * time.Second)
191 | }
192 | }
193 |
194 | // Read extent starting at sector with length length to the buffer chunk.
195 | // Length of the chunk is the same as length variable. This function consults
196 | // the extent map and asynchronously downloads all needed pieces to reconstruct
197 | // the logical extent.
198 | func (b *bs3) BuseRead(sector, length int64, chunk []byte) error {
199 | objectPieces := b.getObjectPiecesRefCounterInc(sector, length)
200 |
201 | var wg sync.WaitGroup
202 | for _, op := range objectPieces {
203 | size := op.Length * int64(config.Cfg.BlockSize)
204 | if op.Key != mapproxy.NotMappedKey {
205 | wg.Add(1)
206 | go b.downloadObjectPart(op, chunk[:size], &wg)
207 | }
208 | chunk = chunk[size:]
209 | }
210 |
211 | wg.Wait()
212 |
213 | b.objectPiecesRefCounterDec(objectPieces)
214 |
215 | return nil
216 | }
217 |
218 | // Before the buse library starts communicating with the kernel, we restore
219 | // the map stored on the backend and register a SIGUSR1 signal handler which
220 | // serves for threshold garbage collection. Then we run an infinite loop with
221 | // garbage collection deleting just completely dead objects without any data.
222 | // It is very fast and efficient and has a huge impact on the backend space
223 | // utilization. Hence we run it continuously.
224 | func (b *bs3) BusePreRun() {
225 | if !config.Cfg.SkipCheckpoint {
226 | b.restore()
227 | }
228 |
229 | b.registerSigUSR1Handler()
230 |
231 | go b.gcDead()
232 | }
233 |
234 | // After disconnecting from the kernel module and just before shutting the
235 | // daemon down we save the map to the backend so it can be restored during the
236 | // next start and the mapping is not lost.
237 | func (b *bs3) BusePostRemove() {
238 | if !config.Cfg.SkipCheckpoint {
239 | b.checkpoint()
240 | }
241 | }
242 |
243 | // Returns object pieces for reconstructing logical extent but before that
244 | // safely increments the refcounter for the objects. Objects in refcounter are
245 | // excluded from garbage collection.
246 | func (b *bs3) getObjectPiecesRefCounterInc(sector, length int64) []mapproxy.ObjectPart {
247 | b.gcData.reflock.Lock()
248 | defer b.gcData.reflock.Unlock()
249 |
250 | objectPieces := b.extentMapProxy.Lookup(int64(sector), int64(length))
251 |
252 | for _, op := range objectPieces {
253 | b.gcData.refcounter[op.Key]++
254 | }
255 |
256 | return objectPieces
257 | }
258 |
259 | // Decrements the refcounter for the object pieces. Objects in refcounter are
260 | // excluded from garbage collection.
261 | func (b *bs3) objectPiecesRefCounterDec(objectPieces []mapproxy.ObjectPart) {
262 | b.gcData.reflock.Lock()
263 |
264 | for _, op := range objectPieces {
265 | b.gcData.refcounter[op.Key]--
266 | }
267 |
268 | b.gcData.reflock.Unlock()
269 | }
270 |
271 | // Restores the map from the checkpoint saved on the backend, if it exists,
272 | // and updates the current object key accordingly.
273 | func (b *bs3) restoreFromCheckpoint() {
274 | mapSize, err := b.objectStoreProxy.Instance.GetObjectSize(checkpointKey)
275 | if err == nil {
276 | log.Info().Msg("->Checkpoint found. Checkpoint recovery started.")
277 |
278 | compressedMap := make([]byte, mapSize)
279 | b.objectStoreProxy.Download(checkpointKey, compressedMap, 0, false)
280 | newKey := b.extentMapProxy.Instance.DeserializeAndReturnNextKey(compressedMap)
281 | key.Replace(newKey)
282 |
283 | log.Info().Msgf("->Checkpoint recovery process finished. Last object from checkpoint is %d.", newKey)
284 | }
285 | }
286 |
287 | // Restores the map from individual objects. It reconstructs the map replaying
288 | // all the writes from metadata part of continuous sequence of objects until a
289 | // missing object is found. This is the point where prefix consistency is
290 | // corrupted and we cannot recover more. Any successive objects are deleted.
291 | func (b *bs3) restoreFromObjects() {
292 | log.Info().Msg("->Looking for objects to do roll forward recovery.")
293 |
294 | keyBefore := key.Current()
295 | for ; ; key.Next() {
296 | header := make([]byte, b.metadata_size)
297 | size, err := b.objectStoreProxy.Instance.GetObjectSize(key.Current())
298 | if err != nil {
299 | // Prefix consistency broken.
300 | break
301 | }
302 | if size == 0 {
303 | // Garbage collected object, that is OK, prefix
304 | // consistency kept.
305 | continue
306 | }
307 |
308 | // Get writes metadata for object.
309 | err = b.objectStoreProxy.Instance.DownloadAt(key.Current(), header, 0)
310 | if err != nil {
311 | break
312 | }
313 |
314 | // Replay all writes from metadata part until extent with
315 | // length 0 is found. It is invalid value and it means that the
316 | // memory is zeroed, which means end of the metadata section of
317 | // the object. The memory is zeroed out in BuseWrite function
318 | // where the object is uploaded.
319 | extents := make([]mapproxy.Extent, 0, typicalExtentsPerObject)
320 | for {
321 | e := parseExtent(header[:b.write_item_size])
322 | if e.Length == 0 {
323 | break
324 | }
325 | extents = append(extents, e)
326 | header = header[b.write_item_size:]
327 | }
328 |
329 | dataBegin := int64(b.metadata_size / config.Cfg.BlockSize)
330 | b.extentMapProxy.Update(extents, dataBegin, key.Current())
331 | }
332 |
333 | if keyBefore == key.Current() {
334 | log.Info().Msg("->No extra objects found for roll forward recovery.")
335 | } else {
336 | log.Info().Msgf("->Extra %d objects for roll forward recovery found.", key.Current()-keyBefore)
337 | }
338 | }
339 |
340 | // Restores the map from the saved checkpoint and then continues restoring from
341 | // individual objects. E.g. when a crash happens, the checkpoint is not uploaded
342 | // hence the old checkpoint is read. However a new set of objects fulfilling
343 | // prefix consistency can already have been uploaded.
344 | func (b *bs3) restore() {
345 | log.Info().Msgf("Checking for old volume in bucket %s.", config.Cfg.S3.Bucket)
346 |
347 | b.restoreFromCheckpoint()
348 | b.restoreFromObjects()
349 | b.objectStoreProxy.Instance.DeleteKeyAndSuccessors(key.Current())
350 |
351 | if key.Current() == 0 {
352 | log.Info().Msgf("No volume found. Bucket %s is used for new volume.", config.Cfg.S3.Bucket)
353 | } else {
354 | log.Info().Msgf("Volume found in bucket %s. The last object is %d.", config.Cfg.S3.Bucket, key.Current())
355 | }
356 | }
357 |
358 | // Serializes the extent map and uploads it to the backend.
359 | func (b *bs3) checkpoint() {
360 | log.Info().Msg("Checkpointing started.")
361 |
362 | log.Info().Msg("->Serialization of extent map started.")
363 | dump := b.extentMapProxy.Instance.Serialize()
364 | log.Info().Msg("->Serialization of extent map finished.")
365 |
366 | log.Info().Msg("->Upload of extent map started.")
367 | b.objectStoreProxy.Upload(checkpointKey, dump, false)
368 | log.Info().Msg("->Upload of extent map finished.")
369 |
370 | log.Info().Msgf("Checkpointing finished. Last checkpointed object is %d.", key.Current())
371 | }
372 |
373 | // Parses write extent information from 32 bytes of raw memory. The memory is
374 | // one write in metadata section of the object.
375 | func parseExtent(b []byte) mapproxy.Extent {
376 | return mapproxy.Extent{
377 | Sector: int64(binary.LittleEndian.Uint64(b[:8]) * sectorUnit / uint64(config.Cfg.BlockSize)),
378 | Length: int64(binary.LittleEndian.Uint64(b[8:16]) * sectorUnit / uint64(config.Cfg.BlockSize)),
379 | SeqNo: int64(binary.LittleEndian.Uint64(b[16:24])),
380 | Flag: int64(binary.LittleEndian.Uint64(b[24:32])),
381 | }
382 | }
383 |
--------------------------------------------------------------------------------
/internal/bs3/doc.go:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2021 Vojtech Aschenbrenner
2 |
3 | // bs3 is a userspace daemon using golang buse library to create a block
4 | // device. All operations on the block device are handled by the daemon. bs3
5 | // stores data in object storage via s3 protocol and maintains the mapping
6 | // between logical block device space and the backend.
7 | //
8 | // bs3 defines two interfaces. One for the extent map and one for the storage
9 | // backend operations. These two parts can be trivially changed just by
10 | // implementing corresponding interface.
11 | package bs3
12 |
--------------------------------------------------------------------------------
/internal/bs3/gc.go:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2021 Vojtech Aschenbrenner
2 |
3 | package bs3
4 |
5 | import (
6 | "encoding/binary"
7 | "os"
8 | "os/signal"
9 | "sync"
10 | "syscall"
11 | "time"
12 |
13 | "github.com/asch/bs3/internal/bs3/key"
14 | "github.com/asch/bs3/internal/bs3/mapproxy"
15 | "github.com/asch/bs3/internal/config"
16 |
17 | "github.com/rs/zerolog/log"
18 | )
19 |
20 | const (
21 | // Typical number of newly created objects during one threshold GC run.
22 | // Just an optimization of memory allocation, in the worst case
23 | // reallocation occurs.
24 | typicalNewObjectsPerGC = 64
25 |
26 | // Typical number of extents per one garbage collected object. Just an
27 | // optimization of memory allocation, in the worst case reallocation
28 | // occurs.
29 | typicalExtentsPerGCObject = 64
30 | )
31 |
32 | // Select objects viable for threshold GC. When an object utilization is under
33 | // the threshold it is selected for GC. The object with the highest key is
34 | // never collected because of oscilation.
35 | func (b *bs3) filterKeysToCollect(utilization map[int64]int64, ratio float64) map[int64]struct{} {
36 | var maxKey int64
37 | collect := make(map[int64]struct{})
38 |
39 | for k, v := range utilization {
40 | used := v * int64(config.Cfg.BlockSize)
41 | r := float64(used) / float64(config.Cfg.Write.ChunkSize)
42 | if r < ratio {
43 | collect[k] = struct{}{}
44 | }
45 |
46 | if k > maxKey {
47 | maxKey = k
48 | }
49 | }
50 |
51 | if _, ok := collect[maxKey]; ok {
52 | delete(collect, maxKey)
53 | }
54 |
55 | return collect
56 | }
57 |
58 | // Constructs the list of life extents to be saved from objects subjected to the GC.
59 | func (b *bs3) getCompleteWriteList(keys map[int64]struct{}, stepSize int64) []mapproxy.ExtentWithObjectPart {
60 | completeWriteList := make([]mapproxy.ExtentWithObjectPart, 0, 128)
61 |
62 | sectors := config.Cfg.Size / int64(config.Cfg.BlockSize)
63 |
64 | for i := int64(0); i < sectors; i += stepSize {
65 | ci := b.extentMapProxy.ExtentsInObjects(int64(i), stepSize, keys)
66 |
67 | if len(ci) == 0 {
68 | continue
69 | }
70 |
71 | completeWriteList = append(completeWriteList, ci...)
72 |
73 | }
74 |
75 | return completeWriteList
76 | }
77 |
78 | // Removes currently downloaded objects from the list of dead objects.
79 | func (b *bs3) filterDownloadingObjects(deadObjects map[int64]struct{}) {
80 | b.gcData.reflock.Lock()
81 | defer b.gcData.reflock.Unlock()
82 |
83 | for k, v := range b.gcData.refcounter {
84 | if v == 0 {
85 | delete(b.gcData.refcounter, k)
86 | } else {
87 | _, ok := deadObjects[k]
88 | if ok {
89 | delete(deadObjects, k)
90 | }
91 | }
92 | }
93 | }
94 |
95 | // Runs threshold GC. It makes all objects with live data ratio under the
96 | // threshold dead by copying their live data into new object. These objects are
97 | // deleted during the regular dead GC run.
98 | func (b *bs3) gcThreshold(stepSize int64, threshHold float64) {
99 | liveObjects := b.extentMapProxy.ObjectsUtilization()
100 | keysToCollect := b.filterKeysToCollect(liveObjects, threshHold)
101 | completeWritelist := b.getCompleteWriteList(keysToCollect, stepSize)
102 | objects, extents := b.composeObjects(completeWritelist)
103 |
104 | for i := range objects {
105 | key := key.Next()
106 |
107 | err := b.objectStoreProxy.Upload(key, objects[i], false)
108 | if err != nil {
109 | log.Info().Err(err).Send()
110 | }
111 |
112 | b.extentMapProxy.Update(extents[i], int64(b.metadata_size/config.Cfg.BlockSize), key)
113 | }
114 | }
115 |
116 | // Removes unneeded dead objects from the map and upload empty object instead.
117 | // The object cannot be deleted on the backend, because the sequence number
118 | // would be missing in the recovery process where we need continuous range of
119 | // keys.
120 | func (b *bs3) removeNonReferencedDeadObjects() {
121 | deadObjects := b.extentMapProxy.DeadObjects()
122 | b.filterDownloadingObjects(deadObjects)
123 | for k := range deadObjects {
124 | err := b.objectStoreProxy.Upload(k, []byte{}, false)
125 | if err != nil {
126 | log.Info().Err(err).Send()
127 | }
128 | }
129 | b.extentMapProxy.DeleteDeadObjects(deadObjects)
130 | }
131 |
132 | // Register SIGUSR1 as a trigger for threshold GC.
133 | func (b *bs3) registerSigUSR1Handler() {
134 | gcChan := make(chan os.Signal, 1)
135 | signal.Notify(gcChan, syscall.SIGUSR1)
136 |
137 | go func() {
138 | for range gcChan {
139 | log.Info().Msgf("Threshold GC started with threshold %1.2f.", config.Cfg.GC.LiveData)
140 | b.gcThreshold(config.Cfg.GC.Step, config.Cfg.GC.LiveData)
141 | log.Info().Msg("Threshold GC finished.")
142 | }
143 | }()
144 | }
145 |
146 | // Dead GC infinite loop. Highly efficient hence running regularly.
147 | func (b *bs3) gcDead() {
148 | for {
149 | time.Sleep(time.Duration(config.Cfg.GC.Wait) * time.Second)
150 |
151 | log.Trace().Msg("Dead GC started.")
152 | b.removeNonReferencedDeadObjects()
153 | log.Trace().Msg("Dead GC finished.")
154 | }
155 | }
156 |
157 | // Stores raw values of individual write into metadata part of the object.
158 | func writeHeader(metadataFrontier int, g mapproxy.ExtentWithObjectPart, object []byte) {
159 | binary.LittleEndian.PutUint64(object[metadataFrontier:], uint64(g.ObjectPart.Sector))
160 | metadataFrontier += 8
161 |
162 | binary.LittleEndian.PutUint64(object[metadataFrontier:], uint64(g.Extent.Length))
163 | metadataFrontier += 8
164 |
165 | binary.LittleEndian.PutUint64(object[metadataFrontier:], uint64(g.Extent.SeqNo))
166 | metadataFrontier += 8
167 |
168 | binary.LittleEndian.PutUint64(object[metadataFrontier:], uint64(g.Extent.Flag))
169 | metadataFrontier += 8
170 | }
171 |
// Traverse the list of all extents which are going to be copied into new fresh
// object(s). It downloads necessary parts and constructs new objects for the
// complete list. All objects are then uploaded and map updated.
//
// Each object buffer has two regions: the metadata area at the front
// (headers written by writeHeader, advancing metadataFrontier) and the data
// area starting at b.metadata_size (advancing dataFrontier). Downloads run
// concurrently, each into its own disjoint slice of the buffer, so no
// synchronization beyond the WaitGroup is needed. Note that wg.Wait() runs
// only once at the end, so buffers already appended to objects may still be
// filled by in-flight downloads until then — callers must not use the
// result before this function returns.
func (b *bs3) composeObjects(writeList []mapproxy.ExtentWithObjectPart) ([][]byte, [][]mapproxy.Extent) {
	var wg sync.WaitGroup

	metadataFrontier := 0
	dataFrontier := b.metadata_size

	objects := make([][]byte, 0, typicalNewObjectsPerGC)
	extents := make([][]mapproxy.Extent, 0, typicalNewObjectsPerGC)

	object := make([]byte, config.Cfg.Write.ChunkSize)
	currentObjectExtents := make([]mapproxy.Extent, 0, typicalExtentsPerGCObject)

	for _, g := range writeList {
		// The next extent does not fit into the data area of the
		// current object: seal it and start a fresh one.
		if uint64(dataFrontier)+uint64(g.Extent.Length)*uint64(config.Cfg.BlockSize) > uint64(config.Cfg.Write.ChunkSize) {
			objects = append(objects, object)
			extents = append(extents, currentObjectExtents)

			object = make([]byte, config.Cfg.Write.ChunkSize)
			currentObjectExtents = make([]mapproxy.Extent, 0, typicalExtentsPerGCObject)

			metadataFrontier = 0
			dataFrontier = b.metadata_size
		}

		writeHeader(metadataFrontier, g, object)
		metadataFrontier += b.write_item_size

		// data is the extent's private window into the object buffer;
		// the goroutine below downloads straight into it. The closure
		// captures data by reference, which is safe because data is
		// re-created every iteration.
		data := object[dataFrontier : int64(dataFrontier)+g.Extent.Length*int64(config.Cfg.BlockSize)]
		wg.Add(1)
		go func(g mapproxy.ExtentWithObjectPart) {
			defer wg.Done()
			err := b.objectStoreProxy.Download(g.ObjectPart.Key, data, g.Extent.Sector*int64(config.Cfg.BlockSize), true)
			if err != nil {
				log.Info().Err(err).Send()
			}
		}(g)

		// Record the logical placement of this extent so the map can
		// be updated after the new object is uploaded.
		extent := mapproxy.Extent{
			Sector: g.ObjectPart.Sector,
			Length: g.Extent.Length,
			SeqNo:  g.Extent.SeqNo,
			Flag:   g.Extent.Flag,
		}

		currentObjectExtents = append(currentObjectExtents, extent)
		dataFrontier += int(g.Extent.Length) * config.Cfg.BlockSize
	}

	// Seal the trailing, partially filled object (if any).
	if len(currentObjectExtents) > 0 {
		objects = append(objects, object)
		extents = append(extents, currentObjectExtents)
	}

	wg.Wait()

	return objects, extents
}
232 |
--------------------------------------------------------------------------------
/internal/bs3/key/key.go:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2021 Vojtech Aschenbrenner
2 |
3 | // Package for synchronized access to the object key counter.
4 | package key
5 |
6 | import (
7 | "sync"
8 | )
9 |
var (
	// key holds the next key that has not been handed out yet.
	key int64
	// mutex guards every access to key.
	mutex sync.Mutex
)

// Current returns the key that will be assigned to the next object without
// consuming it. The returned key must not be used to create an object
// unless Next() is called.
func Current() int64 {
	mutex.Lock()
	defer mutex.Unlock()

	return key
}

// Next returns the next unassigned key and advances the counter, so a
// subsequent call hands out a fresh key again.
func Next() int64 {
	mutex.Lock()
	defer mutex.Unlock()

	next := key
	key++

	return next
}

// Replace overwrites the value of the next unassigned key.
func Replace(newKey int64) {
	mutex.Lock()
	defer mutex.Unlock()

	key = newKey
}
45 |
--------------------------------------------------------------------------------
/internal/bs3/mapproxy/mapproxy.go:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2021 Vojtech Aschenbrenner
2 |
3 | // Mapproxy package is a proxy for structs with ExtentMapper interface. It
4 | // serializes and prioritizes requests coming to the ExtentMapper and also
5 | // improves cache locality since all operations are done by the same go
6 | // routine.
7 | package mapproxy
8 |
9 | import (
10 | "time"
11 | )
12 |
const (
	// NotMappedKey marks a sector that has no backing object assigned.
	NotMappedKey = -1
)

// Provides mapping from logical extents presented in the system to the
// potentially multiple extents in the backend storage. Furthermore it has to
// provide multiple operations related to garbage collection and map
// restoration.
type ExtentMapper interface {
	Update(extents []Extent, startOfDataSectors, key int64)
	Lookup(sector, length int64) []ObjectPart
	FindExtentsWithKeys(sector, length int64, keys map[int64]struct{}) []ExtentWithObjectPart
	DeleteFromDeadObjects(deadObjects map[int64]struct{})
	DeleteFromUtilization(keys map[int64]struct{})
	GetMaxKey() int64
	ObjectsUtilization() map[int64]int64
	DeadObjects() map[int64]struct{}
	DeserializeAndReturnNextKey(buf []byte) int64
	Serialize() []byte
}
33 |
// Proxy to the ExtentMapper. It serializes and prioritizes requests coming to
// the extent map and also improves cache locality since the map is always
// traversed by the same thread.
type ExtentMapProxy struct {
	Instance ExtentMapper

	// Timeout after which a low priority request can be handled.
	// NOTE(review): currently unused — the ticker branch in worker() is
	// commented out.
	idleTimeout time.Duration

	// Channels for internal communication specific to one type of request.
	updateChan chan updateRequest
	lookupChan chan lookupRequest
	keyedExtentsChan chan keyedExtentsRequest

	// General low priority channel used for multiple types of requests.
	lockChan chan lockRequest
}
51 |
// Mapping from the logical extent to the extent in the object.
type ExtentWithObjectPart struct {
	Extent Extent
	ObjectPart ObjectPart
}

// Logical extent representation representing the system view.
type Extent struct {
	// Beginning of the extent.
	Sector int64

	// Length of the extent. Extent is continuous.
	Length int64

	// Sequential number of the write which wrote this extent.
	SeqNo int64

	// Reserved for future usage.
	Flag int64
}

// Object part is an extent in the object.
type ObjectPart struct {
	// First sector of the extent.
	Sector int64

	// Length of the extent. Extent is continuous.
	Length int64

	// Object where the extent is located.
	Key int64
}
84 |
85 | // Returns proxy which can be directly used. It spawns one worker which handles
86 | // all serialized and prioritized requests.
87 | func New(instance ExtentMapper, idleTimeout time.Duration) ExtentMapProxy {
88 | updateChan := make(chan updateRequest)
89 | lookupChan := make(chan lookupRequest)
90 | keyedExtentsChan := make(chan keyedExtentsRequest)
91 | lockChan := make(chan lockRequest)
92 |
93 | m := ExtentMapProxy{
94 | Instance: instance,
95 | idleTimeout: idleTimeout,
96 | updateChan: updateChan,
97 | lookupChan: lookupChan,
98 | keyedExtentsChan: keyedExtentsChan,
99 | lockChan: lockChan,
100 | }
101 |
102 | go m.worker()
103 |
104 | return m
105 | }
106 |
107 | // Updates all extents specified in extents. startOfDataSectors is the first
108 | // sector in the object with real data and key is the key of the object.
109 | func (p *ExtentMapProxy) Update(extents []Extent, startOfDataSectors, key int64) {
110 | done := make(chan struct{})
111 | p.updateChan <- updateRequest{extents, startOfDataSectors, key, done}
112 | <-done
113 | }
114 |
115 | // Finds all pieces from which the logical extent starting from sector with
116 | // length length can be reconstructed.
117 | func (p *ExtentMapProxy) Lookup(sector, length int64) []ObjectPart {
118 | reply := make(chan []ObjectPart)
119 | p.lookupChan <- lookupRequest{sector, length, reply}
120 | return <-reply
121 | }
122 |
123 | // Finds all extents which are stored in any of the objects with keys in keys.
124 | // Sector and length is the range of interest.
125 | func (p *ExtentMapProxy) ExtentsInObjects(sector, length int64, keys map[int64]struct{}) []ExtentWithObjectPart {
126 | reply := make(chan []ExtentWithObjectPart)
127 | p.keyedExtentsChan <- keyedExtentsRequest{sector, length, keys, reply}
128 | return <-reply
129 | }
130 |
// Returns all dead objects. I.e. objects without any live data.
//
// Lock handshake: once the lockRequest is received, the worker blocks
// sending on done (unbuffered channel), so it cannot serve any other map
// request while Instance is accessed directly below. Receiving from done
// releases the worker again.
func (p *ExtentMapProxy) DeadObjects() map[int64]struct{} {
	done := make(chan struct{})
	p.lockChan <- lockRequest{done}
	tmp := p.Instance.DeadObjects()
	<-done

	return tmp
}

// Returns all objects utilization. I.e. number of non-dead sectors in each
// non-dead object. Uses the same lock handshake as DeadObjects.
func (p *ExtentMapProxy) ObjectsUtilization() map[int64]int64 {
	done := make(chan struct{})
	p.lockChan <- lockRequest{done}
	tmp := p.Instance.ObjectsUtilization()
	<-done

	return tmp
}

// Returns highest object key contained in the map. Uses the same lock
// handshake as DeadObjects.
func (p *ExtentMapProxy) GetMaxKey() int64 {
	done := make(chan struct{})
	p.lockChan <- lockRequest{done}
	tmp := p.Instance.GetMaxKey()
	<-done

	return tmp

}
162 |
// Deletes all provided keys from object utilization list.
//
// Lock handshake: the worker blocks sending on done while the deletion
// runs, so no other map request can interleave; the deferred receive from
// done releases the worker after the operation finishes.
func (p *ExtentMapProxy) DeleteFromUtilization(keys map[int64]struct{}) {
	done := make(chan struct{})
	p.lockChan <- lockRequest{done}
	defer func() {
		<-done
	}()

	p.Instance.DeleteFromUtilization(keys)
}

// Deletes all dead objects from dead objects list. Uses the same lock
// handshake as DeleteFromUtilization.
func (p *ExtentMapProxy) DeleteDeadObjects(deadObjects map[int64]struct{}) {
	done := make(chan struct{})
	p.lockChan <- lockRequest{done}
	defer func() {
		<-done
	}()

	p.Instance.DeleteFromDeadObjects(deadObjects)
}
184 |
// updateRequest wraps the arguments of Update; the worker signals
// completion on done.
type updateRequest struct {
	extents []Extent
	startOfDataSectors int64
	key int64
	done chan struct{}
}

// Internal request structures just for wrapping the function calls into the
// channel communication.

// lookupRequest wraps the arguments of Lookup; the result is sent on reply.
type lookupRequest struct {
	sector int64
	length int64
	reply chan []ObjectPart
}

// keyedExtentsRequest wraps the arguments of ExtentsInObjects; the result
// is sent on reply.
type keyedExtentsRequest struct {
	sector int64
	length int64
	keys map[int64]struct{}
	reply chan<- []ExtentWithObjectPart
}

// lockRequest stalls the worker until the requester reads from done,
// granting the requester exclusive access to the map (see DeadObjects).
type lockRequest struct {
	done chan struct{}
}
211 |
// Worker is doing prioritization and serialization of the requests. Updates
// and lookups into the map have highest priority. All other request are low
// priority.
//
// The outer select serves only high priority channels; when none is ready
// it falls through to the inner select which additionally serves the low
// priority channels. A lockRequest is answered by sending on done, which
// blocks this worker until the requester reads from done — that is how
// callers get exclusive access to the map.
// NOTE(review): idleTimeout is currently unused; the ticker branch below
// is commented out.
func (p *ExtentMapProxy) worker() {
	for {
		select {
		case u := <-p.updateChan:
			p.update(u)

		case l := <-p.lookupChan:
			p.lookup(l)

		//case <-time.NewTicker(m.idleTimeout).C:
		default:
			select {
			case u := <-p.updateChan:
				p.update(u)

			case l := <-p.lookupChan:
				p.lookup(l)

			case e := <-p.keyedExtentsChan:
				p.findExtensWithKeys(e)

			case l := <-p.lockChan:
				l.done <- struct{}{}
			}
		}
	}
}
242 |
// update applies one queued update request and signals completion on done.
func (p *ExtentMapProxy) update(r updateRequest) {
	p.Instance.Update(r.extents, r.startOfDataSectors, r.key)
	r.done <- struct{}{}
}

// lookup serves one queued lookup request and sends the result back.
func (p *ExtentMapProxy) lookup(r lookupRequest) {
	r.reply <- p.Instance.Lookup(r.sector, r.length)
}

// findExtensWithKeys serves one queued keyed-extents request and sends the
// result back.
func (p *ExtentMapProxy) findExtensWithKeys(r keyedExtentsRequest) {
	r.reply <- p.Instance.FindExtentsWithKeys(r.sector, r.length, r.keys)
}
255 |
--------------------------------------------------------------------------------
/internal/bs3/mapproxy/sectormap/sectormap.go:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2021 Vojtech Aschenbrenner
2 |
// Package sectormap provides an implementation of the ExtentMapper
// interface. It implements a highly efficient mapping with sector
// granularity. More details are in the SectorMap struct description.
6 | package sectormap
7 |
8 | import (
9 | "bytes"
10 | "encoding/gob"
11 |
12 | "github.com/asch/bs3/internal/bs3/mapproxy"
13 | )
14 |
const (
	// How many object parts is the typical result of one extent lookup.
	// This is just for the initial allocation of the returned array. In
	// the worst case a reallocation happens.
	typicalObjectPartsPerLookup = 64

	// Sentinel key for sectors not mapped to any object.
	notMappedKey = -1
)

// Description of the sector. It provides information about the corresponding
// sector in the object and the object identification.
type SectorMetadata struct {
	// Sector in the object.
	Sector int64

	// Key of the object.
	Key int64

	// Sequential number of the last write to this sector.
	SeqNo int64

	// Reserved for future usage.
	Flag int64
}
39 |
// Implementation of the ExtentMapper interface, hence serving as an extent map. This is a highly
// efficient mapping of individual sectors stored in a continuous array. The obvious advantage is
// speed, where linear scanning of an array is an incredibly fast operation on modern CPUs. The
// second advantage is simplicity. The disadvantage can be that it always consumes the same amount
// of memory, no matter how used the device is. However the worst case memory usage is the best
// possible because we don't store any additional data like in some more complex data structures
// like trees.
//
// Nevertheless if the memory usage is a problem, slightly raising the sector size helps
// tremendously. 4k sectors are the norm today and if we have a 1TB block device the map consumes
// 1TB/4k*32 = 8GB. With 8k sectors it is just 4GB. This can be further reduced by shrinking data
// types in the SectorMetadata structure from int64 which is an overkill for most of them.
//
// This structure is serialized by gobs hence it has to be exported and all its attributes as well.
type SectorMap struct {
	Sectors []SectorMetadata
	ObjUtilizations map[int64]int64
	DeadObjs map[int64]struct{}
}
59 |
60 | // Returns new instance of the sector map. The map should not be used directly because it does not
61 | // support concurrent access.
62 | func New(length int64) *SectorMap {
63 | sectors := make([]SectorMetadata, length)
64 | objectUtilization := make(map[int64]int64)
65 | deadObjects := make(map[int64]struct{})
66 |
67 | for i := range sectors {
68 | sectors[i].Key = notMappedKey
69 | }
70 |
71 | s := SectorMap{
72 | Sectors: sectors,
73 | ObjUtilizations: objectUtilization,
74 | DeadObjs: deadObjects,
75 | }
76 |
77 | return &s
78 | }
79 |
// Updates sectors in the map with new values from extents. startOfDataSectors
// is the first sector with data in the object and key is the key of the
// object.
func (m *SectorMap) Update(extents []mapproxy.Extent, startOfDataSectors, key int64) {
	m.ObjUtilizations[key] = 0

	for _, e := range extents {
		m.updateExtent(e, startOfDataSectors, key)
		startOfDataSectors += e.Length
	}

	// Because of GC we can add an object which never updates the map
	// because all of its write records are outdated. Such an object is
	// dead right away.
	if m.ObjUtilizations[key] == 0 {
		delete(m.ObjUtilizations, key)
		m.DeadObjs[key] = struct{}{}
	}
}
98 |
// Updates the information about object utilization for the given sector:
// the new object's counter is incremented and the previously mapping
// object's counter (if any) is decremented. An object whose counter drops
// to zero becomes dead.
func (m *SectorMap) updateUtilization(key int64, s *SectorMetadata) {
	// Increment cannot be done at once because GC can
	// introduce an object with writes with a lower seqNo
	m.ObjUtilizations[key]++
	if s.Key != notMappedKey {
		m.ObjUtilizations[s.Key]--
		if m.ObjUtilizations[s.Key] == 0 {
			delete(m.ObjUtilizations, s.Key)
			m.DeadObjs[s.Key] = struct{}{}
		}
	}
}
112 |
113 | // Update one sector.
114 | func (m *SectorMap) updateSector(key int64, s *SectorMetadata, targetSector int64, e mapproxy.Extent) {
115 | m.updateUtilization(key, s)
116 |
117 | s.Sector = targetSector
118 | s.Key = key
119 | s.SeqNo = e.SeqNo
120 | s.Flag = e.Flag
121 | }
122 |
123 | // Updates an extent. It checks whether the write is actually newer than write
124 | // already in the map. Like this we always keep the map consistent.
125 | func (m *SectorMap) updateExtent(e mapproxy.Extent, startOfDataSectors, key int64) {
126 | targetSector := startOfDataSectors
127 | for i := e.Sector; i < e.Sector+e.Length; i++ {
128 | s := &m.Sectors[i]
129 | if s.SeqNo <= e.SeqNo { // Equality because of GC
130 | m.updateSector(key, s, targetSector, e)
131 | }
132 | targetSector++
133 | }
134 | }
135 |
// Returns the longest possible extent in the object starting at startSector
// with maximal length length. All sectors of the returned extent belong to
// the same object, carry the same sequential number and are physically
// consecutive within the object.
func (m *SectorMap) getExtent(startSector, length uint64) mapproxy.Extent {
	s := m.Sectors[startSector]
	e := mapproxy.Extent{
		Sector: s.Sector,
		Length: 1,
		SeqNo:  s.SeqNo,
		Flag:   s.Flag,
	}

	// Grow the extent while the run stays: in bounds, inside the
	// requested window, in the same object, with the same sequence
	// number, and physically contiguous in the object.
	for i := startSector + 1; ; i++ {
		if i >= uint64(len(m.Sectors)) ||
			i >= startSector+length ||
			m.Sectors[i].Key != m.Sectors[i-1].Key ||
			m.Sectors[i].SeqNo != e.SeqNo ||
			m.Sectors[i-1].Sector != m.Sectors[i].Sector-1 {

			break
		}

		e.Length++
	}

	return e
}
163 |
164 | // Returns all ObjectParts from which extent starting at sector with length
165 | // length can be reconstructed.
166 | func (m *SectorMap) Lookup(sector, length int64) []mapproxy.ObjectPart {
167 | parts := make([]mapproxy.ObjectPart, 0, typicalObjectPartsPerLookup)
168 | s := m.Sectors[sector].Sector
169 | l := int64(1)
170 | for i := int64(1); i < length; i++ {
171 | id := sector + i
172 | // The next sector is not from the same extent. Store part into
173 | // the returned value and begin new extent.
174 | if (m.Sectors[id].Key != m.Sectors[id-1].Key ||
175 | m.Sectors[id].Sector != m.Sectors[id-1].Sector+1) &&
176 | (m.Sectors[id].Key != -1 || m.Sectors[id-1].Key != notMappedKey) {
177 |
178 | parts = append(parts, mapproxy.ObjectPart{
179 | Sector: s,
180 | Length: l,
181 | Key: m.Sectors[id-1].Key,
182 | })
183 | s = m.Sectors[id].Sector
184 | l = 1
185 | } else {
186 | l++
187 | }
188 | }
189 | parts = append(parts, mapproxy.ObjectPart{
190 | Sector: s,
191 | Length: l,
192 | Key: m.Sectors[sector+length-1].Key,
193 | })
194 | return parts
195 | }
196 |
// Returns all extents and object parts starting from sector with length
// length that are stored in any of the objects with keys in keys. The
// returned ObjectPart carries the logical start sector; its Length is left
// zero — callers use the Extent's length instead.
func (m *SectorMap) FindExtentsWithKeys(sector, length int64, keys map[int64]struct{}) []mapproxy.ExtentWithObjectPart {
	ci := make([]mapproxy.ExtentWithObjectPart, 0, typicalObjectPartsPerLookup)

	// Advance one maximal extent at a time; extents whose object is not
	// in keys are skipped but still consumed from the range.
	for i := sector; i < sector+length && i < int64(len(m.Sectors)); {
		key := m.Sectors[i].Key
		_, ok := keys[key]
		extent := m.getExtent(uint64(i), uint64(sector+length-i))
		if ok {
			op := mapproxy.ObjectPart{
				Sector: i,
				Length: 0,
				Key:    key,
			}
			ci = append(ci, mapproxy.ExtentWithObjectPart{
				Extent:     extent,
				ObjectPart: op,
			})
		}
		i += extent.Length
	}

	return ci
}
222 |
223 | // Returns copy of deadObjects. These are objects with no valid data which can
224 | // be deleted.
225 | func (m *SectorMap) DeadObjects() map[int64]struct{} {
226 | deadObjects := make(map[int64]struct{})
227 |
228 | for k := range m.DeadObjs {
229 | deadObjects[k] = struct{}{}
230 | }
231 |
232 | return deadObjects
233 | }
234 |
235 | // Returns the highest key from the map.
236 | func (m *SectorMap) GetMaxKey() int64 {
237 | var maxKey int64
238 | for k := range m.ObjUtilizations {
239 | if k > maxKey {
240 | maxKey = k
241 | }
242 | }
243 |
244 | return maxKey
245 | }
246 |
247 | // Return copy of the structure representing the object utilization.
248 | // Utilization is number of non-dead sectors.
249 | func (m *SectorMap) ObjectsUtilization() map[int64]int64 {
250 | objectUtilization := make(map[int64]int64)
251 |
252 | for k, v := range m.ObjUtilizations {
253 | objectUtilization[k] = v
254 | }
255 |
256 | return objectUtilization
257 | }
258 |
// Returns serialized version of the map with go gobs.
//
// NOTE(review): the Encode error is silently ignored; on failure the
// returned buffer may be empty or truncated — consider propagating it.
func (m *SectorMap) Serialize() []byte {
	var buf bytes.Buffer

	encoder := gob.NewEncoder(&buf)
	encoder.Encode(m)

	return buf.Bytes()
}
268 |
// Deserializes a map from buf which was previously serialized by Serialize().
// It restores the map and the structures representing object utilization and
// dead objects, and returns the next unassigned object key. During
// deserialization all sequential numbers are zeroed because they are not
// needed and most probably BUSE starts from 0 since it was restarted. The
// map supports device size change.
func (m *SectorMap) DeserializeAndReturnNextKey(buf []byte) int64 {
	// Size of the allocated map
	intendedSize := len(m.Sectors)

	// 1) In case of smaller checkpointed map, i.e. we enlarged the device,
	// the map would be shrinked and we need to resize it to its
	// intended size.
	// 2) In case of larger checkpointed map, i.e. we shrinked the device,
	// the map would be enlarged and we need to resize it to its intended size.
	// NOTE(review): the Decode error is silently ignored; a corrupted
	// checkpoint leaves the map partially initialized.
	decoder := gob.NewDecoder(bytes.NewReader(buf))
	decoder.Decode(m)

	if intendedSize < len(m.Sectors) {
		// Create new map with smaller size and copy the intended range
		// to it. Then replace the map. We could just change the
		// len of the map, but then the memory would be still occupied
		// like in the case of larger map.
		sectors := make([]SectorMetadata, intendedSize)
		copy(sectors, m.Sectors)
		m.Sectors = sectors
	} else {
		// We already have allocated large map, but we decoded smaller
		// one and the len was set according to the decoded
		// (smaller) map. We just change len to its full size.
		m.Sectors = m.Sectors[:cap(m.Sectors)]
	}

	// The next key is one past the highest key referenced by any sector;
	// starting from notMappedKey (-1) yields 0 for an empty map.
	var maxKey int64 = notMappedKey
	for _, s := range m.Sectors {
		if s.Key > maxKey {
			maxKey = s.Key
		}
	}

	for i := range m.Sectors {
		m.Sectors[i].SeqNo = 0
	}

	return maxKey + 1
}
314 |
315 | // Deletes objects with keys from object utilizations.
316 | func (m *SectorMap) DeleteFromUtilization(keys map[int64]struct{}) {
317 | for k := range keys {
318 | delete(m.ObjUtilizations, k)
319 | }
320 | }
321 |
322 | // Deletes objects with keys from deadObjects from dead objects.
323 | func (m *SectorMap) DeleteFromDeadObjects(deadObjects map[int64]struct{}) {
324 | for k := range deadObjects {
325 | _, ok := m.DeadObjs[k]
326 | if ok {
327 | delete(m.DeadObjs, k)
328 | }
329 | }
330 | }
331 |
--------------------------------------------------------------------------------
/internal/bs3/objproxy/objproxy.go:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2021 Vojtech Aschenbrenner
2 |
3 | // Package objproxy is a proxy for ObjectUploadDownloaderAt which performs
4 | // prioritization of various requests.
5 | package objproxy
6 |
7 | import (
8 | "time"
9 | )
10 |
// Interface for s3 backend storage. Anything implementing this interface can
// be used as a storage backend.
type ObjectUploadDownloaderAt interface {
	// Uploads data in buf under the key identifier.
	Upload(key int64, buf []byte) error

	// Downloads data into buf starting from offset in the object
	// identified by key. The length of buf is the length of requested data.
	DownloadAt(key int64, buf []byte, offset int64) error

	// Returns size in bytes of object identified by key. Needed only for
	// garbage collection and extent map recovery. Otherwise can have empty
	// implementation.
	GetObjectSize(key int64) (int64, error)

	// Deletes object identified by key and all successive objects. Needed
	// only for extent map restoration. Otherwise can have empty
	// implementation.
	DeleteKeyAndSuccessors(key int64) error
}
31 |
// Proxy for the backend storage which prioritizes requests. Requests coming to
// the priority channels are handled first. Like this, requests from low
// priority operations like garbage collection do not slow down normal
// operation.
type ObjectProxy struct {
	Instance ObjectUploadDownloaderAt

	// Number of go routines to spawn for handling upload requests and
	// download requests.
	uploaders int
	downloaders int

	// Timeout after which a low priority request can be served.
	// NOTE(review): currently unused — the ticker branch in
	// receiveRequest is commented out.
	idleTimeout time.Duration

	// Internal channels; the *Prio variants carry high priority traffic.
	uploads chan request
	downloads chan request
	uploadsPrio chan request
	downloadsPrio chan request
}
53 |
// Request is internal structure for wrapping the communication into channels.
type request struct {
	// Object key the operation targets.
	key int64
	// Payload to upload, or the buffer to download into.
	data []byte
	// Download offset within the object; unused for uploads.
	offset int64
	// The worker reports the operation result on done.
	done chan error
}
61 |
62 | // Return new instance of the proxy which can be directly used. It immediately
63 | // spawns go routines for upload and download workers.
64 | func New(storeInstance ObjectUploadDownloaderAt, uploaders, downloaders int,
65 | idleTimeout time.Duration) ObjectProxy {
66 |
67 | uploads := make(chan request)
68 | downloads := make(chan request)
69 | uploadsPrio := make(chan request)
70 | downloadsPrio := make(chan request)
71 |
72 | s := ObjectProxy{
73 | Instance: storeInstance,
74 | uploaders: uploaders,
75 | downloaders: downloaders,
76 | idleTimeout: idleTimeout,
77 | uploads: uploads,
78 | downloads: downloads,
79 | uploadsPrio: uploadsPrio,
80 | downloadsPrio: downloadsPrio,
81 | }
82 |
83 | for i := 0; i < s.uploaders; i++ {
84 | go s.uploadWorker()
85 | }
86 |
87 | for i := 0; i < s.downloaders; i++ {
88 | go s.downloadWorker()
89 | }
90 |
91 | return s
92 | }
93 |
94 | // Proxy function for uploading the object with key. It selects the right
95 | // channel according to prio and waits for reply.
96 | func (p *ObjectProxy) Upload(key int64, body []byte, prio bool) error {
97 | c := p.uploads
98 | if prio {
99 | c = p.uploadsPrio
100 | }
101 |
102 | done := make(chan error)
103 | c <- request{key: key, data: body, done: done}
104 | return <-done
105 | }
106 |
107 | // Proxy function for downloading the object with key. It selects the right
108 | // channel according to prio and waits for reply.
109 | func (p *ObjectProxy) Download(key int64, chunk []byte, offset int64, prio bool) error {
110 | c := p.downloads
111 | if prio {
112 | c = p.downloadsPrio
113 | }
114 |
115 | done := make(chan error)
116 | c <- request{key, chunk, offset, done}
117 | return <-done
118 | }
119 |
// Generic function for prioritization used by both uploader and downloader
// workers. A pending priority request is always taken first; only when none
// is immediately available does the worker block on both channels.
// NOTE(review): idleTimeout is currently unused; the ticker branch below
// is commented out.
func (p *ObjectProxy) receiveRequest(prio chan request, normal chan request) request {
	var r request

	select {
	case r = <-prio:
	//case <-time.NewTicker(p.idleTimeout).C:
	default:
		select {
		case r = <-prio:
		case r = <-normal:
		}
	}

	return r
}
136 |
137 | // Upload worker just calls Upload() on the instance provided in New().
138 | func (p *ObjectProxy) uploadWorker() {
139 | for {
140 | r := p.receiveRequest(p.uploadsPrio, p.uploads)
141 | err := p.Instance.Upload(r.key, r.data)
142 | r.done <- err
143 | }
144 | }
145 |
// Download worker just calls DownloadAt() on the instance provided in New().
func (p *ObjectProxy) downloadWorker() {
	for {
		r := p.receiveRequest(p.downloadsPrio, p.downloads)
		err := p.Instance.DownloadAt(r.key, r.data, r.offset)
		r.done <- err
	}
}
154 |
--------------------------------------------------------------------------------
/internal/bs3/objproxy/s3/s3.go:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2021 Vojtech Aschenbrenner
2 |
3 | // Package s3 implements wrapping functions to satisfy ObjectUploadDownloaderAt
4 | // interface. It uses aws api v1.
5 | package s3
6 |
7 | import (
8 | "bytes"
9 | "fmt"
10 | "net"
11 | "net/http"
12 | "time"
13 |
14 | "github.com/aws/aws-sdk-go/aws"
15 | "github.com/aws/aws-sdk-go/aws/credentials"
16 | "github.com/aws/aws-sdk-go/aws/request"
17 | "github.com/aws/aws-sdk-go/aws/session"
18 | "github.com/aws/aws-sdk-go/service/s3"
19 | "github.com/aws/aws-sdk-go/service/s3/s3manager"
20 | "golang.org/x/net/http2"
21 | )
22 |
const (
	// Format string for the object key. There should be no need to set
	// this differently, hence the constant. If you want to change it, keep
	// in mind that we rely on the continuous space of keys for prefix
	// consistency as well as in the GC process.
	//
	// Furthermore we split the key into halves and use the lower half of
	// bits as s3 prefix and upper half for the object key. This is to
	// prevent s3 rate limiting which is applied to objects with the same
	// prefix.
	keyFmt = "%08x/%08x"
)
35 |
// Implementation of ObjectUploadDownloaderAt using AWS S3 as a backend.
// Parameters of http connection are carefully tuned for the best performance
// in the AWS environment.
type S3 struct {
	uploader   *s3manager.Uploader   // managed uploads (Upload)
	downloader *s3manager.Downloader // managed downloads into a WriterAt (DownloadAt)
	client     *s3.S3                // raw client for HEAD, DELETE and LIST calls
	bucket     string                // target bucket; created on demand by makeBucketExist
}
45 |
// Options to use in New() function due to high number of parameters. There is
// lower chance of ordering mistake with named parameters.
type Options struct {
	Remote    string // S3 endpoint address; empty selects the default AWS endpoint
	Region    string
	Bucket    string
	AccessKey string
	SecretKey string
	PartSize  int64 // NOTE(review): not referenced by New() in this file — confirm before relying on it
}
56 |
// Helper struct used for tuning the http connection. Each field maps onto the
// corresponding net.Dialer / http.Transport setting in
// newHTTPClientWithSettings.
type httpClientSettings struct {
	connect          time.Duration // net.Dialer.Timeout (TCP connect)
	connKeepAlive    time.Duration // net.Dialer.KeepAlive
	expectContinue   time.Duration // http.Transport.ExpectContinueTimeout
	idleConn         time.Duration // http.Transport.IdleConnTimeout
	maxAllIdleConns  int           // http.Transport.MaxIdleConns
	maxHostIdleConns int           // http.Transport.MaxIdleConnsPerHost
	responseHeader   time.Duration // http.Transport.ResponseHeaderTimeout
	tlsHandshake     time.Duration // http.Transport.TLSHandshakeTimeout
}
68 |
69 | // Returns http client with configured parameters and added https2 support.
70 | func newHTTPClientWithSettings(httpSettings httpClientSettings) *http.Client {
71 | tr := &http.Transport{
72 | ResponseHeaderTimeout: httpSettings.responseHeader,
73 | Proxy: http.ProxyFromEnvironment,
74 | DialContext: (&net.Dialer{
75 | KeepAlive: httpSettings.connKeepAlive,
76 | DualStack: true,
77 | Timeout: httpSettings.connect,
78 | }).DialContext,
79 | MaxIdleConns: httpSettings.maxAllIdleConns,
80 | IdleConnTimeout: httpSettings.idleConn,
81 | TLSHandshakeTimeout: httpSettings.tlsHandshake,
82 | MaxIdleConnsPerHost: httpSettings.maxHostIdleConns,
83 | ExpectContinueTimeout: httpSettings.expectContinue,
84 | }
85 |
86 | http2.ConfigureTransport(tr)
87 |
88 | return &http.Client{
89 | Transport: tr,
90 | }
91 | }
92 |
93 | // Upload function implemented through s3 api.
94 | func (s *S3) Upload(key int64, buf []byte) error {
95 | _, err := s.uploader.Upload(&s3manager.UploadInput{
96 | Bucket: aws.String(s.bucket),
97 | Key: aws.String(encode(key)),
98 | Body: bytes.NewReader(buf),
99 | })
100 |
101 | return err
102 | }
103 |
104 | // GetObjectSize function implemented through s3 api.
105 | func (s *S3) GetObjectSize(key int64) (int64, error) {
106 | head, err := s.client.HeadObject(&s3.HeadObjectInput{
107 | Bucket: aws.String(s.bucket),
108 | Key: aws.String(encode(key)),
109 | })
110 |
111 | var size int64
112 | if err == nil {
113 | size = *head.ContentLength
114 | }
115 |
116 | return size, err
117 | }
118 |
119 | // DownloadAt function implemented through s3 api.
120 | func (s *S3) DownloadAt(key int64, buf []byte, offset int64) error {
121 | to := offset + int64(len(buf)) - 1
122 | rng := fmt.Sprintf("bytes=%d-%d", offset, to)
123 | b := aws.NewWriteAtBuffer(buf)
124 |
125 | _, err := s.downloader.Download(b, &s3.GetObjectInput{
126 | Bucket: aws.String(s.bucket),
127 | Key: aws.String(encode(key)),
128 | Range: &rng,
129 | })
130 |
131 | return err
132 | }
133 |
134 | // Delete function implemented through s3 api.
135 | func (s *S3) Delete(key int64) error {
136 | _, err := s.client.DeleteObject(&s3.DeleteObjectInput{
137 | Bucket: aws.String(s.bucket),
138 | Key: aws.String(encode(key)),
139 | })
140 |
141 | return err
142 | }
143 |
// New returns an S3 backend configured according to o and verifies that the
// target bucket exists, creating it when necessary. The returned error, if
// any, comes from session creation or from the bucket check.
func New(o Options) (*S3, error) {
	s := new(S3)
	s.bucket = o.Bucket

	// For the best possible performance (throughput close to 10GB/s) it
	// should be tuned according to the object backend.
	// Following settings are recommended by AWS for usage in their
	// network.
	httpClient := newHTTPClientWithSettings(httpClientSettings{
		connect:          5 * time.Second,
		expectContinue:   1 * time.Second,
		idleConn:         90 * time.Second,
		connKeepAlive:    30 * time.Second,
		maxAllIdleConns:  100,
		maxHostIdleConns: 10,
		responseHeader:   5 * time.Second,
		tlsHandshake:     5 * time.Second,
	})

	sess, err := session.NewSession(&aws.Config{
		Endpoint:                      aws.String(o.Remote),
		Region:                        aws.String(o.Region),
		Credentials:                   credentials.NewStaticCredentials(o.AccessKey, o.SecretKey, ""),
		S3ForcePathStyle:              aws.Bool(true),
		S3DisableContentMD5Validation: aws.Bool(true),
		HTTPClient:                    httpClient,
	})

	if err != nil {
		return nil, err
	}

	s.client = s3.New(sess)
	s.uploader = s3manager.NewUploader(sess)
	s.downloader = s3manager.NewDownloader(sess)

	// Limiting the concurrency of s3 library. We do not benefit from
	// multipart uploads/downloads because we have small objects. The only
	// exception is downloading/uploading the extent map during initial
	// recover or final map upload. This should be tuned if your map is
	// huge (= huge device) and you have fast network and don't want to
	// wait.
	s.uploader.Concurrency = 1
	// Every uploaded request carries X-Amz-Content-Sha256: UNSIGNED-PAYLOAD.
	s3manager.WithUploaderRequestOptions(request.Option(func(r *request.Request) {
		r.HTTPRequest.Header.Add("X-Amz-Content-Sha256", "UNSIGNED-PAYLOAD")
	}))(s.uploader)
	s.downloader.Concurrency = 1

	err = s.makeBucketExist()

	return s, err
}
196 |
197 | // Check whether bucket exist and if not, create it and wait until it appears.
198 | func (s *S3) makeBucketExist() error {
199 | _, err := s.client.HeadBucket(&s3.HeadBucketInput{Bucket: aws.String(s.bucket)})
200 |
201 | if err != nil {
202 | _, err = s.client.CreateBucket(&s3.CreateBucketInput{
203 | Bucket: aws.String(s.bucket)})
204 |
205 | if err == nil {
206 | err = s.client.WaitUntilBucketExists(&s3.HeadBucketInput{
207 | Bucket: aws.String(s.bucket)})
208 | }
209 | }
210 |
211 | return err
212 | }
213 |
214 | // Delete object with key and all objects with higher keys.
215 | func (s *S3) DeleteKeyAndSuccessors(fromKey int64) error {
216 | err := s.client.ListObjectsV2Pages(&s3.ListObjectsV2Input{
217 | Bucket: aws.String(s.bucket),
218 | }, func(page *s3.ListObjectsV2Output, last bool) bool {
219 | for _, o := range page.Contents {
220 | key := decode(*o.Key)
221 | if key >= fromKey {
222 | s.Delete(key)
223 | }
224 | }
225 | return true
226 | })
227 |
228 | return err
229 | }
230 |
231 | // We split the key into halves and use the lower half of bits as s3 prefix and
232 | // upper half for the object key. This is to prevent s3 rate limiting which is
233 | // applied to objects with the same prefix.
234 | func encode(key int64) string {
235 | left := (key >> 32) & 0xffffffff
236 | right := key & 0xffffffff
237 |
238 | return fmt.Sprintf(keyFmt, right, left)
239 | }
240 |
241 | // The inverse to encode()
242 | func decode(keyWithPrefix string) int64 {
243 | var prefix, key int64
244 | fmt.Sscanf(keyWithPrefix, keyFmt, &prefix, &key)
245 |
246 | k := (key << 32) + prefix
247 |
248 | return k
249 | }
250 |
--------------------------------------------------------------------------------
/internal/config/config.go:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2021 Vojtech Aschenbrenner
2 |
3 | // Package config is a singleton and provides global access to the
4 | // configuration values.
5 | package config
6 |
7 | import (
8 | "flag"
9 | "os"
10 |
11 | "github.com/ilyakaznacheev/cleanenv"
12 | )
13 |
const (
	// Default config path. It does not need to exist, default values for all parameters will be
	// used instead.
	defaultConfig = "/etc/bs3/config.toml"
)

// Cfg is the package-level configuration singleton, populated by Configure().
var Cfg Config
21 |
// Configuration structure for the program. We use toml format for file-based
// configuration and also all configuration options can be overridden by
// environment variable specified in this structure.
//
// Note that Size, Write.BufSize, Write.ChunkSize, Write.CollisionSize and
// Read.BufSize are given in human units (GB/MB) and are converted to bytes by
// parse().
type Config struct {
	ConfigPath string

	Null bool `toml:"null" env:"BS3_NULL" env-default:"false" env-description:"Use null backend, i.e. immediate acknowledge to read or write. For testing BUSE raw performance."`
	Major int `toml:"major" env:"BS3_MAJOR" env-default:"0" env-description:"Device major. Decimal part of /dev/buse%d."`
	Threads int `toml:"threads" env:"BS3_THREADS" env-default:"0" env-description:"Number of user-space threads for serving queues."`
	CPUsPerNode int `toml:"cpus_per_node" env:"BS3_CPUS_PER_NODE" env-default:"0" env-description:"Number of CPUs per one numa node."`
	Size int64 `toml:"size" env:"BS3_SIZE" env-default:"8" env-description:"Device size in GB."`
	BlockSize int `toml:"block_size" env:"BS3_BLOCKSIZE" env-default:"4096" env-description:"Block size."`
	IOMin int `toml:"io_min" env:"BS3_IO_MIN" env-default:"0" env-description:"Minimal IO."`
	IOOpt int `toml:"io_opt" env:"BS3_IO_OPT" env-default:"0" env-description:"Optimal IO."`
	Scheduler bool `toml:"scheduler" env:"BS3_SCHEDULER" env-default:"false" env-description:"Use block layer scheduler."`
	QueueDepth int `toml:"queue_depth" env:"BS3_QUEUEDEPTH" env-default:"128" env-description:"Device IO queue depth."`

	S3 struct {
		Bucket string `toml:"bucket" env:"BS3_S3_BUCKET" env-description:"S3 Bucket name." env-default:"bs3"`
		Remote string `toml:"remote" env:"BS3_S3_REMOTE" env-description:"S3 Remote address. Empty string for AWS S3 endpoint." env-default:""`
		Region string `toml:"region" env:"BS3_S3_REGION" env-description:"S3 Region." env-default:"us-east-1"`
		AccessKey string `toml:"access_key" env:"BS3_S3_ACCESSKEY" env-description:"S3 Access Key." env-default:""`
		SecretKey string `toml:"secret_key" env:"BS3_S3_SECRETKEY" env-description:"S3 Secret Key." env-default:""`
		Uploaders int `toml:"uploaders" env:"BS3_S3_UPLOADERS" env-description:"S3 Max number of uploader threads." env-default:"16"`
		Downloaders int `toml:"downloaders" env:"BS3_S3_DOWNLOADERS" env-description:"S3 Max number of downloader threads." env-default:"16"`
	} `toml:"s3"`

	Write struct {
		Durable bool `toml:"durable" env:"BS3_WRITE_DURABLE" env-description:"Flush semantics. True means durable, false means barrier only." env-default:"false"`
		BufSize int `toml:"shared_buffer_size" env:"BS3_WRITE_BUFSIZE" env-description:"Write shared memory size in MB." env-default:"32"`
		ChunkSize int `toml:"chunk_size" env:"BS3_WRITE_CHUNKSIZE" env-description:"Chunk size in MB." env-default:"4"`
		CollisionSize int `toml:"collision_chunk_size" env:"BS3_WRITE_COLSIZE" env-description:"Collision size in MB." env-default:"1"`
	} `toml:"write"`

	Read struct {
		BufSize int `toml:"shared_buffer_size" env:"BS3_READ_BUFSIZE" env-description:"Read shared memory size in MB." env-default:"32"`
	} `toml:"read"`

	GC struct {
		Step int64 `toml:"step" env:"BS3_GC_STEP" env-description:"Step for traversing the extent map for living extents. In blocks." env-default:"1024"`
		LiveData float64 `toml:"live_data" env:"BS3_GC_LIVEDATA" env-description:"Live data ratio threshold for threshold GC. This is for the threshold GC which is triggered by the user or systemd timer." env-default:"0.3"`
		IdleTimeoutMs int64 `toml:"idle_timeout" env:"BS3_GC_IDLETIMEOUT" env-description:"Idle timeout for running GC requests. In ms." env-default:"200"`
		Wait int64 `toml:"wait" env:"BS3_GC_WAIT" env-description:"How many seconds wait before next dead GC round. This just for cleaning dead objects with minimal performance impact." env-default:"600"`
	} `toml:"gc"`

	Log struct {
		Level int `toml:"level" env:"BS3_LOG_LEVEL" env-description:"Log level." env-default:"-1"`
		Pretty bool `toml:"pretty" env:"BS3_LOG_PRETTY" env-description:"Pretty logging." env-default:"true"`
	} `toml:"log"`

	SkipCheckpoint bool `toml:"skip_checkpoint" env:"BS3_SKIP" env-description:"Skip restoring from and creating checkpoint." env-default:"false"`
	Profiler bool `toml:"profiler" env:"BS3_PROFILER" env-description:"Enable golang web profiler." env-default:"false"`
	ProfilerPort int `toml:"profiler_port" env:"BS3_PROFILER_PORT" env-description:"Port to listen on." env-default:"6060"`
}
76 |
77 | // Configure reads commandline flags and handles the configuration. The
78 | // configuration file has the lower priotiry and the environment variables have
79 | // the highest priority. It is perfetcly to fine to use just one of these or to
80 | // combine them.
81 | func Configure() error {
82 | flagSetup()
83 | err := parse()
84 |
85 | return err
86 | }
87 |
88 | // Parse the configuration file and reads the environment variable. After that
89 | // it does some values postprocessing and fills the Cfg structure.
90 | func parse() error {
91 | if err := cleanenv.ReadConfig(Cfg.ConfigPath, &Cfg); err != nil {
92 | if err := cleanenv.ReadEnv(&Cfg); err != nil {
93 | return err
94 | }
95 | }
96 |
97 | Cfg.Size *= 1024 * 1024 * 1024
98 | Cfg.Write.BufSize *= 1024 * 1024
99 | Cfg.Write.ChunkSize *= 1024 * 1024
100 | Cfg.Write.CollisionSize *= 1024 * 1024
101 | Cfg.Read.BufSize *= 1024 * 1024
102 |
103 | if Cfg.BlockSize != 512 {
104 | Cfg.BlockSize = 4096
105 | }
106 |
107 | if Cfg.IOMin == 0 {
108 | Cfg.IOMin = Cfg.BlockSize
109 | }
110 |
111 | if Cfg.IOOpt == 0 {
112 | Cfg.IOOpt = Cfg.BlockSize
113 | }
114 |
115 | return nil
116 | }
117 |
118 | // Handle program flags.
119 | func flagSetup() {
120 | f := flag.NewFlagSet("bs3", flag.ExitOnError)
121 | f.StringVar(&Cfg.ConfigPath, "c", defaultConfig, "Path to configuration file")
122 | f.Usage = cleanenv.FUsage(f.Output(), &Cfg, nil, f.Usage)
123 | f.Parse(os.Args[1:])
124 | }
125 |
--------------------------------------------------------------------------------
/internal/null/null.go:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2021 Vojtech Aschenbrenner
2 |
3 | // Null package does nothing but correctly.
4 | package null
5 |
// Null implementation of BuseReadWriter. Useful for measuring performance of
// underlying BUSE and buse library, otherwise useless. It lives in the same
// module to avoid duplication in BUSE code and configuration, and can also
// serve as a template for a new BUSE device implementation since it
// implements the BuseReadWriter interface.
type null struct{}

// NewNull returns a ready-to-use no-op BuseReadWriter implementation.
func NewNull() *null {
	return new(null)
}

// BuseWrite acknowledges the write batch immediately without touching chunk.
func (n *null) BuseWrite(writes int64, chunk []byte) error {
	return nil
}

// BuseRead acknowledges the read immediately, leaving chunk untouched.
func (n *null) BuseRead(sector, length int64, chunk []byte) error {
	return nil
}

// BusePreRun intentionally does nothing.
func (n *null) BusePreRun() {}

// BusePostRemove intentionally does nothing.
func (n *null) BusePostRemove() {}
31 |
--------------------------------------------------------------------------------
/main.go:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2021 Vojtech Aschenbrenner
2 |
3 | // bs3 is a userspace daemon using BUSE for creating a block device and S3
4 | // protocol to communicate with object backend. It is designed for easy
5 | // extension of all the important parts. Hence the S3 protocol can be easily
6 | // replaced by RADOS or any other protocol.
7 | //
8 | // Project structure is following:
9 | //
10 | // - internal contains all packages used by this program. The name "internal"
11 | // is reserved by go compiler and disallows its imports from different
12 | // projects. Since we don't provide any reusable packages, we use internal
13 | // directory.
14 | //
15 | // - internal/bs3 contains all packages related only to the bs3 implementation.
16 | // See the package descriptions in the source code for more details.
17 | //
18 | // - internal/null contains trivial implementation of block device which does
19 | // nothing but correctly. It can be used for benchmarking underlying buse
20 | // library and kernel module. The null implementation is part of bs3 because it
21 | // shares configuration and makes benchmarking easier and without code
22 | // duplication.
23 | //
24 | // - internal/config contains configuration package which is common for both,
25 | // bs3 and null implementations.
26 | package main
27 |
28 | import (
29 | "fmt"
30 | "net/http"
31 | _ "net/http/pprof"
32 | "os"
33 | "os/signal"
34 | "syscall"
35 |
36 | "github.com/rs/zerolog"
37 | "github.com/rs/zerolog/log"
38 |
39 | "github.com/asch/bs3/internal/bs3"
40 | "github.com/asch/bs3/internal/config"
41 | "github.com/asch/bs3/internal/null"
42 | "github.com/asch/buse/lib/go/buse"
43 | )
44 |
// main parses configuration from file and environment variables, creates a
// BuseReadWriter and creates a new buse device with it. The device is run
// until it is signaled by SIGINT or SIGTERM to gracefully finish.
func main() {
	err := config.Configure()
	if err != nil {
		log.Panic().Err(err).Send()
	}

	loggerSetup(config.Cfg.Log.Pretty, config.Cfg.Log.Level)

	log.Info().Msgf("Configuration for block device buse%d loaded from %s",
		config.Cfg.Major, config.Cfg.ConfigPath)

	if config.Cfg.Profiler {
		log.Info().Msg("Running profiler.")
		runProfiler(config.Cfg.ProfilerPort)
	}

	// Either the null backend (benchmarking) or the real bs3 backend.
	buseReadWriter, err := getBuseReadWriter(config.Cfg.Null)
	if err != nil {
		log.Panic().Err(err).Send()
	}

	buse, err := buse.New(buseReadWriter, buse.Options{
		Durable:        config.Cfg.Write.Durable,
		WriteChunkSize: int64(config.Cfg.Write.ChunkSize),
		BlockSize:      int64(config.Cfg.BlockSize),
		IOMin:          int64(config.Cfg.IOMin),
		Threads:        int(config.Cfg.Threads),
		Major:          int64(config.Cfg.Major),
		WriteShmSize:   int64(config.Cfg.Write.BufSize),
		ReadShmSize:    int64(config.Cfg.Read.BufSize),
		Size:           int64(config.Cfg.Size),
		CollisionArea:  int64(config.Cfg.Write.CollisionSize),
		QueueDepth:     int64(config.Cfg.QueueDepth),
		Scheduler:      config.Cfg.Scheduler,
		CPUsPerNode:    config.Cfg.CPUsPerNode,
	})

	if err != nil {
		log.Panic().Msg(err.Error())
	}
	log.Info().Msgf("Block device buse%d registered.", config.Cfg.Major)

	// SIGINT/SIGTERM trigger StopDevice(), which lets Run() return.
	registerSigHandlers(buse)

	buse.Run()
	log.Info().Msgf("Block device buse%d stopped.", config.Cfg.Major)

	buse.RemoveDevice()
	log.Info().Msgf("Block device buse%d removed.", config.Cfg.Major)
}
98 |
99 | // Return null device if user wants it, otherwise returns bs3 device, which is
100 | // default.
101 | func getBuseReadWriter(wantNullDevice bool) (buse.BuseReadWriter, error) {
102 | if wantNullDevice {
103 | return null.NewNull(), nil
104 | }
105 |
106 | bs3, err := bs3.NewWithDefaults()
107 |
108 | return bs3, err
109 | }
110 |
111 | // Register handler for graceful stop when SIGINT or SIGTERM came in.
112 | func registerSigHandlers(buse buse.Buse) {
113 | stopChan := make(chan os.Signal, 1)
114 | signal.Notify(stopChan, os.Interrupt)
115 | signal.Notify(stopChan, syscall.SIGTERM)
116 | go func() {
117 | <-stopChan
118 | log.Info().Msg("Stopping bs3 device.")
119 | buse.StopDevice()
120 | }()
121 | }
122 |
123 | func loggerSetup(pretty bool, level int) {
124 | if pretty {
125 | log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr})
126 | }
127 |
128 | zerolog.SetGlobalLevel(zerolog.Level(level))
129 | }
130 |
131 | // Enables remote profiling support. Useful for perfomance debugging.
132 | func runProfiler(port int) {
133 | go func() {
134 | log.Info().Err(http.ListenAndServe(fmt.Sprintf("localhost:%d", port), nil)).Send()
135 | }()
136 | }
137 |
--------------------------------------------------------------------------------