├── .gitignore
├── Makefile
├── README.md
├── man
    ├── scoutfs-corruption.7
    ├── scoutfs.5
    └── scoutfs.8
├── scoutfs-utils.spec.in
├── sparse.sh
├── src
    ├── avl.c
    ├── avl.h
    ├── bitmap.c
    ├── bitmap.h
    ├── bitops.h
    ├── btree.c
    ├── btree.h
    ├── cmd.c
    ├── cmd.h
    ├── cmp.h
    ├── counters.c
    ├── crc.c
    ├── crc.h
    ├── dev.c
    ├── dev.h
    ├── df.c
    ├── endian_swap.h
    ├── format.h
    ├── hash.h
    ├── ino_path.c
    ├── ioctl.h
    ├── key.h
    ├── leaf_item_hash.c
    ├── leaf_item_hash.h
    ├── list.h
    ├── listxattr_hidden.c
    ├── main.c
    ├── mkfs.c
    ├── parse.c
    ├── parse.h
    ├── print.c
    ├── rand.c
    ├── rand.h
    ├── search_xattrs.c
    ├── setattr.c
    ├── sparse.h
    ├── srch.c
    ├── srch.h
    ├── stage_release.c
    ├── stat.c
    ├── util.h
    ├── waiting.c
    └── walk_inodes.c
└── tex
    ├── .gitignore
    ├── Makefile
    ├── scoutfs.tex
    ├── usenix2019.sty
    └── usenix2019.tex


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.o
 2 | *.d
 3 | *.swp
 4 | src/scoutfs
 5 | .sparse*
 6 | .mock.build*
 7 | cscope.*
 8 | scoutfs-utils.spec
 9 | scoutfs-utils-*.tar
10 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # Build the scoutfs userspace utility and package it for RPM builds.
 2 | 
 3 | # Remove the targets of a failed recipe.  The compile rule creates the
 4 | # object before it runs sparse, so without this a sparse failure would
 5 | # leave a fresh object behind and the next make would skip the check.
 6 | .DELETE_ON_ERROR:
 7 | 
 8 | # First 16 hex digits of the md5 of the format and ioctl headers, handed
 9 | # to the C sources as the SCOUTFS_FORMAT_HASH constant.
10 | SCOUTFS_FORMAT_HASH := \
11 | 	$(shell cat src/format.h src/ioctl.h | md5sum | cut -b1-16)
12 | 
13 | # Defaults to gcc as before; can be overridden with "make CC=...".
14 | CC := gcc
15 | 
16 | CFLAGS := -Wall -O2 -Werror -D_FILE_OFFSET_BITS=64 -g -msse4.2 \
17 | 	-Wpadded \
18 | 	-fno-strict-aliasing \
19 | 	-DSCOUTFS_FORMAT_HASH=0x$(SCOUTFS_FORMAT_HASH)LLU
20 | 
21 | BIN := src/scoutfs
22 | OBJ := $(patsubst %.c,%.o,$(wildcard src/*.c))
23 | DEPS := $(wildcard */*.d)
24 | 
25 | all: $(BIN)
26 | 
27 | # Pull in the header dependencies that previous compiles generated.
28 | ifneq ($(DEPS),)
29 | -include $(DEPS)
30 | endif
31 | 
32 | # Quiet by default: print short [TAG file] lines and hide the commands.
33 | # Setting V to anything (make V=1) echoes the full commands instead.
34 | ifeq ($(V), )
35 | QU = @echo
36 | VE = @
37 | else
38 | QU = @:
39 | VE =
40 | endif
41 | 
42 | $(BIN): $(OBJ)
43 | 	$(QU)  [BIN $@]
44 | 	$(VE)$(CC) -o $@ $^ -luuid -lm -lcrypto
45 | 
46 | # Each compile also writes the .d dependency file and then checks the
47 | # source with sparse.sh; a non-zero exit from sparse.sh fails the rule.
48 | %.o %.d: %.c Makefile sparse.sh
49 | 	$(QU)  [CC $<]
50 | 	$(VE)$(CC) $(CFLAGS) -MD -MP -MF $*.d -c $< -o $*.o
51 | 	$(QU)  [SP $<]
52 | 	$(VE)./sparse.sh -Wbitwise -D__CHECKER__ $(CFLAGS) $<
53 | 
54 | # None of these name a file that the build creates.
55 | .PHONY: all dist clean .FORCE
56 | 
57 | # - We use the git describe from tags to set up the RPM versioning
58 | RPM_VERSION := $(shell git describe --long --tags | awk -F '-' '{gsub(/^v/,""); print $$1}')
59 | RPM_GITHASH := $(shell git rev-parse --short HEAD)
60 | 
61 | # Regenerated on every run (.FORCE).  The output is written to a
62 | # temporary name and renamed so a failed sed never leaves a partial spec.
63 | %.spec: %.spec.in .FORCE
64 | 	sed -e 's/@@VERSION@@/$(RPM_VERSION)/g' \
65 | 	    -e 's/@@GITHASH@@/$(RPM_GITHASH)/g' < $< > $@+
66 | 	mv $@+ $@
67 | 
68 | TARFILE = scoutfs-utils-$(RPM_VERSION).tar
69 | 
70 | # Archive HEAD and then append the generated spec file under the same
71 | # prefix.
72 | # NOTE(review): RPM_DIR is not set anywhere in this Makefile, so it
73 | # expands to nothing unless the caller supplies it -- confirm whether
74 | # the prerequisite is still wanted.
75 | dist: $(RPM_DIR) scoutfs-utils.spec
76 | 	git archive --format=tar --prefix scoutfs-utils-$(RPM_VERSION)/ HEAD^{tree} > $(TARFILE)
77 | 	@ tar rf $(TARFILE) --transform="s@\(.*\)@scoutfs-utils-$(RPM_VERSION)/\1@" scoutfs-utils.spec
78 | 
79 | clean:
80 | 	@$(RM) $(BIN) $(OBJ) $(DEPS) .sparse.*
81 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | This repository contains the userspace software for the scoutfs
2 | clustered filesystem.
3 | 
4 | More context and instructions can be found on the https://scoutfs.org/
5 | community site or in the
6 | [scoutfs-kmod-dev](https://github.com/versity/scoutfs-kmod-dev) git
7 | repository which houses the scoutfs Linux kernel module.
8 | 


--------------------------------------------------------------------------------
/man/scoutfs-corruption.7:
--------------------------------------------------------------------------------
  1 | .TH scoutfs-corruption 7
  2 | .SH NAME
  3 | scoutfs-corruption \- corruption message details
  4 | .SH DESCRIPTION
  5 | If scoutfs detects corruption during operation it will output an error
  6 | message describing the corruption.  This document gives details of the
  7 | corruption described by the messages.
  8 | .SH CORRUPTION MESSAGE IDENTIFIERS
  9 | .TP
 10 | .B SC_DIRENT_NAME_LEN
 11 | A directory entry with an invalid name length was found during lookup.
 12 | 
 13 | Directory entries are stored in the values of metadata items.  The item
 14 | value contains a small header and the full entry name.  The length of
 15 | the entry name is calculated by subtracting the size of the header from
 16 | the length of the item value.  This corruption is detected if the length
 17 | of the calculated name length is invalid by being less than 1 or greater
 18 | than 255.
 19 | 
 20 | .BR dir_ino " - inode number of directory that contains the item"
 21 | .br
 22 | .BR hash " - hash value of search name"
 23 | .br
 24 | .BR key " - identifies the item with the invalid name length"
 25 | .br
 26 | .BR len " - the invalid calculated name length"
 27 | .sp
 28 | .TP
 29 | .B SC_DIRENT_READDIR_NAME_LEN
 30 | A directory entry with an invalid name length was found during readdir.
 31 | 
 32 | This corruption is very similar to
 33 | .B SC_DIRENT_NAME_LEN
 34 | except that the corruption is discovered during readdir instead of
 35 | lookup.  The readdir search key is formed from the file position instead
 36 | of from the hashed name as in lookup.   The dirent structure stored in
 37 | the item value is the same.
 38 | 
 39 | .BR dir_ino " - inode number of directory that contains the item"
 40 | .br
 41 | .BR pos " - the file position readdir was searching from"
 42 | .br
 43 | .BR key " - identifies the item with the invalid name length"
 44 | .br
 45 | .BR len " - the invalid calculated name length"
 46 | .sp
 47 | 
 48 | .TP
 49 | .B SC_DIRENT_BACKREF_NAME_LEN
 50 | A directory entry with an invalid name length was found while finding
 51 | entries that point to an inode.
 52 | 
 53 | This corruption is very similar to
 54 | .B SC_DIRENT_NAME_LEN
 55 | except that the
 56 | corruption is discovered while finding entries that refer to a specific
 57 | inode.  The search key is formed from the inode and position of the
 58 | referring entry instead of from the hashed name as in lookup.   The
 59 | dirent structure stored in the item value is the same.
 60 | 
 61 | .BR ino " - target inode number we're finding entries to"
 62 | .br
 63 | .BR dir_ino " - inode number of directory containing entries to search"
 64 | .br
 65 | .BR pos " - position in directory containing entries to search"
 66 | .br
 67 | .BR key " - identifies the item with the invalid name length"
 68 | .br
 69 | .BR len " - the invalid calculated name length"
 70 | .sp
 71 | 
 72 | .TP
 73 | .B SC_SYMLINK_INODE_SIZE
 74 | A symlink inode contained an invalid size.
 75 | 
 76 | The i_size field of the inode that stores a symlink records the length
 77 | of the path of the symlink target.  The path length can't be less than 1
 78 | or greater than the max size which is around 4KiB.
 79 | 
 80 | .BR ino " - inode number of the symlink with the invalid size"
 81 | .br
 82 | .BR size " - the invalid size found in the inode"
 83 | .sp
 84 | 
 85 | .TP
 86 | .B SC_SYMLINK_MISSING_ITEM
 87 | The items that contain a symlink target path weren't found.
 88 | 
 89 | The target path of a symlink is stored in a series of metadata items.
 90 | The number of items can be calculated from the size of the path.  While
 91 | trying to resolve a symlink one of the items wasn't found.
 92 | 
 93 | .BR ino " - inode number of the symlink with the invalid size"
 94 | .br
 95 | .BR size " - the length of the target path"
 96 | .sp
 97 | 
 98 | .TP
 99 | .B SC_SYMLINK_NOT_NULL_TERM
100 | A symlink target path wasn't null terminated.
101 | 
102 | The target path stored in a symlink's metadata items wasn't null
103 | terminated.
104 | 
105 | .BR ino " - inode number of the symlink with the invalid size"
106 | .br
107 | .BR last " - the value of the final byte of the path"
108 | .sp
109 | 
110 | .TP
111 | .B SC_BTREE_BLOCK_LEVEL
112 | A btree block's header did not contain the expected level field.
113 | 
114 | The btree root stores the height of the btree and each btree block
115 | stores its level in the tree.  During descent the level is loaded from
116 | the root and decremented as each block is traversed.  This corruption
117 | occurs when a btree block's level field didn't match the level that was
118 | being calculated during descent.
119 | 
120 | .BR root_height " - height of the tree in the root"
121 | .br
122 | .BR root_blkno " - block number of the first block in the root"
123 | .br
124 | .BR root_seq " - sequence number of the first block in the root"
125 | .br
126 | .BR blkno " - block number of the block with mismatched level"
127 | .br
128 | .BR seq " - sequence number of the block with mismatched level"
129 | .br
130 | .BR level " - level of the block with mismatched level"
131 | .br
132 | .BR expected " - expected level that was calculated during descent"
133 | .sp
134 | 
135 | .TP
136 | .B SC_BTREE_NO_CHILD_REF
137 | A btree parent block didn't have a child item for a key.
138 | 
139 | Each child reference in a parent btree block contains the greatest key
140 | that will be stored in the subtree rooted in the child.  The child
141 | references down the right side of the tree must have a key that is
142 | greater than all possible keys.
143 | 
144 | This corruption occurs during descent when the search key was greater
145 | than the last child reference's key.
146 | 
147 | .BR root_height " - height of the tree in the root"
148 | .br
149 | .BR root_blkno " - block number of the first block in the root"
150 | .br
151 | .BR root_seq " - sequence number of the first block in the root"
152 | .br
153 | .BR blkno " - block number of the block with mismatched level"
154 | .br
155 | .BR seq " - sequence number of the block with mismatched level"
156 | .br
157 | .BR level " - level of the block with mismatched level"
158 | .br
159 | .BR nr " - number of items in the parent block"
160 | .br
161 | .BR pos " - child item index that search found"
162 | .br
163 | .BR cmp " - comparison of search key and found"
164 | .sp
165 | 
166 | .SH AUTHORS
167 | Zach Brown <zab@versity.com>
168 | 
169 | 
170 | 


--------------------------------------------------------------------------------
/man/scoutfs.5:
--------------------------------------------------------------------------------
 1 | .TH scoutfs 5
 2 | .SH NAME
 3 | scoutfs \- overview and mount options for the scoutfs filesystem
 4 | .SH DESCRIPTION
 5 | A scoutfs filesystem is stored on two block devices.  Multiple mounts of
 6 | the filesystem are supported between hosts that share access to the
 7 | block devices.  A new filesystem is created with the
 8 | .B mkfs
 9 | command in the
10 | .BR scoutfs (8)
11 | utility.
12 | .SH MOUNT OPTIONS
13 | The following mount options are supported by scoutfs in addition to the
14 | general mount options described in the
15 | .BR mount (8)
16 | manual page.
17 | .TP
18 | .B metadev_path=<device>
19 | The metadev_path option specifies the path to the block device that
20 | contains the filesystem's metadata.
21 | .sp
22 | This option is required.
23 | .TP
24 | .B server_addr=<ipv4:port>
25 | The server_addr option indicates that this mount will participate in
26 | quorum election to try and run a server for all the mounts of its
27 | filesystem.   The option specifies the local TCP IPv4 address that the
28 | mount's elected server will listen on for connections from all other
29 | mounts of the filesystem.
30 | .sp
31 | The IPv4 address must be specified as a dotted quad; name resolution is
32 | not supported.  A specific port may be provided after a separating
33 | colon.  If no port is specified then a random port will be chosen.  The
34 | address will be used for the lifetime of the mount and can not be
35 | changed.  The mount must be unmounted to specify a different address.
36 | .sp
37 | If server_addr is not specified then the mount will read the filesystem
38 | until it sees the address of an elected server to connect to.
39 | .SH FURTHER READING
40 | A
41 | .B scoutfs
42 | filesystem can detect corruption at runtime.  A catalog of kernel log
43 | messages that indicate corruption can be found in
44 | .BR scoutfs-corruption (7)
45 | \&.
46 | 
47 | .SH SEE ALSO
48 | .BR scoutfs (8),
49 | .BR scoutfs-corruption (7).
50 | 
51 | .SH AUTHORS
52 | Zach Brown <zab@versity.com>
53 | 
54 | 
55 | 


--------------------------------------------------------------------------------
/man/scoutfs.8:
--------------------------------------------------------------------------------
  1 | .TH scoutfs 8
  2 | .SH NAME
  3 | scoutfs \- scoutfs management utility
  4 | .SH DESCRIPTION
  5 | The
  6 | .B
  7 | scoutfs
  8 | utility provides commands to manage a scoutfs filesystem.
  9 | .SH COMMANDS
 10 | .TP
 11 | .BI "counters [\-t] <sysfs topdir>"
 12 | .sp
 13 | Displays the counters and their values for a mounted scoutfs filesystem.
 14 | Each counter and its value are printed on a line to stdout with
 15 | sufficient spaces separating the name and value to align the values
 16 | after the names.
 17 | .RS 1.0i
 18 | .PD 0
 19 | .TP
 20 | .sp
 21 | .B "\-t"
 22 | Format the counters into a table that fills the display instead of
 23 | printing one counter per line.  The names and values are padded to
 24 | create columns that fill the current width of the terminal.
 25 | .TP
 26 | .B "sysfs topdir"
 27 | Specify the mount's sysfs directory in which to find the
 28 | .B counters/
 29 | directory which then contains files for each counter.
 30 | The sysfs directory is typically
 31 | of the form
 32 | .I /sys/fs/scoutfs/f.<fsid>.r.<rid>/
 33 | \&.
 34 | .RE
 35 | .PD
 36 | 
 37 | .TP
 38 | .BI "data-waiting <ino> <iblock> <path>"
 39 | .sp
 40 | Displays all the files and blocks for which there is a task blocked waiting on
 41 | offline data.
 42 | .sp
 43 | The results are sorted by the file's inode number and the
 44 | logical block offset that is being waited on.
 45 | .sp
 46 | Each line of output specifies a block in a file that has a task waiting
 47 | and is formatted as:
 48 | .I "ino <nr> iblock <nr> ops [str]"
 49 | \&. The ops string indicates blocked operations separated by commas and can
 50 | include 
 51 | .B read
 52 | for a read operation,
 53 | .B write
 54 | for a write operation, and
 55 | .B change_size
 56 | for a truncate or extending write.
 57 | .RS 1.0i
 58 | .PD 0
 59 | .sp
 60 | .TP
 61 | .B "ino"
 62 | Start iterating over waiting tasks from the given inode number.
 63 | Specifying 0 will show all waiting tasks.
 64 | .TP
 65 | .B "iblock"
 66 | Start iterating over waiting tasks from the given logical block number
 67 | in the starting inode.  Specifying 0 will show blocks in the first inode
 68 | and then continue to show all blocks with tasks waiting in all the
 69 | remaining inodes.
 70 | .TP
 71 | .B "path"
 72 | A path to any inode in the target filesystem, typically the root
 73 | directory.
 74 | .RE
 75 | .PD
 76 | 
 77 | .TP
 78 | .BI "find-xattrs <\-n\ name> <\-f path>"
 79 | .sp
 80 | Displays the inode numbers of inodes in the filesystem which may have
 81 | an extended attribute with the given name.
 82 | .sp
 83 | The results may contain false positives.  The returned inode numbers
 84 | should be checked to verify that the extended attribute is in fact
 85 | present on the inode.
 86 | .RS 1.0i
 87 | .PD 0
 88 | .TP
 89 | .sp
 90 | .B "-n name"
 91 | Specifies the full name of the extended attribute to search for as
 92 | described in the
 93 | .BR xattr (7)
 94 | manual page.
 95 | .TP
 96 | .B "-f path"
 97 | Specifies the path to any inode in the filesystem to search. 
 98 | .RE
 99 | .PD
100 | 
101 | .TP
102 | .BI "ino-path <ino> <path>"
103 | .sp
104 | Displays all the paths to links to the given inode number.
105 | .sp
106 | All the relative paths from the root directory to each link of the
107 | target inode are output, one result per line.  Each output path is
108 | guaranteed to have been a valid path to a link at some point in the
109 | past.  An individual path won't be corrupted by a rename that occurs
110 | during the search.  The set of paths can be modified while the search is
111 | running.  A rename of a parent directory of all the paths, for example,
112 | can result in output where the parent directory name component changes
113 | in the middle of outputting all the paths.
114 | .RS 1.0i
115 | .PD 0
116 | .sp
117 | .TP
118 | .B "ino"
119 | The inode number of the target inode to resolve.
120 | .TP
121 | .B "path"
122 | A path to any inode in the target filesystem, typically the root
123 | directory.
124 | .RE
125 | .PD
126 | 
127 | .TP
128 | .BI "listxattr-hidden <\-f path>"
129 | .sp
130 | Displays all the extended attributes starting with the
131 | .BR scoutfs.
132 | prefix and which contain the
133 | .BR hide.
134 | tag
135 | which makes them invisible to 
136 | .BR listxattr (2)
137 | \&.
138 | The names of each attribute are output, one name per line.  Their order
139 | is determined by internal indexing implementation details and should not
140 | be relied on.
141 | .RS 1.0i
142 | .PD 0
143 | .TP
144 | .sp
145 | .B "-f path"
146 | The path to the file whose extended attributes will be listed.  The
147 | user must have read permission to the inode.
148 | .RE
149 | .PD
150 | 
151 | .TP
152 | .BI "mkfs <\-Q nr> <meta_dev_path> <data_dev_path> [-M meta_size] [-D data_size]"
153 | .sp
154 | Initialize a new empty filesystem in the target devices by writing empty
155 | structures and a new superblock. Since ScoutFS uses separate block
156 | devices for its metadata and data storage, both must be given.
157 | .sp
158 | This 
159 | .B unconditionally destroys
160 | the contents of the devices, regardless of what they contain or who may be
161 | using them.  It simply writes new data structures into known offsets.
162 | .B Be very careful that the devices do not contain data and are not actively in use.
163 | .RS 1.0i
164 | .PD 0
165 | .TP
166 | .sp
167 | .B "-Q nr"
168 | Specify the number of mounts needed to reach quorum and elect a mount
169 | to start the server.  Mounts of the filesystem will hang until this many
170 | mounts are operational and can elect a server amongst themselves.
171 | .sp
172 | Mounts with the 
173 | .B server_addr
174 | mount option participate in quorum.  The safest quorum number is the
175 | smallest majority of an odd number of participating mounts.  For
176 | example,
177 | two out of three total mounts.  This ensures that there can only be one
178 | set of mounts that can establish quorum.
179 | .sp
180 | Degenerate quorums are possible, for example by specifying half of an
181 | even number of mounts or less than half of the mount count, down to even
182 | just one mount establishing quorum. These minority quorums carry the
183 | risk of multiple quorums being established concurrently.  Each quorum's
184 | elected servers race to fence each other and can have the unlikely
185 | outcome of continually racing to fence each other resulting in a
186 | persistent loss of service.
187 | .TP
188 | .B "meta_dev_path"
189 | The path to the device to be used for ScoutFS metadata.  If possible,
190 | use a faster block device for the metadata device.  Its contents will be
191 | unconditionally destroyed.
192 | .TP
193 | .B "data_dev_path"
194 | The path to the device to be used for ScoutFS file data.  If possible,
195 | use a larger block device for the data device.  Its contents will be
196 | unconditionally destroyed.
197 | .TP
198 | .B "-M meta_size"
199 | Limit the space used by the filesystem on the metadata device to the
200 | given size, rather than using the entire block device. Size is given as
201 | an integer followed by a units digit: "K", "M", "G", "T", "P", to denote
202 | kibibytes, mebibytes, etc.
203 | .TP
204 | .B "-D data_size"
205 | Same as previous, but for limiting the size of the data device.
206 | .RE
207 | .PD
208 | 
209 | .TP
210 | .BI "print <path>"
211 | .sp
212 | Prints out all of the metadata in the file system.  This makes no effort
213 | to ensure that the structures are consistent as they're traversed and
214 | can present structures that seem corrupt as they change as they're
215 | output.
216 | .RS 1.0i
217 | .PD 0
218 | .TP
219 | .sp
220 | .B "path"
221 | The path to the metadata device for the filesystem whose metadata will
222 | be printed.  The command reads from the buffer cache of the device which
223 | may not reflect the current blocks in the filesystem that may have been
224 | written through another host or device.  The local device's cache can be
225 | manually flushed before printing, perhaps with the
226 | .B \--flushbufs
227 | option of the
228 | .BR blockdev (8)
229 | command.
230 | .RE
231 | .PD
232 | 
233 | .TP
234 | .BI "release <path> <vers> <4KB block offset> <4KB block count>"
235 | .sp
236 | .B Release
237 | the given logical block region of the file.  That is, truncate away
238 | any data blocks but leave behind offline data regions and do not change
239 | the main inode metadata.  Future attempts to read or write the block
240 | region
241 | will block until the region is restored by a 
242 | .B stage
243 | write.  This is used by userspace archive managers to store file data
244 | in a remote archive tier.
245 | .sp
246 | This only works on regular files and with write permission.  Releasing
247 | regions that are already offline or are sparse, including past the end
248 | of the file, silently succeeds.
249 | .RS 1.0i
250 | .PD 0
251 | .TP
252 | .sp
253 | .B "path"
254 | The path to the regular file whose region will be released.
255 | .TP
256 | .B "vers"
257 | The current data version of the contents of the file.  This ensures
258 | that a release operation is truncating the version of the data that it
259 | expects.  It can't throw away data that was newly written while it was
260 | performing its release operation.  An inode's data_version is read
261 | by the SCOUTFS_IOC_STAT_MORE
262 | ioctl.
263 | .TP
264 | .B "4KB block offset"
265 | The 64bit logical block offset of the start of the region in units of 4KB.
266 | .TP
267 | .B "4KB block count"
268 | The 64bit length of the region to release in units of 4KB blocks.
269 | .RE
270 | .PD
271 | 
272 | .TP
273 | .BI "setattr <\-c ctime> <\-d data_version> -o <\-s i_size> <\-f path>"
274 | .sp
275 | Set scoutfs specific metadata on a newly created inode without updating
276 | other inode metadata.
277 | .RS 1.0i
278 | .PD 0
279 | .TP
280 | .sp
281 | .B "-c ctime"
282 | Specify the inode's creation GMT timespec with 64bit seconds and 32bit
283 | nanoseconds formatted as 
284 | .B sec.nsec
285 | \&.
286 | .TP
287 | .B "-d data_version"
288 | Specify the inode's data version.  This can only be set on regular files whose
289 | current data_version is 0.
290 | .TP
291 | .B "-o"
292 | Create an offline region for all of the file's data up to the specified
293 | file size.  This can only be set on regular files whose data_version is
294 | 0 and i_size must also be specified.
295 | .TP
296 | .B "-s i_size"
297 | Set the inode's i_size.  This can only be set on regular files whose
298 | data_version is 0.
299 | .TP
300 | .B "-f path"
301 | The file whose metadata will be set.
302 | .RE
303 | .PD
304 | 
305 | .TP
306 | .BI "stage <file> <vers> <offset> <count> <archive file>"
307 | .sp
308 | .B Stage
309 | the contents of the file by reading a region of another archive file and writing it
310 | into the file region without updating regular inode metadata.  Any tasks
311 | that are blocked by the offline region will proceed once it has been
312 | staged.
313 | .RS 1.0i
314 | .PD 0
315 | .TP
316 | .sp
317 | .B "file"
318 | The regular file whose contents will be staged.
319 | .TP
320 | .B "vers"
321 | The data_version of the contents to be staged.  It must match the
322 | current data_version of the file.
323 | .TP
324 | .B "offset"
325 | The starting byte offset of the region to write.  This must be aligned
326 | to 4KB blocks.
327 | .TP
328 | .B "count"
329 | The length of the region to write in bytes.  A length of 0 is a noop
330 | and will immediately return success.  The length must be a multiple
331 | of 4KB blocks unless it is writing the final partial block in which
332 | case it must end at i_size.
333 | .TP
334 | .B "archive file"
335 | A file whose contents will be read and written as the staged region.
336 | The start of the archive file will be used as the start of the region.
337 | .RE
338 | .PD
339 | 
340 | .TP
341 | .BI "stat [-s single] <path>"
342 | .sp
343 | Display scoutfs metadata fields for the given inode.
344 | .RS 1.0i
345 | .PD 0
346 | .TP
347 | .sp
348 | .B "-s single"
349 | Only output a single stat instead of all the stats with one stat per
350 | line.  The possible stat names are those given in the output.
351 | .TP
352 | .B "path"
353 | The path to the file whose inode field will be output.
354 | .sp
355 | .TP
356 | .RE
357 | .PD
358 | The fields are as follows:
359 | .RS 1.0i
360 | .PD 0
361 | .TP
362 | .B "meta_seq"
363 | The metadata change sequence.  This changes each time the inode's metadata
364 | is changed during a mount's transaction.
365 | .TP
366 | .B "data_seq"
367 | The data change sequence.  This changes each time the inode's data
368 | is changed during a mount's transaction.
369 | .TP
370 | .B "data_version"
371 | The data version changes every time any contents of the file changes,
372 | including size changes.  It can change many times during a syscall in a
373 | transaction.
374 | .TP
375 | .B "online_blocks"
376 | The number of 4KB data blocks that contain data and can be read.
377 | .TP
378 | .B "offline_blocks"
379 | The number of 4KB data blocks that are offline and would need to be
380 | staged to be read.
381 | .RE
382 | .PD
383 | 
384 | .TP
385 | .BI "statfs [-s single] <path>"
386 | .sp
387 | Display scoutfs metadata fields for a scoutfs filesystem.
388 | .RS 1.0i
389 | .PD 0
390 | .TP
391 | .sp
392 | .B "-s single"
393 | Only output a single stat instead of all the stats with one stat per
394 | line.  The possible stat names are those given in the output.
395 | .TP
396 | .B "path"
397 | The path to any inode in the filesystem.
398 | .sp
399 | .TP
400 | .RE
401 | .PD
402 | The fields are as follows:
403 | .RS 1.0i
404 | .PD 0
405 | .TP
406 | .B "fsid"
407 | The unique 64bit filesystem identifier for this filesystem.
408 | .TP
409 | .B "rid"
410 | The unique 64bit random identifier for this mount of the filesystem.
411 | This is generated for every new mount of the file system.
412 | .RE
413 | .PD
414 | 
415 | .TP
416 | .BI "walk-inodes <index> <first> <last> <path>"
417 | .sp
418 | Walks an inode index in the file system and outputs the inode numbers
419 | that are found within the first and last positions in the index.
420 | .RS 1.0i
421 | .PD 0
422 | .sp
423 | .TP
424 | .B "index"
425 | Specifies the index to walk.  The currently supported indices are
426 | .B meta_seq
427 | and
428 | .B data_seq
429 | \&.
430 | .TP
431 | .B "first"
432 | The starting position of the index walk.
433 | .I 0
434 | is the first possible position in every index.
435 | .TP
436 | .B "last"
437 | The last position to include in the index walk.
438 | .I \-1
439 | can be given as shorthand for the U64_MAX last possible position in
440 | every index.
441 | .TP
442 | .B "path"
443 | A path to any inode in the filesystem, typically the root directory.
444 | .RE
445 | .PD
446 | 
447 | .SH SEE ALSO
448 | .BR scoutfs (5),
449 | .BR xattr (7).
450 | 
451 | .SH AUTHORS
452 | Zach Brown <zab@versity.com>
453 | 


--------------------------------------------------------------------------------
/scoutfs-utils.spec.in:
--------------------------------------------------------------------------------
 1 | %define pkg_version @@VERSION@@
 2 | %define pkg_git_hash @@GITHASH@@
 3 | %define pkg_date %(date +%%Y%%m%%d)
 4 | 
 5 | %{!?_release: %global _release 0.%{pkg_date}git%{pkg_git_hash}}
 6 | 
 7 | Name:           scoutfs-utils
 8 | Summary:        scoutfs user space utilities
 9 | Version:        %{pkg_version}
10 | Release:        %{_release}%{?dist}
11 | License:        GPLv2
12 | Group:          System Environment/Base
13 | URL:            http://scoutfs.org/
14 | 
15 | BuildRequires:  git
16 | BuildRequires:  gzip
17 | BuildRequires:  libuuid-devel
18 | BuildRequires:  openssl-devel
19 | 
20 | #Requires:	kmod-scoutfs = %%{version}
21 | 
22 | Source:		scoutfs-utils-%{pkg_version}.tar
23 | 
24 | # Disable the building of the debug package(s).
25 | %define debug_package %{nil}
26 | 
27 | %description
28 | scoutfs - user space utilities
29 | 
30 | %package -n scoutfs-devel
31 | Summary:        scoutfs devel headers
32 | Version:        %{pkg_version}
33 | Release:        %{_release}%{?dist}
34 | License:        GPLv2
35 | Group:          Development/Libraries
36 | URL:            http://scoutfs.org/
37 | 
38 | %description -n scoutfs-devel
39 | scoutfs - development headers
40 | 
41 | %prep
42 | %setup -q -n scoutfs-utils-%{pkg_version}
43 | 
44 | %build
45 | make
46 | gzip man/*.?
47 | 
48 | %install
49 | mkdir -p $RPM_BUILD_ROOT%{_mandir}/man{5,7,8}
50 | cp man/*.5.gz $RPM_BUILD_ROOT%{_mandir}/man5/.
51 | cp man/*.7.gz $RPM_BUILD_ROOT%{_mandir}/man7/.
52 | cp man/*.8.gz $RPM_BUILD_ROOT%{_mandir}/man8/.
53 | install -m 755 -D src/scoutfs $RPM_BUILD_ROOT%{_sbindir}/scoutfs
54 | install -m 644 -D src/ioctl.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/ioctl.h
55 | install -m 644 -D src/format.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/format.h
56 | 
57 | %files
58 | %defattr(644,root,root,755)
59 | %{_mandir}/man*/scoutfs*.gz
60 | %defattr(755,root,root,755)
61 | %{_sbindir}/scoutfs
62 | 
63 | %files -n scoutfs-devel
64 | %defattr(644,root,root,755)
65 | %{_includedir}/scoutfs
66 | 
67 | %clean
68 | rm -rf %{buildroot}
69 | 
70 | 


--------------------------------------------------------------------------------
/sparse.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #
 4 | # Run sparse with the given arguments, drop the known-noise messages
 5 | # matched below, print whatever is left, and exit 1 if anything was
 6 | # left to print.  Exits 0 straight away if sparse isn't installed.
 7 | #
 8 | 
 9 | # can we find sparse?  If not, we're done.  "command -v" is a shell
10 | # builtin, so this doesn't depend on the external "which" being present.
11 | command -v sparse > /dev/null 2>&1 || exit 0
12 | 
13 | #
14 | # one of the problems with using sparse in userspace is that it picks up
15 | # things in system headers that we don't care about.  We're willing to
16 | # take on the burden of filtering them out so that we can have it tell
17 | # us about problems in our code.
18 | #
19 | # system headers using __transparent_union__
20 | RE="^/.*error: ignoring attribute __transparent_union__"
21 | 
22 | # we don't care if system headers have gcc attributes sparse doesn't
23 | # know about
24 | RE="$RE|error: attribute '__leaf__': unknown attribute"
25 | 
26 | # yes, sparse, that's the size of memseting a 4 meg buffer all right
27 | RE="$RE|warning: memset with byte count of 4194304"
28 | 
29 | # some sparse versions don't know about some builtins
30 | RE="$RE|error: undefined identifier '__builtin_fpclassify'"
31 | 
32 | #
33 | # don't filter out 'too many errors' here, it can signify that
34 | # sparse doesn't understand something and is throwing a *ton*
35 | # of useless errors before giving up and exiting.  Check
36 | # unfiltered sparse output.
37 | #
38 | 
39 | #
40 | # I'm not sure this is needed.
41 | #
42 | search=$(gcc -print-search-dirs | awk '($1 == "install:"){print "-I" $2}')
43 | 
44 | #
45 | # We're trying to use sparse against glibc headers which go wild trying to
46 | # use internal compiler macros to test features.  We copy gcc's and give
47 | # them to sparse.  But not __SIZE_TYPE__ 'cause sparse defines that one.
48 | #
49 | # The defines file is named after our pid and removed on exit so that
50 | # concurrent runs under "make -j" don't rewrite a file another run is
51 | # in the middle of reading.
52 | #
53 | defines=".sparse.gcc-defines.$$.h"
54 | trap 'rm -f "$defines"' EXIT
55 | gcc -dM -E -x c - < /dev/null | grep -v __SIZE_TYPE__ > "$defines"
56 | include="-include $defines"
57 | 
58 | #
59 | # sparse doesn't seem to notice when it's on a 64bit host.  It warns that
60 | # 64bit values don't fit in 'unsigned long' without this.
61 | #
62 | if grep -q "__LP64__ 1" "$defines"; then
63 | 	m64="-m64"
64 | else
65 | 	m64=""
66 | fi
67 | 
68 | #
69 | # Filter into a file private to this run and test that, again so that
70 | # parallel runs can't see each other's results.  It's renamed to
71 | # .sparse.output afterwards so the latest output is still left behind.
72 | #
73 | output=".sparse.output.$$"
74 | sparse $m64 $include $search/include "$@" 2>&1 | grep -E -v "($RE)" | tee "$output"
75 | if [ -s "$output" ]; then
76 | 	status=1
77 | else
78 | 	status=0
79 | fi
80 | mv -f "$output" .sparse.output
81 | exit $status
82 | 


--------------------------------------------------------------------------------
/src/avl.c:
--------------------------------------------------------------------------------
 1 | #include "sparse.h"
 2 | #include "util.h"
 3 | #include "format.h"
 4 | #include "avl.h"
 5 | 
 6 | static struct scoutfs_avl_node *node_ptr(struct scoutfs_avl_root *root,
 7 | 
 8 | 					 __le16 off)
 9 | {
10 | 	return off ? (void *)root + le16_to_cpu(off) : NULL;
11 | }
12 | 
13 | __le16 avl_node_off(struct scoutfs_avl_root *root,
14 | 		    struct scoutfs_avl_node *node)
15 | {
16 | 	if (!node)
17 | 		return 0;
18 | 	return cpu_to_le16((void *)node - (void *)root);
19 | }
20 | 
21 | struct scoutfs_avl_node *avl_first(struct scoutfs_avl_root *root)
22 | {
23 | 	struct scoutfs_avl_node *node = node_ptr(root, root->node);
24 | 
25 | 	while (node && node->left)
26 | 		node = node_ptr(root, node->left);
27 | 
28 | 	return node;
29 | }
30 | 
31 | struct scoutfs_avl_node *avl_next(struct scoutfs_avl_root *root,
32 | 				  struct scoutfs_avl_node *node)
33 | {
34 | 	struct scoutfs_avl_node *parent;
35 | 
36 | 	if (node->right) {
37 | 		node = node_ptr(root, node->right);
38 | 		while (node->left)
39 | 			node = node_ptr(root, node->left);
40 | 		return node;
41 | 	}
42 | 
43 | 	while ((parent = node_ptr(root, node->parent)) &&
44 | 	       node == node_ptr(root, parent->right))
45 | 		node = parent;
46 | 
47 | 	return parent;
48 | }
49 | 


--------------------------------------------------------------------------------
/src/avl.h:
--------------------------------------------------------------------------------
#ifndef _AVL_H_
#define _AVL_H_

/*
 * Helpers for walking offset-linked AVL trees.  This header includes
 * nothing itself, so the definitions of struct scoutfs_avl_root,
 * struct scoutfs_avl_node and __le16 must be visible before it is
 * included.
 */

/* little-endian offset of node relative to root, 0 for a NULL node */
__le16 avl_node_off(struct scoutfs_avl_root *root,
		    struct scoutfs_avl_node *node);
/* leftmost node in the tree, NULL if the tree has no nodes */
struct scoutfs_avl_node *avl_first(struct scoutfs_avl_root *root);
/* in-order successor of node, NULL when node is the last node */
struct scoutfs_avl_node *avl_next(struct scoutfs_avl_root *root,
				  struct scoutfs_avl_node *node);

#endif
11 | 


--------------------------------------------------------------------------------
/src/bitmap.c:
--------------------------------------------------------------------------------
 1 | #define _GNU_SOURCE
 2 | #include <unistd.h>
 3 | #include <strings.h>
 4 | 
 5 | #include "sparse.h"
 6 | #include "util.h"
 7 | #include "bitmap.h"
 8 | 
 9 | /*
10 |  * Just a quick simple native bitmap.
11 |  */
12 | 
13 | void set_bit(unsigned long *bits, u64 nr)
14 | {
15 | 	bits[nr / BITS_PER_LONG] |= 1UL << (nr & (BITS_PER_LONG - 1));
16 | }
17 | 
18 | void clear_bit(unsigned long *bits, u64 nr)
19 | {
20 | 	bits[nr / BITS_PER_LONG] &= ~(1UL << (nr & (BITS_PER_LONG - 1)));
21 | }
22 | 
23 | u64 find_next_set_bit(unsigned long *map, u64 from, u64 total)
24 | {
25 | 	unsigned long bits;
26 | 	u64 base;
27 | 	u64 nr;
28 | 	int bit;
29 | 
30 | 	base = from & ~((unsigned long)BITS_PER_LONG - 1);
31 | 	map += from / BITS_PER_LONG;
32 | 
33 | 	while (base < total) {
34 | 		bits = *map;
35 | 
36 | 		while (bits) {
37 | 			bit = ffsl(bits) - 1;
38 | 			nr = base + bit;
39 | 
40 | 			if (nr >= from)
41 | 				return min(nr, total);
42 | 
43 | 			bits &= ~(1UL << bit);
44 | 		}
45 | 
46 | 		base += BITS_PER_LONG;
47 | 		map++;
48 | 	}
49 | 
50 | 	return total;
51 | }
52 | 
53 | unsigned long *alloc_bits(u64 max)
54 | {
55 | 	return calloc(DIV_ROUND_UP(max, BITS_PER_LONG), sizeof(unsigned long));
56 | }
57 | 
58 | 


--------------------------------------------------------------------------------
/src/bitmap.h:
--------------------------------------------------------------------------------
#ifndef _BITMAP_H_
#define _BITMAP_H_

/*
 * A simple bitmap stored in native unsigned longs.  u64 must already
 * be defined by whoever includes this header.
 */

/* set bit nr */
void set_bit(unsigned long *bits, u64 nr);
/* clear bit nr */
void clear_bit(unsigned long *bits, u64 nr);
/* first set bit at or after from, or total if there is none below total */
u64 find_next_set_bit(unsigned long *start, u64 from, u64 total);
/* zeroed bitmap with room for max bits, NULL on allocation failure */
unsigned long *alloc_bits(u64 max);

#endif
10 | 


--------------------------------------------------------------------------------
/src/bitops.h:
--------------------------------------------------------------------------------
  1 | #ifndef _BITOPS_H_
  2 | #define _BITOPS_H_
  3 | 
  4 | #include "sparse.h"
  5 | 
/*
 * Implement little endian bitmaps in terms of native longs.  Words are
 * copied in and out with memcpy() to avoid unaligned accesses.  These
 * are neither atomic nor particularly efficient.
 */

/* number of bits in a native long */
#define BITS_PER_LONG (sizeof(long) * 8)
/*
 * Value XORed into a little endian bit number before it indexes native
 * longs: nothing on little endian hosts, and the bits that select the
 * byte within a long on big endian hosts.
 */
#if __BYTE_ORDER == __LITTLE_ENDIAN
#define BITOP_LE_SWIZZLE        0
#else
#define BITOP_LE_SWIZZLE        ((BITS_PER_LONG-1) & ~0x7)
#endif
 18 | 
 19 | static inline unsigned long get_nr_word(int nr, void *addr)
 20 | {
 21 | 	unsigned long *longs = addr;
 22 | 	unsigned long ind = nr / BITS_PER_LONG;
 23 | 	unsigned long val;
 24 | 
 25 | 	memcpy(&val, &longs[ind], sizeof(val));
 26 | 
 27 | 	return val;
 28 | }
 29 | 
 30 | static inline void put_nr_word(int nr, void *addr, unsigned long val)
 31 | {
 32 | 	unsigned long *longs = addr;
 33 | 	unsigned long ind = nr / BITS_PER_LONG;
 34 | 
 35 | 	memcpy(&longs[ind], &val, sizeof(val));
 36 | }
 37 | 
 38 | static inline unsigned long nr_mask(int nr)
 39 | {
 40 | 	return 1UL << (nr % BITS_PER_LONG);
 41 | }
 42 | 
 43 | static inline int test_bit(int nr, void *addr)
 44 | {
 45 | 	unsigned long val = get_nr_word(nr, addr);
 46 | 
 47 | 	return !!(val & nr_mask(nr));
 48 | }
 49 | 
 50 | static inline void set_bit(int nr, void *addr)
 51 | {
 52 | 	unsigned long val = get_nr_word(nr, addr);
 53 | 
 54 | 	val |= nr_mask(nr);
 55 | 	put_nr_word(nr, addr, val);
 56 | }
 57 | 
 58 | static inline void clear_bit(int nr, void *addr)
 59 | {
 60 | 	unsigned long val = get_nr_word(nr, addr);
 61 | 
 62 | 	val &= ~nr_mask(nr);
 63 | 	put_nr_word(nr, addr, val);
 64 | }
 65 | 
 66 | static inline int test_bit_le(int nr, void *addr)
 67 | {
 68 | 	return test_bit(nr ^ BITOP_LE_SWIZZLE, addr);
 69 | }
 70 | 
 71 | static inline int test_and_set_bit_le(int nr, void *addr)
 72 | {
 73 | 	int ret;
 74 | 
 75 | 	nr ^= BITOP_LE_SWIZZLE;
 76 | 	ret = test_bit(nr, addr);
 77 | 	set_bit(nr, addr);
 78 | 	return ret;
 79 | }
 80 | 
 81 | static inline void set_bit_le(int nr, void *addr)
 82 | {
 83 | 	set_bit(nr ^ BITOP_LE_SWIZZLE, addr);
 84 | }
 85 | 
 86 | static inline void clear_bit_le(int nr, void *addr)
 87 | {
 88 | 	clear_bit(nr ^ BITOP_LE_SWIZZLE, addr);
 89 | }
 90 | 
 91 | static inline int test_and_clear_bit_le(int nr, void *addr)
 92 | {
 93 | 	int ret;
 94 | 
 95 | 	nr ^= BITOP_LE_SWIZZLE;
 96 | 	ret = test_bit(nr, addr);
 97 | 	clear_bit(nr, addr);
 98 | 	return ret;
 99 | }
100 | 
101 | #endif
102 | 


--------------------------------------------------------------------------------
/src/btree.c:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | 
 3 | #include "sparse.h"
 4 | #include "util.h"
 5 | #include "format.h"
 6 | #include "key.h"
 7 | #include "avl.h"
 8 | #include "leaf_item_hash.h"
 9 | #include "btree.h"
10 | 
11 | static void init_block(struct scoutfs_btree_block *bt, int level)
12 | {
13 | 	int free;
14 | 
15 | 	free = SCOUTFS_BLOCK_LG_SIZE - sizeof(struct scoutfs_btree_block);
16 | 	if (level == 0)
17 | 		free -= SCOUTFS_BTREE_LEAF_ITEM_HASH_BYTES;
18 | 
19 | 	bt->level = level;
20 | 	bt->mid_free_len = cpu_to_le16(free);
21 | }
22 | 
23 | /*
24 |  * Point the root at the single leaf block that makes up a btree.
25 |  */
26 | void btree_init_root_single(struct scoutfs_btree_root *root,
27 | 			    struct scoutfs_btree_block *bt,
28 | 			    u64 blkno, u64 seq, __le64 fsid)
29 | {
30 | 	root->ref.blkno = cpu_to_le64(blkno);
31 | 	root->ref.seq = cpu_to_le64(1);
32 | 	root->height = 1;
33 | 
34 | 	memset(bt, 0, SCOUTFS_BLOCK_LG_SIZE);
35 | 	bt->hdr.magic = cpu_to_le32(SCOUTFS_BLOCK_MAGIC_BTREE);
36 | 	bt->hdr.fsid = fsid;
37 | 	bt->hdr.blkno = cpu_to_le64(blkno);
38 | 	bt->hdr.seq = cpu_to_le64(1);
39 | 
40 | 	init_block(bt, 0);
41 | }
42 | 
43 | static void *alloc_val(struct scoutfs_btree_block *bt, int len)
44 | {
45 | 	le16_add_cpu(&bt->mid_free_len, -len);
46 | 	le16_add_cpu(&bt->total_item_bytes, len);
47 | 	return (void *)bt + le16_to_cpu(bt->mid_free_len);
48 | }
49 | 
/*
 * Add a sorted item after all the items in the block.
 *
 * We simply implement the special case of a wildly imbalanced avl tree.
 * Mkfs only ever inserts a handful of items and they'll be rebalanced
 * over time.
 *
 * The key must sort after the last item already in the block; this is
 * asserted.  val is only read when val_len is non-zero.  No check is
 * made that the block has room for the item or its value.
 */
void btree_append_item(struct scoutfs_btree_block *bt,
		       struct scoutfs_key *key, void *val, int val_len)
{
	struct scoutfs_btree_item *item;
	struct scoutfs_avl_node *prev;
	void *val_buf;

	/* the new item takes the next unused slot in the items array */
	item = &bt->items[le16_to_cpu(bt->nr_items)];

	if (bt->nr_items) {
		assert(scoutfs_key_compare(key, &(item - 1)->key) > 0);
		prev = &(item - 1)->node;

		/*
		 * The new node becomes the root with the previous item
		 * as its left child.
		 *
		 * NOTE(review): the post-increment gives the new node
		 * prev's old height and then bumps prev's height --
		 * confirm that's what readers of the height field expect.
		 */
		item->node.height = prev->height++;
		item->node.left = avl_node_off(&bt->item_root, prev);
		prev->parent = avl_node_off(&bt->item_root, &item->node);
	}

	/* the most recently appended item is always the avl root */
	bt->item_root.node = avl_node_off(&bt->item_root, &item->node);
	le16_add_cpu(&bt->nr_items, 1);
	/* the item struct itself consumes free space */
	le16_add_cpu(&bt->mid_free_len,
		     -(u16)sizeof(struct scoutfs_btree_item));
	le16_add_cpu(&bt->total_item_bytes, sizeof(struct scoutfs_btree_item));

	item->key = *key;
	/* hand the leaf item hash the item's byte offset within the block */
	leaf_item_hash_insert(bt, &item->key,
			      cpu_to_le16((void *)item - (void *)bt));
	/* items without a value leave val_off and val_len untouched */
	if (val_len == 0)
		return;

	val_buf = alloc_val(bt, val_len);
	item->val_off = cpu_to_le16((void *)val_buf - (void *)bt);
	item->val_len = cpu_to_le16(val_len);
	memcpy(val_buf, val, val_len);
}
92 | 


--------------------------------------------------------------------------------
/src/btree.h:
--------------------------------------------------------------------------------
#ifndef _BTREE_H_
#define _BTREE_H_

/*
 * Minimal btree construction helpers.  This header includes nothing
 * itself, so the btree structs and integer types must be declared
 * before it is included.
 */

/* initialize root and the single empty leaf block bt at blkno */
void btree_init_root_single(struct scoutfs_btree_root *root,
			    struct scoutfs_btree_block *bt,
			    u64 blkno, u64 seq, __le64 fsid);

/* append an item whose key sorts after every item already in bt */
void btree_append_item(struct scoutfs_btree_block *bt,
		       struct scoutfs_key *key, void *val, int val_len);

#endif
12 | 


--------------------------------------------------------------------------------
/src/cmd.c:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h>
 2 | #include <stdio.h>
 3 | #include <string.h>
 4 | #include <stdbool.h>
 5 | #include <string.h>
 6 | #include <assert.h>
 7 | 
 8 | #include "cmd.h"
 9 | #include "util.h"
10 | 
/*
 * The table of registered commands.  cmd_register() fills entries in
 * order through next_cmd.  The array is static, so unused entries are
 * zero and have a NULL func.
 */
static struct command {
	char *name;	/* name matched against argv[1] */
	char *opts;	/* option synopsis printed by usage() */
	char *summary;	/* one line description printed by usage() */
	int (*func)(int argc, char **argv);
} cmds[100], *next_cmd = cmds;

/* iterate over registered commands, stopping at the first NULL func */
#define cmd_for_each(com) for (com = cmds; com->func; com++)
19 | 
20 | void cmd_register(char *name, char *opts, char *summary,
21 | 		  int (*func)(int argc, char **argv))
22 | {
23 | 	struct command *com = next_cmd++;
24 | 
25 | 	assert((com - cmds) < array_size(cmds));
26 | 
27 | 	com->name = name;
28 | 	com->opts = opts;
29 | 	com->summary = summary;
30 | 	com->func = func;
31 | }
32 | 
33 | static struct command *find_command(char *name)
34 | {
35 | 	struct command *com;
36 | 
37 | 	cmd_for_each(com) {
38 | 		if (!strcmp(name, com->name))
39 | 			return com;
40 | 	}
41 | 
42 | 	return NULL;
43 | }
44 | 
45 | static void usage(void)
46 | {
47 | 	struct command *com;
48 | 	int largest = 0;
49 | 
50 | 	fprintf(stderr, "usage: scoutfs <command> [<args>]\n"
51 | 	       "Commands:\n");
52 | 
53 | 	cmd_for_each(com)
54 | 		largest = max(strlen(com->name), largest);
55 | 
56 | 	cmd_for_each(com) {
57 | 		fprintf(stderr, "  %*s %s\n  %*s %s\n",
58 | 			largest, com->name, com->opts,
59 | 			largest, "", com->summary);
60 | 	}
61 | }
62 | 
63 | /* this returns a positive unix return code on error for some reason */
64 | char cmd_execute(int argc, char **argv)
65 | {
66 | 	struct command *com = NULL;
67 | 	int ret;
68 | 
69 | 	if (argc > 1) {
70 | 		com = find_command(argv[1]);
71 | 		if (!com)
72 | 			fprintf(stderr, "scoutfs: unrecognized command: '%s'\n",
73 | 				argv[1]);
74 | 	}
75 | 	if (!com) {
76 | 		usage();
77 | 		return 1;
78 | 	}
79 | 
80 | 	ret = com->func(argc - 1, argv + 1);
81 | 	if (ret < 0) {
82 | 		fprintf(stderr, "scoutfs: %s failed: %s (%d)\n",
83 | 			com->name, strerror(-ret), -ret);
84 | 		return 1;
85 | 	}
86 | 
87 | 	return 0;
88 | }
89 | 


--------------------------------------------------------------------------------
/src/cmd.h:
--------------------------------------------------------------------------------
#ifndef _CMD_H_
#define _CMD_H_

/* add a named command and its handler to the command table */
void cmd_register(char *name, char *opts, char *summary,
		  int (*func)(int argc, char **argv));

/* run the command named by argv[1]; returns 0 on success, 1 on failure */
char cmd_execute(int argc, char **argv);

#endif
10 | 


--------------------------------------------------------------------------------
/src/cmp.h:
--------------------------------------------------------------------------------
 1 | #ifndef _SCOUTFS_CMP_H_
 2 | #define _SCOUTFS_CMP_H_
 3 | 
 4 | /*
 5 |  * A generic ternary comparison macro with strict type checking.
 6 |  */
 7 | #define scoutfs_cmp(a, b)				\
 8 | ({							\
 9 | 	__typeof__(a) _a = (a);				\
10 | 	__typeof__(b) _b = (b);				\
11 | 	int _ret;					\
12 | 							\
13 | 	(void) (&_a == &_b);				\
14 | 	_ret = _a < _b ? -1 : _a > _b ? 1 : 0;		\
15 | 	_ret;						\
16 | })
17 | 
18 | static inline int scoutfs_cmp_u64s(u64 a, u64 b)
19 | {
20 | 	return a < b ? -1 : a > b ? 1 : 0;
21 | }
22 | 
23 | #endif
24 | 


--------------------------------------------------------------------------------
/src/counters.c:
--------------------------------------------------------------------------------
  1 | #define _XOPEN_SOURCE 700 /* openat */
  2 | 
  3 | #include <stdlib.h>
  4 | #include <unistd.h>
  5 | #include <stdio.h>
  6 | #include <sys/types.h>
  7 | #include <sys/stat.h>
  8 | #include <fcntl.h>
  9 | #include <errno.h>
 10 | #include <string.h>
 11 | #include <limits.h>
 12 | #include <dirent.h>
 13 | #include <sys/ioctl.h>
 14 | #include <stdbool.h>
 15 | 
 16 | #include "util.h"
 17 | #include "cmd.h"
 18 | 
/* One counter file read from the sysfs counters directory. */
struct counter {
	char *name;		/* strdup()ed directory entry name */
	char *val;		/* strdup()ed file contents without the newline */
	unsigned int name_wid;	/* strlen(name) */
	unsigned int val_wid;	/* strlen(val) */
};
 25 | 
 26 | static int dots(char *name)
 27 | {
 28 | 	return name[0] == '.' &&
 29 | 	       (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'));
 30 | }
 31 | 
 32 | static int cmp_counter_names(const void *A, const void *B)
 33 | {
 34 | 	const struct counter *a = A;
 35 | 	const struct counter *b = B;
 36 | 
 37 | 	return strcmp(a->name, b->name);
 38 | }
 39 | 
 40 | static int counters_cmd(int argc, char **argv)
 41 | {
 42 | 	unsigned int *name_wid = NULL;
 43 | 	unsigned int *val_wid = NULL;
 44 | 	struct counter *ctrs = NULL;
 45 | 	struct counter *ctr;
 46 | 	char path[PATH_MAX + 1];
 47 | 	unsigned int alloced = 0;
 48 | 	unsigned int min_rows;
 49 | 	unsigned int max_rows;
 50 | 	unsigned int rows = 0;
 51 | 	unsigned int cols = 0;
 52 | 	unsigned int nr = 0;
 53 | 	char *dir_arg = NULL;
 54 | 	struct dirent *dent;
 55 | 	bool table = false;
 56 | 	struct winsize ws;
 57 | 	DIR *dirp = NULL;
 58 | 	int dir_fd = -1;
 59 | 	char buf[25];
 60 | 	int room;
 61 | 	int ret;
 62 | 	int fd;
 63 | 	int i;
 64 | 	int r;
 65 | 	int c;
 66 | 
 67 | 	for (i = 1; i < argc; i++) {
 68 | 		if (strcmp(argv[i], "-t") == 0)
 69 | 			table = true;
 70 | 		else
 71 | 			dir_arg = argv[i];
 72 | 	}
 73 | 
 74 | 	ret = ioctl(STDOUT_FILENO, TIOCGWINSZ, &ws);
 75 | 	if (ret < 0)
 76 | 		ret = ioctl(STDIN_FILENO, TIOCGWINSZ, &ws);
 77 | 	if (ret < 0)
 78 | 		table = false;
 79 | 
 80 | 	if (dir_arg == NULL) {
 81 | 		printf("scoutfs counter-table: need mount sysfs dir (i.e. /sys/fs/scoutfs/$fr)\n");
 82 | 		return -EINVAL;
 83 | 	}
 84 | 
 85 | 	ret = snprintf(path, PATH_MAX, "%s/counters", dir_arg);
 86 | 	if (ret < 1 || ret >= PATH_MAX) {
 87 | 		ret = -EINVAL;
 88 | 		fprintf(stderr, "invalid counter dir path '%s'\n", dir_arg);
 89 | 		goto out;
 90 | 	}
 91 | 
 92 | 	dirp = opendir(path);
 93 | 	if (!dirp) {
 94 | 		ret = -errno;
 95 | 		fprintf(stderr, "failed to open sysfs counter dir '%s': %s (%d)\n",
 96 | 			path, strerror(errno), errno);
 97 | 		goto out;
 98 | 	}
 99 | 
100 | 	dir_fd = dup(dirfd(dirp));
101 | 	if (dir_fd < 0) {
102 | 		ret = -errno;
103 | 		fprintf(stderr, "couldn't dup fd for path '%s': %s (%d)\n",
104 | 			path, strerror(errno), errno);
105 | 		goto out;
106 | 	}
107 | 
108 | 	/* read all the counters */
109 | 	while ((dent = readdir(dirp))) {
110 | 		if (dots(dent->d_name))
111 | 			continue;
112 | 		if (nr == alloced) {
113 | 			alloced += 100;
114 | 			ctrs = realloc(ctrs, alloced * sizeof(*ctrs));
115 | 			name_wid = realloc(name_wid, alloced * sizeof(*name_wid));
116 | 			val_wid = realloc(val_wid, alloced * sizeof(*val_wid));
117 | 			if (!ctrs || !name_wid || !val_wid) {
118 | 				fprintf(stderr, "counter array allocation error\n");
119 | 				ret = -ENOMEM;
120 | 				goto out;
121 | 			}
122 | 			memset(&ctrs[nr], 0, (alloced - nr) * sizeof(*ctrs));
123 | 		}
124 | 
125 | 		ctr = &ctrs[nr];
126 | 
127 | 		ctr->name = strdup(dent->d_name);
128 | 		if (ctr->name == NULL) {
129 | 			fprintf(stderr, "name string allocation error\n");
130 | 			ret = -ENOMEM;
131 | 			goto out;
132 | 		}
133 | 
134 | 		fd = openat(dir_fd, ctr->name, O_RDONLY);
135 | 		if (fd < 0) {
136 | 			ret = -errno;
137 | 			fprintf(stderr, "failed to open counter file '%s/%s': %s (%d)\n",
138 | 				path, ctr->name, strerror(errno), errno);
139 | 			goto out;
140 | 		}
141 | 
142 | 		ret = pread(fd, buf, sizeof(buf), 0);
143 | 		close(fd);
144 | 
145 | 		if (ret <= 1 || ret >= sizeof(buf) || buf[ret - 1] != '\n') {
146 | 			fprintf(stderr, "counter file %s/%s read returned %d\n",
147 | 				path, ctr->name, ret);
148 | 			ret = -EIO;
149 | 			goto out;
150 | 		}
151 | 
152 | 		buf[ret - 1] = '\0';
153 | 		ctr->val = strdup(buf);
154 | 		if (ctr->val == NULL) {
155 | 			fprintf(stderr, "value string allocation error\n");
156 | 			ret = -ENOMEM;
157 | 			goto out;
158 | 		}
159 | 
160 | 		ctr->name_wid = strlen(ctr->name);
161 | 		ctr->val_wid = strlen(ctr->val);
162 | 
163 | 		name_wid[0] = max(ctr->name_wid, name_wid[0]);
164 | 		val_wid[0] = max(ctr->val_wid, val_wid[0]);
165 | 
166 | 		nr++;
167 | 	}
168 | 	closedir(dirp);
169 | 	dirp = NULL;
170 | 	close(dir_fd);
171 | 	dir_fd = -1;
172 | 
173 | 	/* huh, empty counter dir */
174 | 	if (nr == 0) {
175 | 		ret = 0;
176 | 		goto out;
177 | 	}
178 | 
179 | 	/* sort counters by name */
180 | 	qsort(ctrs, nr, sizeof(ctrs[0]), cmp_counter_names);
181 | 
182 | 	/*
183 | 	 * If we're packing the counters into a table that fills the
184 | 	 * width of the terminal then there will be a smallest number of
185 | 	 * rows in the table that packs counters into columns that fill
186 | 	 * the width of the terminal.  We perform a binary search for
187 | 	 * that smallest number of rows that doesn't fill too many
188 | 	 * columns.
189 | 	 *
190 | 	 * Unless we're not outputting a table, then we just spit out
191 | 	 * one column of counters and use the max field widths from the
192 | 	 * initial counter reads.
193 | 	 */
194 | 	if (table) {
195 | 		min_rows = 1;
196 | 		cols = ws.ws_col / (name_wid[0] + 1 + val_wid[0] + 2);
197 | 		max_rows = nr / cols;
198 | 	} else {
199 | 		rows = nr;
200 | 		cols = 1;
201 | 		min_rows = nr + 1;
202 | 		max_rows = nr - 1;
203 | 	}
204 | 
205 | 	while (min_rows <= max_rows) {
206 | 		rows = min_rows + ((max_rows - min_rows) / 2);
207 | 		i = 0;
208 | 		room = ws.ws_col;
209 | 
210 | 		/*
211 | 		 * Iterate over counters, storing the max field widths
212 | 		 * of each column, recording the column chars left in
213 | 		 * the terminal, stopping if we fill too many columns
214 | 		 * for the terminal.
215 | 		 */
216 | 		for (c = 0; i < nr && room >= 0; c++) {
217 | 			name_wid[c] = 0;
218 | 			val_wid[c] = 0;
219 | 
220 | 			for (r = 0; r < rows && i < nr; r++, i++) {
221 | 				ctr = &ctrs[i];
222 | 
223 | 				name_wid[c] = max(ctr->name_wid, name_wid[c]);
224 | 				val_wid[c] = max(ctr->val_wid, val_wid[c]);
225 | 			}
226 | 
227 | 			cols = c + 1;
228 | 			if (c > 0)
229 | 				room -= 2;
230 | 			room -= name_wid[c] + 1 + val_wid[c];
231 | 		}
232 | 
233 | 		if (room < 0) {
234 | 			/* need more rows if we ran out of cols */
235 | 			min_rows = rows + 1;
236 | 		} else {
237 | 			/* see if we can get away with fewer */
238 | 			if (max_rows == rows)
239 | 				break;
240 | 			max_rows = rows;
241 | 		}
242 | 	}
243 | 
244 | 	/* finally output the columns in each row */ 
245 | 	for (r = 0; r < rows; r++) {
246 | 		for (c = 0; c < cols; c++) {
247 | 			i = (c * rows) + r;
248 | 			if (i >= nr)
249 | 				break;
250 | 			ctr = &ctrs[i];
251 | 
252 | 			printf("%s%-*s %*s",
253 | 			       c > 0 ? "  " : "",
254 | 			       name_wid[c], ctr->name,
255 | 			       val_wid[c], ctr->val);
256 | 		}
257 | 		printf("\n");
258 | 	}
259 | 
260 | 	ret = 0;
261 | out:
262 | 	if (dirp)
263 | 		closedir(dirp);
264 | 	if (dir_fd >= 0)
265 | 		close(dir_fd);
266 | 	if (ctrs) {
267 | 		for (i = 0; i < alloced; i++) {
268 | 			free(ctrs[i].name);
269 | 			free(ctrs[i].val);
270 | 		}
271 | 		free(ctrs);
272 | 	}
273 | 	free(name_wid);
274 | 	free(val_wid);
275 | 
276 | 	return ret;
277 | };
278 | 
279 | static void __attribute__((constructor)) counters_ctor(void)
280 | {
281 | 	cmd_register("counters", "[-t] <sysfs dir>",
282 | 		     "show [tablular] counters for a given mounted volume",
283 | 		     counters_cmd);
284 | }
285 | 


--------------------------------------------------------------------------------
/src/crc.c:
--------------------------------------------------------------------------------
 1 | #include "crc.h"
 2 | #include "util.h"
 3 | #include "format.h"
 4 | 
 5 | u32 crc32c(u32 crc, const void *data, unsigned int len)
 6 | {
 7 | 	while (len >= 8) {
 8 | 		crc = __builtin_ia32_crc32di(crc, *(u64 *)data);
 9 | 		len -= 8;
10 | 		data += 8;
11 | 	}
12 | 	if (len & 4) {
13 | 		crc = __builtin_ia32_crc32si(crc, *(u32 *)data);
14 | 		data += 4;
15 | 	}
16 | 	if (len & 2) {
17 | 		crc = __builtin_ia32_crc32hi(crc, *(u16 *)data);
18 | 		data += 2;
19 | 	}
20 | 	if (len & 1)
21 | 		crc = __builtin_ia32_crc32qi(crc, *(u8 *)data);
22 | 
23 | 	return crc;
24 | }
25 | 
26 | /* A simple hack to get reasonably solid 64bit hash values */
27 | u64 crc32c_64(u32 crc, const void *data, unsigned int len)
28 | {
29 | 	unsigned int half = (len + 1) / 2;
30 | 
31 | 	return ((u64)crc32c(crc, data, half) << 32) |
32 | 		     crc32c(~crc, data + len - half, half);
33 | }
34 | 
35 | u32 crc_block(struct scoutfs_block_header *hdr, u32 size)
36 | {
37 | 	return crc32c(~0, (char *)hdr + sizeof(hdr->crc),
38 | 		      size - sizeof(hdr->crc));
39 | }
40 | 


--------------------------------------------------------------------------------
/src/crc.h:
--------------------------------------------------------------------------------
#ifndef _CRC_H_
#define _CRC_H_

#include "sparse.h"
#include "util.h"
#include "format.h"

/* fold len bytes at data into crc and return the result */
u32 crc32c(u32 crc, const void *data, unsigned int len);
/* 64bit hash built from two crc32c passes over the halves of data */
u64 crc32c_64(u32 crc, const void *data, unsigned int len);
/* crc of a size byte block, excluding the header's leading crc field */
u32 crc_block(struct scoutfs_block_header *hdr, u32 size);

#endif
13 | 


--------------------------------------------------------------------------------
/src/dev.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <string.h>
  3 | #include <sys/types.h>
  4 | #include <sys/stat.h>
  5 | #include <unistd.h>
  6 | #include <sys/ioctl.h>
  7 | #include <linux/fs.h>
  8 | #include <errno.h>
  9 | 
 10 | #include "sparse.h"
 11 | #include "dev.h"
 12 | 
 13 | int device_size(char *path, int fd,
 14 | 		u64 min_size, u64 max_size,
 15 | 		char *use_type, u64 *size_ret)
 16 | {
 17 | 	struct stat st;
 18 | 	u64 size;
 19 | 	char *target_type;
 20 | 	int ret;
 21 | 
 22 | 	if (fstat(fd, &st)) {
 23 | 		ret = -errno;
 24 | 		fprintf(stderr, "failed to stat '%s': %s (%d)\n",
 25 | 			path, strerror(errno), errno);
 26 | 		return ret;
 27 | 	}
 28 | 
 29 | 	if (S_ISREG(st.st_mode)) {
 30 | 		size = st.st_size;
 31 | 		target_type = "file";
 32 | 	} else if (S_ISBLK(st.st_mode)) {
 33 | 		if (ioctl(fd, BLKGETSIZE64, &size)) {
 34 | 			ret = -errno;
 35 | 			fprintf(stderr, "BLKGETSIZE64 failed '%s': %s (%d)\n",
 36 | 				path, strerror(errno), errno);
 37 | 			return ret;
 38 | 		}
 39 | 		target_type = "device";
 40 | 	} else {
 41 | 		fprintf(stderr, "path isn't regular or device file '%s'\n",
 42 | 			path);
 43 | 		return -EINVAL;
 44 | 	}
 45 | 
 46 | 	if (max_size) {
 47 | 		if (size > max_size) {
 48 | 			printf("Limiting use of "BASE_SIZE_FMT
 49 | 			       " %s device to "BASE_SIZE_FMT"\n",
 50 | 			       BASE_SIZE_ARGS(size), use_type,
 51 | 			       BASE_SIZE_ARGS(max_size));
 52 | 			size = max_size;
 53 | 		} else if (size < max_size) {
 54 | 			printf("Device size limit of "BASE_SIZE_FMT
 55 | 			       " for %s device"
 56 | 			       " is greater than "BASE_SIZE_FMT
 57 | 			       " available, ignored.\n",
 58 | 			       BASE_SIZE_ARGS(max_size), use_type,
 59 | 			       BASE_SIZE_ARGS(size));
 60 | 		}
 61 | 	}
 62 | 
 63 | 	if (size < min_size) {
 64 | 		fprintf(stderr,
 65 | 			BASE_SIZE_FMT" %s too small for min "
 66 | 			BASE_SIZE_FMT" %s device\n",
 67 | 			BASE_SIZE_ARGS(size), target_type,
 68 | 			BASE_SIZE_ARGS(min_size), use_type);
 69 | 		return -EINVAL;
 70 | 	}
 71 | 
 72 | 	*size_ret = size;
 73 | 
 74 | 	return 0;
 75 | }
 76 | 
 77 | float size_flt(u64 nr, unsigned size)
 78 | {
 79 | 	float x = (float)nr * (float)size;
 80 | 
 81 | 	while (x >= 1024)
 82 | 		x /= 1024;
 83 | 
 84 | 	return x;
 85 | }
 86 | 
 87 | char *size_str(u64 nr, unsigned size)
 88 | {
 89 | 	float x = (float)nr * (float)size;
 90 | 	static char *suffixes[] = {
 91 | 		"B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB",
 92 | 	};
 93 | 	int i = 0;
 94 | 
 95 | 	while (x >= 1024) {
 96 | 		x /= 1024;
 97 | 		i++;
 98 | 	}
 99 | 
100 | 	return suffixes[i];
101 | }
102 | 


--------------------------------------------------------------------------------
/src/dev.h:
--------------------------------------------------------------------------------
#ifndef _DEV_H_
#define _DEV_H_

/* printf format and arguments for a byte count as a scaled value and unit */
#define BASE_SIZE_FMT "%.2f %s"
#define BASE_SIZE_ARGS(sz) size_flt(sz, 1), size_str(sz, 1)

/*
 * printf format and arguments for a count of nr units of sz bytes: the
 * raw count followed by the scaled total and its unit.
 */
#define SIZE_FMT "%llu (%.2f %s)"
#define SIZE_ARGS(nr, sz) (nr), size_flt(nr, sz), size_str(nr, sz)

/* size in bytes of the file or block device on fd, checked against limits */
int device_size(char *path, int fd,
		u64 min_size, u64 max_size,
		char *use_type, u64 *size_ret);
/* nr * size divided by 1024 until it is below 1024 */
float size_flt(u64 nr, unsigned size);
/* unit suffix matching the scaling done by size_flt() */
char *size_str(u64 nr, unsigned size);

#endif
17 | 


--------------------------------------------------------------------------------
/src/df.c:
--------------------------------------------------------------------------------
  1 | #include <unistd.h>
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include <sys/types.h>
  5 | #include <sys/stat.h>
  6 | #include <sys/ioctl.h>
  7 | #include <fcntl.h>
  8 | #include <errno.h>
  9 | #include <string.h>
 10 | #include <getopt.h>
 11 | #include <assert.h>
 12 | 
 13 | #include "sparse.h"
 14 | #include "util.h"
 15 | #include "format.h"
 16 | #include "ioctl.h"
 17 | #include "cmd.h"
 18 | 
/* dimensions of the table of strings that df_cmd() formats and prints */
#define ROWS 3		/* header, metadata, data */
#define COLS 6		/* type, size, total, used, free, use% */
#define CHARS 20	/* bytes per cell, including the terminating null */
 22 | 
 23 | static int df_cmd(int argc, char **argv)
 24 | {
 25 | 	struct scoutfs_ioctl_alloc_detail ad;
 26 | 	struct scoutfs_ioctl_alloc_detail_entry *ade = NULL;
 27 | 	struct scoutfs_ioctl_statfs_more sfm;
 28 | 	static char cells[ROWS][COLS][CHARS];
 29 | 	int wid[COLS] = {0};
 30 | 	u64 nr = 4096 / sizeof(*ade);
 31 | 	u64 meta_free = 0;
 32 | 	u64 data_free = 0;
 33 | 	int ret;
 34 | 	int fd;
 35 | 	int i;
 36 | 	int r;
 37 | 	int c;
 38 | 
 39 | 	if (argc != 2) {
 40 | 		fprintf(stderr, "must specify path\n");
 41 | 		return -EINVAL;
 42 | 	}
 43 | 
 44 | 	fd = open(argv[1], O_RDONLY);
 45 | 	if (fd < 0) {
 46 | 		ret = -errno;
 47 | 		fprintf(stderr, "failed to open '%s': %s (%d)\n",
 48 | 			argv[1], strerror(errno), errno);
 49 | 		return ret;
 50 | 	}
 51 | 
 52 | 	sfm.valid_bytes = sizeof(struct scoutfs_ioctl_statfs_more);
 53 | 	ret = ioctl(fd, SCOUTFS_IOC_STATFS_MORE, &sfm);
 54 | 	if (ret < 0) {
 55 | 		fprintf(stderr, "statfs_more returned %d: error %s (%d)\n",
 56 | 			ret, strerror(errno), errno);
 57 | 		ret = -EIO;
 58 | 		goto out;
 59 | 	}
 60 | 
 61 | 	do {
 62 | 		free(ade);
 63 | 		ade = calloc(nr, sizeof(*ade));
 64 | 		if (!ade) {
 65 | 			ret = -ENOMEM;
 66 | 			goto out;
 67 | 		}
 68 | 
 69 | 		ad.entries_ptr = (intptr_t)ade;
 70 | 		ad.entries_nr = nr;
 71 | 		ret = ioctl(fd, SCOUTFS_IOC_ALLOC_DETAIL, &ad);
 72 | 		if (ret < 0 && errno == EOVERFLOW)
 73 | 			nr = nr + (nr >> 2);
 74 | 	} while (ret < 0 && errno == EOVERFLOW);
 75 | 
 76 | 	if (ret < 0) {
 77 | 		fprintf(stderr, "alloc_detail returned %d: error %s (%d)\n",
 78 | 			ret, strerror(errno), errno);
 79 | 		ret = -EIO;
 80 | 		goto out;
 81 | 	}
 82 | 
 83 | 	for (i = 0; i < ret; i++) {
 84 | 		if (ade[i].meta)
 85 | 			meta_free += ade[i].blocks;
 86 | 		else
 87 | 			data_free += ade[i].blocks;
 88 | 	}
 89 | 
 90 | 	snprintf(cells[0][0], CHARS, "Type");
 91 | 	snprintf(cells[0][1], CHARS, "Size");
 92 | 	snprintf(cells[0][2], CHARS, "Total");
 93 | 	snprintf(cells[0][3], CHARS, "Used");
 94 | 	snprintf(cells[0][4], CHARS, "Free");
 95 | 	snprintf(cells[0][5], CHARS, "Use%%");
 96 | 
 97 | 	snprintf(cells[1][0], CHARS, "MetaData");
 98 | 	snprintf(cells[1][1], CHARS, "64KB");
 99 | 	snprintf(cells[1][2], CHARS, "%llu", sfm.total_meta_blocks);
100 | 	snprintf(cells[1][3], CHARS, "%llu", sfm.total_meta_blocks - meta_free);
101 | 	snprintf(cells[1][4], CHARS, "%llu", meta_free);
102 | 	snprintf(cells[1][5], CHARS, "%llu",
103 | 		((sfm.total_meta_blocks - meta_free) * 100) /
104 | 		sfm.total_meta_blocks);
105 | 
106 | 	snprintf(cells[2][0], CHARS, "Data");
107 | 	snprintf(cells[2][1], CHARS, "4KB");
108 | 	snprintf(cells[2][2], CHARS, "%llu", sfm.total_data_blocks);
109 | 	snprintf(cells[2][3], CHARS, "%llu", sfm.total_data_blocks - data_free);
110 | 	snprintf(cells[2][4], CHARS, "%llu", data_free);
111 | 	snprintf(cells[2][5], CHARS, "%llu",
112 | 		((sfm.total_data_blocks - data_free) * 100) /
113 | 		sfm.total_data_blocks);
114 | 
115 | 	for (r = 0; r < ROWS; r++) {
116 | 		for (c = 0; c < COLS; c++) {
117 | 			wid[c] = max(wid[c], strlen(cells[r][c]));
118 | 		}
119 | 	}
120 | 
121 | 	for (r = 0; r < ROWS; r++) {
122 | 		for (c = 0; c < COLS; c++) {
123 | 			printf("%*s  ", wid[c], cells[r][c]);
124 | 		}
125 | 		printf("\n");
126 | 	}
127 | 
128 | 	ret = 0;
129 | out:
130 | 	free(ade);
131 | 	return ret;
132 | }
133 | 
/*
 * Register the "df" command.  This is a constructor so the command is
 * in the table without any explicit call.
 */
static void __attribute__((constructor)) df_ctor(void)
{
	cmd_register("df", "<path>",
		     "show metadata and data block usage", df_cmd);
}
139 | 


--------------------------------------------------------------------------------
/src/endian_swap.h:
--------------------------------------------------------------------------------
#ifndef _SCOUTFS_ENDIAN_SWAP_H_
#define _SCOUTFS_ENDIAN_SWAP_H_

/*
 * Conversions between fixed-endian types that go through the cpu's
 * native representation.  They're built from the cpu_to_ and _to_cpu
 * helpers, which must be defined wherever these are used.
 */

/* little endian to big endian of the same width */
#define le64_to_be64(x) cpu_to_be64(le64_to_cpu(x))
#define le32_to_be32(x) cpu_to_be32(le32_to_cpu(x))
#define le16_to_be16(x) cpu_to_be16(le16_to_cpu(x))

/* big endian to little endian of the same width */
#define be64_to_le64(x) cpu_to_le64(be64_to_cpu(x))
#define be32_to_le32(x) cpu_to_le32(be32_to_cpu(x))
#define be16_to_le16(x) cpu_to_le16(be16_to_cpu(x))

/* widen a little endian value to a 64bit little endian value */
#define le16_to_le64(x) cpu_to_le64(le16_to_cpu(x))
#define le32_to_le64(x) cpu_to_le64(le32_to_cpu(x))

#endif
16 | 


--------------------------------------------------------------------------------
/src/format.h:
--------------------------------------------------------------------------------
  1 | #ifndef _SCOUTFS_FORMAT_H_
  2 | #define _SCOUTFS_FORMAT_H_
  3 | 
  4 | /* statfs(2) f_type */
  5 | #define SCOUTFS_SUPER_MAGIC	0x554f4353		/* "SCOU" */
  6 | 
  7 | /* block header magic values, chosen at random */
  8 | #define SCOUTFS_BLOCK_MAGIC_SUPER	0x103c428b
  9 | #define SCOUTFS_BLOCK_MAGIC_BTREE	0xe597f96d
 10 | #define SCOUTFS_BLOCK_MAGIC_BLOOM	0x31995604
 11 | #define SCOUTFS_BLOCK_MAGIC_SRCH_BLOCK	0x897e4a7d
 12 | #define SCOUTFS_BLOCK_MAGIC_SRCH_PARENT	0xb23a2a05
 13 | #define SCOUTFS_BLOCK_MAGIC_ALLOC_LIST	0x8a93ac83
 14 | 
 15 | /*
 16 |  * The super block, quorum block, and file data allocation granularity
 17 |  * use the smaller 4KB block.
 18 |  */
 19 | #define SCOUTFS_BLOCK_SM_SHIFT		12
 20 | #define SCOUTFS_BLOCK_SM_SIZE		(1 << SCOUTFS_BLOCK_SM_SHIFT)
 21 | #define SCOUTFS_BLOCK_SM_MASK		(SCOUTFS_BLOCK_SM_SIZE - 1)
 22 | #define SCOUTFS_BLOCK_SM_PER_PAGE	(PAGE_SIZE / SCOUTFS_BLOCK_SM_SIZE)
 23 | #define SCOUTFS_BLOCK_SM_SECTOR_SHIFT	(SCOUTFS_BLOCK_SM_SHIFT - 9)
 24 | #define SCOUTFS_BLOCK_SM_SECTORS	(1 << SCOUTFS_BLOCK_SM_SECTOR_SHIFT)
 25 | #define SCOUTFS_BLOCK_SM_MAX		(U64_MAX >> SCOUTFS_BLOCK_SM_SHIFT)
 26 | #define SCOUTFS_BLOCK_SM_PAGES_PER	(SCOUTFS_BLOCK_SM_SIZE / PAGE_SIZE)
 27 | #define SCOUTFS_BLOCK_SM_PAGE_ORDER	(SCOUTFS_BLOCK_SM_SHIFT - PAGE_SHIFT)
 28 | 
 29 | /*
 30 |  * The radix and btree structures, and the forest bloom block, use the
 31 |  * larger 64KB metadata block size.
 32 |  */
 33 | #define SCOUTFS_BLOCK_LG_SHIFT		16
 34 | #define SCOUTFS_BLOCK_LG_SIZE		(1 << SCOUTFS_BLOCK_LG_SHIFT)
 35 | #define SCOUTFS_BLOCK_LG_MASK		(SCOUTFS_BLOCK_LG_SIZE - 1)
 36 | #define SCOUTFS_BLOCK_LG_PER_PAGE	(PAGE_SIZE / SCOUTFS_BLOCK_LG_SIZE)
 37 | #define SCOUTFS_BLOCK_LG_SECTOR_SHIFT	(SCOUTFS_BLOCK_LG_SHIFT - 9)
 38 | #define SCOUTFS_BLOCK_LG_SECTORS	(1 << SCOUTFS_BLOCK_LG_SECTOR_SHIFT)
 39 | #define SCOUTFS_BLOCK_LG_MAX		(U64_MAX >> SCOUTFS_BLOCK_LG_SHIFT)
 40 | #define SCOUTFS_BLOCK_LG_PAGES_PER	(SCOUTFS_BLOCK_LG_SIZE / PAGE_SIZE)
 41 | #define SCOUTFS_BLOCK_LG_PAGE_ORDER	(SCOUTFS_BLOCK_LG_SHIFT - PAGE_SHIFT)
 42 | 
 43 | #define SCOUTFS_BLOCK_SM_LG_SHIFT	(SCOUTFS_BLOCK_LG_SHIFT - \
 44 | 					 SCOUTFS_BLOCK_SM_SHIFT)
 45 | 
 46 | 
 47 | /*
 48 |  * The super block leaves some room before the first block for platform
 49 |  * structures like boot loaders.
 50 |  */
 51 | #define SCOUTFS_SUPER_BLKNO ((64ULL * 1024) >> SCOUTFS_BLOCK_SM_SHIFT)
 52 | 
 53 | /*
 54 |  * A reasonably large region of aligned quorum blocks follow the super
 55 |  * block.  Each voting cycle reads the entire region so we don't want it
 56 |  * to be too enormous.  256K seems like a reasonably chunky single IO.
 57 |  * The number of blocks in the region also determines the number of
 58 |  * mounts that have a reasonable probability of not overwriting each
 59 |  * other's random block locations.
 60 |  */
 61 | #define SCOUTFS_QUORUM_BLKNO	((256ULL * 1024) >> SCOUTFS_BLOCK_SM_SHIFT)
 62 | #define SCOUTFS_QUORUM_BLOCKS	((256ULL * 1024) >> SCOUTFS_BLOCK_SM_SHIFT)
 63 | 
 64 | /*
 65 |  * Start data on the data device aligned as well.
 66 |  */
 67 | #define SCOUTFS_DATA_DEV_START_BLKNO ((256ULL * 1024) >> SCOUTFS_BLOCK_SM_SHIFT)
 68 | 
 69 | 
 70 | #define SCOUTFS_UNIQUE_NAME_MAX_BYTES	64 /* includes null */
 71 | 
 72 | /*
 73 |  * Base types used by other structures.
 74 |  */
 75 | struct scoutfs_timespec {
 76 | 	__le64 sec;
 77 | 	__le32 nsec;
 78 | 	__u8 __pad[4];
 79 | };
 80 | 
 81 | /* XXX ipv6 */
 82 | struct scoutfs_inet_addr {
 83 | 	__le32 addr;
 84 | 	__le16 port;
 85 | 	__u8 __pad[2];
 86 | };
 87 | 
 88 | /*
 89 |  * This header is stored at the start of btree blocks and the super
 90 |  * block for verification.  The crc field is not included in the
 91 |  * calculation of the crc.
 92 |  */
 93 | struct scoutfs_block_header {
 94 | 	__le32 crc;
 95 | 	__le32 magic;
 96 | 	__le64 fsid;
 97 | 	__le64 seq;
 98 | 	__le64 blkno;
 99 | };
100 | 
101 | /*
102 |  * scoutfs identifies all file system metadata items by a small key
103 |  * struct.
104 |  *
105 |  * Each item type maps their logical structures to the fixed fields in
106 |  * sort order.  This lets us print keys without needing per-type
107 |  * formats.
108 |  *
109 |  * The keys are compared by considering the fields in struct order from
110 |  * most to least significant.  They are considered a multi precision
111 |  * value when navigating the keys in ordered key space.  We can
112 |  * increment them, subtract them from each other, etc.
113 |  */
114 | struct scoutfs_key {
115 | 	__le64	_sk_first;
116 | 	__le64	_sk_second;
117 | 	__le64	_sk_third;
118 | 	__u8	_sk_fourth;
119 | 	__u8	sk_zone;
120 | 	__u8	sk_type;
121 | 	__u8	__pad[5];
122 | };
123 | 
124 | /* inode index */
125 | #define skii_major	_sk_second
126 | #define skii_ino	_sk_third
127 | 
128 | /* node orphan inode */
129 | #define sko_rid		_sk_first
130 | #define sko_ino		_sk_second
131 | 
132 | /* inode */
133 | #define ski_ino		_sk_first
134 | 
135 | /* xattr parts */
136 | #define skx_ino		_sk_first
137 | #define skx_name_hash	_sk_second
138 | #define skx_id		_sk_third
139 | #define skx_part	_sk_fourth
140 | 
141 | /* directory entries */
142 | #define skd_ino		_sk_first
143 | #define skd_major	_sk_second
144 | #define skd_minor	_sk_third
145 | 
146 | /* symlink target */
147 | #define sks_ino		_sk_first
148 | #define sks_nr		_sk_second
149 | 
150 | /* data extents */
151 | #define skdx_ino	_sk_first
152 | #define skdx_end	_sk_second
153 | #define skdx_len	_sk_third
154 | 
155 | /* log trees */
156 | #define sklt_rid	_sk_first
157 | #define sklt_nr		_sk_second
158 | 
159 | /* lock clients */
160 | #define sklc_rid	_sk_first
161 | 
162 | /* seqs */
163 | #define skts_trans_seq	_sk_first
164 | #define skts_rid	_sk_second
165 | 
166 | /* mounted clients */
167 | #define skmc_rid	_sk_first
168 | 
169 | /* free extents by blkno */
170 | #define skfb_end	_sk_second
171 | #define skfb_len	_sk_third
172 | /* free extents by len */
173 | #define skfl_neglen	_sk_second
174 | #define skfl_blkno	_sk_third
175 | 
176 | struct scoutfs_radix_block {
177 | 	struct scoutfs_block_header hdr;
178 | 	union {
179 | 		struct scoutfs_radix_ref {
180 | 			__le64 blkno;
181 | 			__le64 seq;
182 | 			__le64 sm_total;
183 | 			__le64 lg_total;
184 | 		} refs[0];
185 | 		__le64 bits[0];
186 | 	};
187 | };
188 | 
189 | struct scoutfs_avl_root {
190 | 	__le16 node;
191 | };
192 | 
193 | struct scoutfs_avl_node {
194 | 	__le16 parent;
195 | 	__le16 left;
196 | 	__le16 right;
197 | 	__u8 height;
198 | 	__u8 __pad[1];
199 | };
200 | 
201 | /* when we split we want to have multiple items on each side */
202 | #define SCOUTFS_BTREE_MAX_VAL_LEN 896
203 | 
204 | /*
205 |  * A 4EB test image measured a worst case height of 17.  This is plenty
206 |  * generous.
207 |  */
208 | #define SCOUTFS_BTREE_MAX_HEIGHT 20
209 | 
210 | struct scoutfs_btree_ref {
211 | 	__le64 blkno;
212 | 	__le64 seq;
213 | };
214 | 
215 | /*
216 |  * A height of X means that the first block read will have level X-1 and
217 |  * the leaves will have level 0.
218 |  */
219 | struct scoutfs_btree_root {
220 | 	struct scoutfs_btree_ref ref;
221 | 	__u8 height;
222 | 	__u8 __pad[7];
223 | };
224 | 
225 | struct scoutfs_btree_item {
226 | 	struct scoutfs_avl_node node;
227 | 	struct scoutfs_key key;
228 | 	__le16 val_off;
229 | 	__le16 val_len;
230 | 	__u8 __pad[4];
231 | };
232 | 
233 | struct scoutfs_btree_block {
234 | 	struct scoutfs_block_header hdr;
235 | 	struct scoutfs_avl_root item_root;
236 | 	__le16 nr_items;
237 | 	__le16 total_item_bytes;
238 | 	__le16 mid_free_len;
239 | 	__u8 level;
240 | 	__u8 __pad[7];
241 | 	struct scoutfs_btree_item items[0];
242 | 	/* leaf blocks have a fixed size item offset hash table at the end */
243 | };
244 | 
245 | #define SCOUTFS_BTREE_VALUE_ALIGN 8
246 | 
247 | /*
248 |  * Try to aim for a 75% load in a leaf full of items with no value.
249 |  * We'll almost never see this because most items have values and most
250 |  * blocks aren't full.
251 |  */
252 | #define SCOUTFS_BTREE_LEAF_ITEM_HASH_NR_UNALIGNED			  \
253 | 	((SCOUTFS_BLOCK_LG_SIZE - sizeof(struct scoutfs_btree_block)) /	  \
254 | 	 (sizeof(struct scoutfs_btree_item) + (sizeof(__le16))) * 100 / 75)
255 | #define SCOUTFS_BTREE_LEAF_ITEM_HASH_NR					  \
256 | 	(round_up(SCOUTFS_BTREE_LEAF_ITEM_HASH_NR_UNALIGNED,		  \
257 | 		  SCOUTFS_BTREE_VALUE_ALIGN))
258 | #define SCOUTFS_BTREE_LEAF_ITEM_HASH_BYTES \
259 | 	(SCOUTFS_BTREE_LEAF_ITEM_HASH_NR * sizeof(__le16))
260 | 
261 | struct scoutfs_alloc_list_ref {
262 | 	__le64 blkno;
263 | 	__le64 seq;
264 | };
265 | 
266 | /*
267 |  * first_nr tracks the nr of the first block in the list and is used for
268 |  * allocation sizing. total_nr is the sum of the nr of all the blocks in
269 |  * the list and is used for calculating total free block counts.
270 |  */
271 | struct scoutfs_alloc_list_head {
272 | 	struct scoutfs_alloc_list_ref ref;
273 | 	__le64 total_nr;
274 | 	__le32 first_nr;
275 | 	__u8 __pad[4];
276 | };
277 | 
278 | /*
279 |  * While the main allocator uses extent items in btree blocks, metadata
280 |  * allocations for a single transaction are recorded in arrays in
281 |  * blocks.  This limits the number of allocations and frees needed to
282 |  * cow and modify the structure.  The blocks can be stored in a list
283 |  * which lets us create a persistent log of pending frees that are
284 |  * generated as we cow btree blocks to insert freed extents.
285 |  *
286 |  * The array floats in the block so that both adding and removing blknos
287 |  * only modifies an index.
288 |  */
289 | struct scoutfs_alloc_list_block {
290 | 	struct scoutfs_block_header hdr;
291 | 	struct scoutfs_alloc_list_ref next;
292 | 	__le32 start;
293 | 	__le32 nr;
294 | 	__le64 blknos[0]; /* naturally aligned for sorting */
295 | };
296 | 
297 | #define SCOUTFS_ALLOC_LIST_MAX_BLOCKS					      \
298 | 	((SCOUTFS_BLOCK_LG_SIZE - sizeof(struct scoutfs_alloc_list_block)) /  \
299 | 	 (member_sizeof(struct scoutfs_alloc_list_block, blknos[0])))
300 | 
301 | /*
302 |  * These can safely be initialized to all-zeros.
303 |  */
304 | struct scoutfs_alloc_root {
305 | 	__le64 total_len;
306 | 	struct scoutfs_btree_root root;
307 | };
308 | 
309 | /* types of allocators, exposed to alloc_detail ioctl */
310 | #define SCOUTFS_ALLOC_OWNER_NONE	0
311 | #define SCOUTFS_ALLOC_OWNER_SERVER	1
312 | #define SCOUTFS_ALLOC_OWNER_MOUNT	2
313 | #define SCOUTFS_ALLOC_OWNER_SRCH	3
314 | 
315 | struct scoutfs_mounted_client_btree_val {
316 | 	__u8 flags;
317 | };
318 | 
319 | #define SCOUTFS_MOUNTED_CLIENT_VOTER	(1 << 0)
320 | 
321 | /*
322 |  * srch files are a contiguous run of blocks with compressed entries
323 |  * described by a dense parent radix.  The files can be stored in
324 |  * log_tree items when the files contain unsorted entries written by
325 |  * mounts during their transactions.  Sorted files of increasing size
326 |  * are kept in a btree off the super for searching and further
327 |  * compacting.
328 |  */
329 | struct scoutfs_srch_entry {
330 | 	__le64 hash;
331 | 	__le64 ino;
332 | 	__le64 id;
333 | };
334 | 
335 | #define SCOUTFS_SRCH_ENTRY_MAX_BYTES	(2 + (sizeof(__u64) * 3))
336 | 
337 | struct scoutfs_srch_ref {
338 | 	__le64 blkno;
339 | 	__le64 seq;
340 | };
341 | 
342 | struct scoutfs_srch_file {
343 | 	struct scoutfs_srch_entry first;
344 | 	struct scoutfs_srch_entry last;
345 | 	struct scoutfs_srch_ref ref;
346 | 	__le64 blocks;
347 | 	__le64 entries;
348 | 	__u8 height;
349 | 	__u8 __pad[7];
350 | };
351 | 
352 | struct scoutfs_srch_parent {
353 | 	struct scoutfs_block_header hdr;
354 | 	struct scoutfs_srch_ref refs[0];
355 | };
356 | 
357 | #define SCOUTFS_SRCH_PARENT_REFS				\
358 | 	((SCOUTFS_BLOCK_LG_SIZE -				\
359 | 	  offsetof(struct scoutfs_srch_parent, refs)) /		\
360 | 	 sizeof(struct scoutfs_srch_ref))
361 | 
362 | struct scoutfs_srch_block {
363 | 	struct scoutfs_block_header hdr;
364 | 	struct scoutfs_srch_entry first;
365 | 	struct scoutfs_srch_entry last;
366 | 	struct scoutfs_srch_entry tail;
367 | 	__le32 entry_nr;
368 | 	__le32 entry_bytes;
369 | 	__u8 entries[0];
370 | };
371 | 
372 | /*
373 |  * Decoding loads final small deltas with full __u64 loads.  Rather than
374 |  * check the size before each load we stop coding entries past the point
375 |  * where a full size entry could overflow the block.  A final entry can
376 |  * start at this byte count and consume the rest of the block, though
377 |  * it's unlikely.
378 |  */
379 | #define SCOUTFS_SRCH_BLOCK_SAFE_BYTES					\
380 | 	(SCOUTFS_BLOCK_LG_SIZE - sizeof(struct scoutfs_srch_block) -	\
381 | 	 SCOUTFS_SRCH_ENTRY_MAX_BYTES)
382 | 
383 | #define SCOUTFS_SRCH_LOG_BLOCK_LIMIT	(1024 * 1024 / SCOUTFS_BLOCK_LG_SIZE)
384 | #define SCOUTFS_SRCH_COMPACT_ORDER	2
385 | #define SCOUTFS_SRCH_COMPACT_NR		(1 << SCOUTFS_SRCH_COMPACT_ORDER)
386 | 
387 | /*
388 |  * A persistent record of a srch file compaction operation in progress.
389 |  *
390 |  * When compacting log files blk and pos aren't used.  When compacting
391 |  * sorted files blk is the logical block number and pos is the byte
392 |  * offset of the next entry.  When deleting files pos is the height of
393 |  * the level that we're deleting, and blk is the logical block offset of
394 |  * the next parent ref array index to descend through.
395 |  */
396 | struct scoutfs_srch_compact {
397 | 	struct scoutfs_alloc_list_head meta_avail;
398 | 	struct scoutfs_alloc_list_head meta_freed;
399 | 	__le64 id;
400 | 	__u8 nr;
401 | 	__u8 flags;
402 | 	__u8 __pad[6];
403 | 	struct scoutfs_srch_file out;
404 | 	struct scoutfs_srch_compact_input {
405 | 		struct scoutfs_srch_file sfl;
406 | 		__le64 blk;
407 | 		__le64 pos;
408 | 	} in[SCOUTFS_SRCH_COMPACT_NR];
409 | };
410 | 
411 | /* server -> client: combine input log file entries into output file */
412 | #define SCOUTFS_SRCH_COMPACT_FLAG_LOG		(1 << 0)
413 | /* server -> client: combine input sorted file entries into output file */
414 | #define SCOUTFS_SRCH_COMPACT_FLAG_SORTED	(1 << 1)
415 | /* server -> client: delete input files */
416 | #define SCOUTFS_SRCH_COMPACT_FLAG_DELETE	(1 << 2)
417 | /* client -> server: compaction phase (LOG,SORTED,DELETE) done */
418 | #define SCOUTFS_SRCH_COMPACT_FLAG_DONE		(1 << 4)
419 | /* client -> server: compaction failed */
420 | #define SCOUTFS_SRCH_COMPACT_FLAG_ERROR		(1 << 5)
421 | 
422 | /*
423 |  * XXX I imagine we should rename these now that they've evolved to track
424 |  * all the btrees that clients use during a transaction.  It's not just
425 |  * about item logs, it's about clients making changes to trees.
426 |  */
427 | struct scoutfs_log_trees {
428 | 	struct scoutfs_alloc_list_head meta_avail;
429 | 	struct scoutfs_alloc_list_head meta_freed;
430 | 	struct scoutfs_btree_root item_root;
431 | 	struct scoutfs_btree_ref bloom_ref;
432 | 	struct scoutfs_alloc_root data_avail;
433 | 	struct scoutfs_alloc_root data_freed;
434 | 	struct scoutfs_srch_file srch_file;
435 | 	__le64 max_item_vers;
436 | 	__le64 rid;
437 | 	__le64 nr;
438 | };
439 | 
440 | struct scoutfs_log_item_value {
441 | 	__le64 vers;
442 | 	__u8 flags;
443 | 	__u8 __pad[7];
444 | 	__u8 data[0];
445 | };
446 | 
447 | /*
448 |  * FS items are limited by the max btree value length with the log item
449 |  * value header.
450 |  */
451 | #define SCOUTFS_MAX_VAL_SIZE \
452 | 	(SCOUTFS_BTREE_MAX_VAL_LEN - sizeof(struct scoutfs_log_item_value))
453 | 
454 | #define SCOUTFS_LOG_ITEM_FLAG_DELETION		(1 << 0)
455 | 
456 | struct scoutfs_bloom_block {
457 | 	struct scoutfs_block_header hdr;
458 | 	__le64 total_set;
459 | 	__le64 bits[0];
460 | };
461 | 
462 | /*
463 |  * Item log trees are accompanied by a block of bits that make up a
464 |  * bloom filter which indicate if the item log trees may contain items
465 |  * covered by a lock.  The log trees should be finalized and merged long
466 |  * before the bloom filters fill up and start returning excessive false
467 |  * positives.
468 |  */
469 | #define SCOUTFS_FOREST_BLOOM_NRS		3
470 | #define SCOUTFS_FOREST_BLOOM_BITS \
471 | 	(((SCOUTFS_BLOCK_LG_SIZE - sizeof(struct scoutfs_bloom_block)) /  \
472 | 	 member_sizeof(struct scoutfs_bloom_block, bits[0])) *		  \
473 | 	 member_sizeof(struct scoutfs_bloom_block, bits[0]) * 8)
474 | #define SCOUTFS_FOREST_BLOOM_FUNC_BITS		(SCOUTFS_BLOCK_LG_SHIFT + 3)
475 | 
476 | /*
477 |  * Keys are first sorted by major key zones.
478 |  */
479 | #define SCOUTFS_INODE_INDEX_ZONE		1
480 | #define SCOUTFS_RID_ZONE			2
481 | #define SCOUTFS_FS_ZONE				3
482 | #define SCOUTFS_LOCK_ZONE			4
483 | /* Items only stored in server btrees */
484 | #define SCOUTFS_LOG_TREES_ZONE			6
485 | #define SCOUTFS_LOCK_CLIENTS_ZONE		7
486 | #define SCOUTFS_TRANS_SEQ_ZONE			8
487 | #define SCOUTFS_MOUNTED_CLIENT_ZONE		9
488 | #define SCOUTFS_SRCH_ZONE			10
489 | #define SCOUTFS_FREE_EXTENT_ZONE		11
490 | 
491 | /* inode index zone */
492 | #define SCOUTFS_INODE_INDEX_META_SEQ_TYPE	1
493 | #define SCOUTFS_INODE_INDEX_DATA_SEQ_TYPE	2
494 | #define SCOUTFS_INODE_INDEX_NR			3 /* don't forget to update */
495 | 
496 | /* rid zone (also used in server alloc btree) */
497 | #define SCOUTFS_ORPHAN_TYPE			1
498 | 
499 | /* fs zone */
500 | #define SCOUTFS_INODE_TYPE			1
501 | #define SCOUTFS_XATTR_TYPE			2
502 | #define SCOUTFS_DIRENT_TYPE			3
503 | #define SCOUTFS_READDIR_TYPE			4
504 | #define SCOUTFS_LINK_BACKREF_TYPE		5
505 | #define SCOUTFS_SYMLINK_TYPE			6
506 | #define SCOUTFS_DATA_EXTENT_TYPE		7
507 | 
508 | /* lock zone, only ever found in lock ranges, never in persistent items */
509 | #define SCOUTFS_RENAME_TYPE			1
510 | 
511 | /* srch zone, only in server btrees */
512 | #define SCOUTFS_SRCH_LOG_TYPE		1
513 | #define SCOUTFS_SRCH_BLOCKS_TYPE	2
514 | #define SCOUTFS_SRCH_PENDING_TYPE	3
515 | #define SCOUTFS_SRCH_BUSY_TYPE		4
516 | 
517 | /* free extents in allocator btrees in client and server, by blkno or len */
518 | #define SCOUTFS_FREE_EXTENT_BLKNO_TYPE	1
519 | #define SCOUTFS_FREE_EXTENT_LEN_TYPE	2
520 | 
521 | /* file data extents have end and len in key */
522 | struct scoutfs_data_extent_val {
523 | 	__le64 blkno;
524 | 	__u8 flags;
525 | 	__u8 __pad[7];
526 | };
527 | 
528 | #define SEF_OFFLINE	(1 << 0)
529 | #define SEF_UNWRITTEN	(1 << 1)
530 | #define SEF_UNKNOWN	(U8_MAX << 2)
531 | 
532 | /*
533 |  * The first xattr part item has a header that describes the xattr.  The
534 |  * name and value are then packed into the following bytes in the first
535 |  * part item and overflow into the values of the rest of the part items.
536 |  */
537 | struct scoutfs_xattr {
538 | 	__le16 val_len;
539 | 	__u8 name_len;
540 | 	__u8 __pad[5];
541 | 	__u8 name[0];
542 | };
543 | 
544 | 
545 | /* XXX does this exist upstream somewhere? */
546 | #define member_sizeof(TYPE, MEMBER) (sizeof(((TYPE *)0)->MEMBER))
547 | 
548 | #define SCOUTFS_UUID_BYTES 16
549 | 
550 | /*
551 |  * Mounts read all the quorum blocks and write to one random quorum
552 |  * block during a cycle.  The min cycle time limits the per-mount iop
553 |  * load during elections.  The random cycle delay makes it less likely
554 |  * that mounts will read and write at the same time and miss each
555 |  * other's writes.  An election only completes if a quorum of mounts
556 |  * vote for a leader before any of their elections timeout.  This is
557 |  * made less likely by the probability that mounts will overwrite each
558 |  * other's random block locations.  The max quorum count limits that
559 |  * probability.  9 mounts only have a 55% chance of writing to unique 4k
560 |  * blocks in a 256k region.  The election timeout is set to include
561 |  * enough cycles to usually complete the election.  Once a leader is
562 |  * elected it spends a number of cycles writing out blocks with itself
563 |  * logged as a leader.  This reduces the possibility that servers
564 |  * will have their log entries overwritten and not be fenced.
565 |  */
566 | #define SCOUTFS_QUORUM_MAX_COUNT		9
567 | #define SCOUTFS_QUORUM_CYCLE_LO_MS		10
568 | #define SCOUTFS_QUORUM_CYCLE_HI_MS		20
569 | #define SCOUTFS_QUORUM_TERM_LO_MS		250
570 | #define SCOUTFS_QUORUM_TERM_HI_MS		500
571 | #define SCOUTFS_QUORUM_ELECTED_LOG_CYCLES	10
572 | 
573 | struct scoutfs_quorum_block {
574 | 	__le64 fsid;
575 | 	__le64 blkno;
576 | 	__le64 term;
577 | 	__le64 write_nr;
578 | 	__le64 voter_rid;
579 | 	__le64 vote_for_rid;
580 | 	__le32 crc;
581 | 	__u8 log_nr;
582 | 	__u8 __pad[3];
583 | 	struct scoutfs_quorum_log {
584 | 		__le64 term;
585 | 		__le64 rid;
586 | 		struct scoutfs_inet_addr addr;
587 | 	} log[0];
588 | };
589 | 
590 | #define SCOUTFS_QUORUM_LOG_MAX						  \
591 | 	((SCOUTFS_BLOCK_SM_SIZE - sizeof(struct scoutfs_quorum_block)) /  \
592 | 		sizeof(struct scoutfs_quorum_log))
593 | 
594 | #define SCOUTFS_FLAG_IS_META_BDEV 0x01
595 | 
596 | struct scoutfs_super_block {
597 | 	struct scoutfs_block_header hdr;
598 | 	__le64 id;
599 | 	__le64 format_hash;
600 | 	__le64 flags;
601 | 	__u8 uuid[SCOUTFS_UUID_BYTES];
602 | 	__le64 next_ino;
603 | 	__le64 next_trans_seq;
604 | 	__le64 total_meta_blocks;	/* both static and dynamic */
605 | 	__le64 first_meta_blkno;	/* first dynamically allocated */
606 | 	__le64 last_meta_blkno;
607 | 	__le64 total_data_blocks;
608 | 	__le64 first_data_blkno;
609 | 	__le64 last_data_blkno;
610 | 	__le64 quorum_fenced_term;
611 | 	__le64 quorum_server_term;
612 | 	__le64 unmount_barrier;
613 | 	__u8 quorum_count;
614 | 	__u8 __pad[7];
615 | 	struct scoutfs_inet_addr server_addr;
616 | 	struct scoutfs_alloc_root meta_alloc[2];
617 | 	struct scoutfs_alloc_root data_alloc;
618 | 	struct scoutfs_alloc_list_head server_meta_avail[2];
619 | 	struct scoutfs_alloc_list_head server_meta_freed[2];
620 | 	struct scoutfs_btree_root fs_root;
621 | 	struct scoutfs_btree_root logs_root;
622 | 	struct scoutfs_btree_root lock_clients;
623 | 	struct scoutfs_btree_root trans_seqs;
624 | 	struct scoutfs_btree_root mounted_clients;
625 | 	struct scoutfs_btree_root srch_root;
626 | };
627 | 
628 | #define SCOUTFS_ROOT_INO 1
629 | 
630 | 
631 | /*
632 |  * @meta_seq: advanced the first time an inode is updated in a given
633 |  * transaction.  It can only advance again after the inode is written
634 |  * and a new transaction opens.
635 |  *
636 |  * @data_seq: advanced the first time a file's data (or size) is
637 |  * modified in a given transaction.  It can only advance again after the
638 |  * file is written and a new transaction opens.
639 |  *
640 |  * @data_version: incremented every time the contents of a file could
641 |  * have changed.  It is exposed via an ioctl and is then provided as an
642 |  * argument to data functions to protect racing modification.
643 |  *
644 |  * @online_blocks: The number of fixed 4k blocks currently allocated and
645 |  * storing data in the volume.
646 |  *
647 |  * @offline_blocks: The number of fixed 4k blocks that could be made
648 |  * online by staging.
649 |  *
650 |  * XXX
651 |  *	- otime?
652 |  *	- compat flags?
653 |  *	- version?
654 |  *	- generation?
655 |  *	- be more careful with rdev?
656 |  */
657 | struct scoutfs_inode {
658 | 	__le64 size;
659 | 	__le64 meta_seq;
660 | 	__le64 data_seq;
661 | 	__le64 data_version;
662 | 	__le64 online_blocks;
663 | 	__le64 offline_blocks;
664 | 	__le64 next_readdir_pos;
665 | 	__le64 next_xattr_id;
666 | 	__le32 nlink;
667 | 	__le32 uid;
668 | 	__le32 gid;
669 | 	__le32 mode;
670 | 	__le32 rdev;
671 | 	__le32 flags;
672 | 	struct scoutfs_timespec atime;
673 | 	struct scoutfs_timespec ctime;
674 | 	struct scoutfs_timespec mtime;
675 | };
676 | 
677 | #define SCOUTFS_INO_FLAG_TRUNCATE 0x1
678 | 
679 | #define SCOUTFS_ROOT_INO 1
680 | 
681 | /* like the block size, a reasonable min PATH_MAX across platforms */
682 | #define SCOUTFS_SYMLINK_MAX_SIZE 4096
683 | 
684 | /*
685 |  * Dirents are stored in multiple places to isolate contention when
686 |  * performing different operations: hashed by name for creation and
687 |  * lookup, at incrementing positions for readdir and resolving inodes to
688 |  * paths.  Each entry has all the metadata needed to reference all the
689 |  * items (so an entry cached by lookup can be used to unlink all the
690 |  * items).
691 |  */
692 | struct scoutfs_dirent {
693 | 	__le64 ino;
694 | 	__le64 hash;
695 | 	__le64 pos;
696 | 	__u8 type;
697 | 	__u8 __pad[7];
698 | 	__u8 name[0];
699 | };
700 | 
701 | #define SCOUTFS_NAME_LEN 255
702 | 
703 | /* S32_MAX avoids the (int) sign bit and might avoid sloppy bugs */
704 | #define SCOUTFS_LINK_MAX S32_MAX
705 | 
706 | /* entries begin after . and .. */
707 | #define SCOUTFS_DIRENT_FIRST_POS 2
708 | /* getdents returns next pos with an entry, no entry at (f_pos)~0 */
709 | #define SCOUTFS_DIRENT_LAST_POS (U64_MAX - 1)
710 | 
711 | enum scoutfs_dentry_type {
712 | 	SCOUTFS_DT_FIFO = 0,
713 | 	SCOUTFS_DT_CHR,
714 | 	SCOUTFS_DT_DIR,
715 | 	SCOUTFS_DT_BLK,
716 | 	SCOUTFS_DT_REG,
717 | 	SCOUTFS_DT_LNK,
718 | 	SCOUTFS_DT_SOCK,
719 | 	SCOUTFS_DT_WHT,
720 | };
721 | 
722 | 
723 | #define SCOUTFS_XATTR_MAX_NAME_LEN	255
724 | #define SCOUTFS_XATTR_MAX_VAL_LEN	65535
725 | #define SCOUTFS_XATTR_MAX_PART_SIZE	SCOUTFS_MAX_VAL_SIZE
726 | /* number of part items needed to hold the xattr header, name and value */
727 | #define SCOUTFS_XATTR_NR_PARTS(name_len, val_len)			\
728 | 	DIV_ROUND_UP(sizeof(struct scoutfs_xattr) + (name_len) + (val_len), \
729 | 		     (unsigned int)SCOUTFS_XATTR_MAX_PART_SIZE)
730 | 
731 | #define SCOUTFS_LOCK_INODE_GROUP_NR	1024
732 | #define SCOUTFS_LOCK_INODE_GROUP_MASK	(SCOUTFS_LOCK_INODE_GROUP_NR - 1)
733 | #define SCOUTFS_LOCK_SEQ_GROUP_MASK	((1ULL << 10) - 1)
734 | 
735 | /*
736 |  * messages over the wire.
737 |  */
738 | 
739 | /*
740 |  * Greetings verify identity of communicating nodes.  The sender sends
741 |  * their credentials and the receiver verifies them.
742 |  *
743 |  * @server_term: The raft term that elected the server.  Initially 0
744 |  * from the client, sent by the server, then sent by the client as it
745 |  * tries to reconnect.  Used to identify a client reconnecting to both
746 |  * the same server after receiving a greeting response and to a new
747 |  * server after failover.
748 |  *
749 |  * @unmount_barrier: Incremented every time the remaining majority of
750 |  * quorum members all agree to leave.  The server tells a quorum member
751 |  * the value that it's connecting under so that if the client sees the
752 |  * value increase in the super block then it knows that the server has
753 |  * processed its farewell and can safely unmount.
754 |  *
755 |  * @rid: The client's random id that was generated once as the mount
756 |  * started up.  This identifies a specific remote mount across
757 |  * connections and servers.  It's set to the client's rid in both the
758 |  * request and response for consistency.
759 |  */
760 | struct scoutfs_net_greeting {
761 | 	__le64 fsid;
762 | 	__le64 format_hash;
763 | 	__le64 server_term;
764 | 	__le64 unmount_barrier;
765 | 	__le64 rid;
766 | 	__le64 flags;
767 | };
768 | 
769 | #define SCOUTFS_NET_GREETING_FLAG_FAREWELL	(1 << 0)
770 | #define SCOUTFS_NET_GREETING_FLAG_VOTER		(1 << 1)
771 | #define SCOUTFS_NET_GREETING_FLAG_INVALID	(~(__u64)0 << 2)
772 | 
773 | /*
774 |  * This header precedes and describes all network messages sent over
775 |  * sockets.
776 |  *
777 |  * @seq: A sequence number that is increased for each message queued for
778 |  * send on the sender.  The sender will never reorder messages in the
779 |  * send queue so this will always increase in recv on the receiver.  The
780 |  * receiver can use this to drop messages that arrived twice after being
781 |  * resent across a newly connected socket for a given connection.
782 |  *
783 |  * @recv_seq: The sequence number of the last received message.  The
784 |  * receiver is sending this to the sender in every message.  The sender
785 |  * uses them to drop responses which have been delivered.
786 |  *
787 |  * @id: An increasing identifier that is set in each request.  Responses
788 |  * specify the request that they're responding to.
789 |  *
790 |  * Error is only set to a translated errno and will only be found in
791 |  * response messages.
792 |  */
793 | struct scoutfs_net_header {
794 | 	__le64 clock_sync_id;
795 | 	__le64 seq;
796 | 	__le64 recv_seq;
797 | 	__le64 id;
798 | 	__le16 data_len;
799 | 	__u8 cmd;
800 | 	__u8 flags;
801 | 	__u8 error;
802 | 	__u8 __pad[3];
803 | 	__u8 data[0];
804 | };
805 | 
806 | #define SCOUTFS_NET_FLAG_RESPONSE	(1 << 0)
807 | #define SCOUTFS_NET_FLAGS_UNKNOWN	(U8_MAX << 1)
808 | 
809 | enum scoutfs_net_cmd {
810 | 	SCOUTFS_NET_CMD_GREETING = 0,
811 | 	SCOUTFS_NET_CMD_ALLOC_INODES,
812 | 	SCOUTFS_NET_CMD_GET_LOG_TREES,
813 | 	SCOUTFS_NET_CMD_COMMIT_LOG_TREES,
814 | 	SCOUTFS_NET_CMD_GET_ROOTS,
815 | 	SCOUTFS_NET_CMD_ADVANCE_SEQ,
816 | 	SCOUTFS_NET_CMD_GET_LAST_SEQ,
817 | 	SCOUTFS_NET_CMD_LOCK,
818 | 	SCOUTFS_NET_CMD_LOCK_RECOVER,
819 | 	SCOUTFS_NET_CMD_SRCH_GET_COMPACT,
820 | 	SCOUTFS_NET_CMD_SRCH_COMMIT_COMPACT,
821 | 	SCOUTFS_NET_CMD_FAREWELL,
822 | 	SCOUTFS_NET_CMD_UNKNOWN,
823 | };
824 | 
825 | /*
826 |  * Define a macro to evaluate another macro for each of the errnos we
827 |  * translate over the wire.  This lets us keep our enum in sync with the
828 |  * mapping arrays to and from host errnos.
829 |  */
830 | #define EXPAND_EACH_NET_ERRNO		\
831 | 	EXPAND_NET_ERRNO(ENOENT)	\
832 | 	EXPAND_NET_ERRNO(ENOMEM)	\
833 | 	EXPAND_NET_ERRNO(EIO)		\
834 | 	EXPAND_NET_ERRNO(ENOSPC)	\
835 | 	EXPAND_NET_ERRNO(EINVAL)
836 | 
837 | #undef EXPAND_NET_ERRNO
838 | #define EXPAND_NET_ERRNO(which) SCOUTFS_NET_ERR_##which,
839 | enum scoutfs_net_errors {
840 | 	SCOUTFS_NET_ERR_NONE = 0,
841 | 	EXPAND_EACH_NET_ERRNO
842 | 	SCOUTFS_NET_ERR_UNKNOWN,
843 | };
844 | 
845 | /* arbitrarily chosen to be safely less than mss and allow 1k with header */
846 | #define SCOUTFS_NET_MAX_DATA_LEN 1100
847 | 
848 | /*
849 |  * When there are no more free inodes this will be sent with ino = ~0 and
850 |  * nr = 0.
851 |  */
852 | struct scoutfs_net_inode_alloc {
853 | 	__le64 ino;
854 | 	__le64 nr;
855 | };
856 | 
857 | struct scoutfs_net_roots {
858 | 	struct scoutfs_btree_root fs_root;
859 | 	struct scoutfs_btree_root logs_root;
860 | 	struct scoutfs_btree_root srch_root;
861 | };
862 | 
863 | struct scoutfs_net_lock {
864 | 	struct scoutfs_key key;
865 | 	__le64 write_version;
866 | 	__u8 old_mode;
867 | 	__u8 new_mode;
868 | 	__u8 __pad[6];
869 | };
870 | 
871 | struct scoutfs_net_lock_grant_response {
872 | 	struct scoutfs_net_lock nl;
873 | 	struct scoutfs_net_roots roots;
874 | };
875 | 
876 | struct scoutfs_net_lock_recover {
877 | 	__le16 nr;
878 | 	__u8 __pad[6];
879 | 	struct scoutfs_net_lock locks[0];
880 | };
881 | 
882 | #define SCOUTFS_NET_LOCK_MAX_RECOVER_NR					       \
883 | 	((SCOUTFS_NET_MAX_DATA_LEN - sizeof(struct scoutfs_net_lock_recover)) /\
884 | 	 sizeof(struct scoutfs_net_lock))
885 | 
886 | /* some enums for tracing */
887 | enum scoutfs_lock_trace {
888 | 	SLT_CLIENT,
889 | 	SLT_SERVER,
890 | 	SLT_GRANT,
891 | 	SLT_INVALIDATE,
892 | 	SLT_REQUEST,
893 | 	SLT_RESPONSE,
894 | };
895 | 
896 | /*
897 |  * Read and write locks operate as you'd expect.  Multiple readers can
898 |  * hold read locks while writers are excluded.  A single writer can hold
899 |  * a write lock which excludes other readers and writers.  Writers can
900 |  * read while holding a write lock.
901 |  *
902 |  * Multiple writers can hold write only locks but they can not read,
903 |  * they can only generate dirty items.  It's used when the system has
904 |  * other means of knowing that it's safe to overwrite items.
905 |  *
906 |  * The null mode provides no access and is used to destroy locks.
907 |  */
908 | enum scoutfs_lock_mode {
909 | 	SCOUTFS_LOCK_NULL = 0,
910 | 	SCOUTFS_LOCK_READ,
911 | 	SCOUTFS_LOCK_WRITE,
912 | 	SCOUTFS_LOCK_WRITE_ONLY,
913 | 	SCOUTFS_LOCK_INVALID,
914 | };
915 | 
916 | /*
917 |  * Scoutfs file handle structure - this can be copied out to userspace
918 |  * via open by handle or put on the wire from NFS.
919 |  */
920 | struct scoutfs_fid {
921 | 	__le64 ino;
922 | 	__le64 parent_ino;
923 | };
924 | 
925 | #define FILEID_SCOUTFS			0x81
926 | #define FILEID_SCOUTFS_WITH_PARENT	0x82
927 | 
928 | /*
929 |  * Identifiers for sources of corruption that can generate messages.
930 |  */
931 | enum scoutfs_corruption_sources {
932 | 	SC_DIRENT_NAME_LEN = 0,
933 | 	SC_DIRENT_BACKREF_NAME_LEN,
934 | 	SC_DIRENT_READDIR_NAME_LEN,
935 | 	SC_SYMLINK_INODE_SIZE,
936 | 	SC_SYMLINK_MISSING_ITEM,
937 | 	SC_SYMLINK_NOT_NULL_TERM,
938 | 	SC_BTREE_BLOCK_LEVEL,
939 | 	SC_BTREE_NO_CHILD_REF,
940 | 	SC_INODE_BLOCK_COUNTS,
941 | 	SC_NR_SOURCES,
942 | };
943 | 
944 | #define SC_NR_LONGS DIV_ROUND_UP(SC_NR_SOURCES, BITS_PER_LONG)
945 | 
946 | #endif
947 | 


--------------------------------------------------------------------------------
/src/hash.h:
--------------------------------------------------------------------------------
 1 | #ifndef _SCOUTFS_HASH_H_
 2 | #define _SCOUTFS_HASH_H_
 3 | 
 4 | /*
 5 |  * We're using FNV1a for now.  It's fine.  Ish.
 6 |  *
 7 |  * The longer term plan is xxh3 but it looks like it'll take just a bit
 8 |  * more time to be declared stable and then it needs to be ported to the
 9 |  * kernel.
10 |  *
11 |  *  - https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html
12 |  *  - https://github.com/Cyan4973/xxHash/releases/tag/v0.7.4
13 |  */
14 | 
15 | /* 32-bit FNV-1a of the len bytes at data. */
16 | static inline u32 fnv1a32(const void *data, unsigned int len)
17 | {
18 | 	const u8 *byte = data;	/* typed cursor: no void * math, keeps const */
19 | 	u32 hash = 0x811c9dc5;	/* FNV-1a 32-bit offset basis */
20 | 	unsigned int i;
21 | 
22 | 	for (i = 0; i < len; i++)
23 | 		hash = (hash ^ byte[i]) * 0x01000193;	/* xor, then FNV prime */
24 | 	return hash;
25 | }
26 | 
27 | /* 64-bit FNV-1a of the len bytes at data. */
28 | static inline u64 fnv1a64(const void *data, unsigned int len)
29 | {
30 | 	const u8 *byte = data;	/* typed cursor: no void * math, keeps const */
31 | 	u64 hash = 0xcbf29ce484222325ULL;	/* FNV-1a 64-bit offset basis */
32 | 	unsigned int i;
33 | 
34 | 	for (i = 0; i < len; i++)
35 | 		hash = (hash ^ byte[i]) * 0x100000001b3ULL; /* xor, then FNV prime */
36 | 	return hash;
37 | }
38 | 
39 | static inline u32 scoutfs_hash32(const void *data, unsigned int len)
40 | {
41 | 	return fnv1a32(data, len);	/* thin wrapper: FNV-1a is the current hash */
42 | }
43 | 
44 | static inline u64 scoutfs_hash64(const void *data, unsigned int len)
45 | {
46 | 	return fnv1a64(data, len);	/* thin wrapper: FNV-1a is the current hash */
47 | }
48 | 
49 | #endif
50 | 


--------------------------------------------------------------------------------
/src/ino_path.c:
--------------------------------------------------------------------------------
  1 | #include <unistd.h>
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include <sys/types.h>
  5 | #include <sys/stat.h>
  6 | #include <sys/ioctl.h>
  7 | #include <fcntl.h>
  8 | #include <errno.h>
  9 | #include <string.h>
 10 | #include <limits.h>
 11 | 
 12 | #include "sparse.h"
 13 | #include "util.h"
 14 | #include "format.h"
 15 | #include "ioctl.h"
 16 | #include "cmd.h"
 17 | 
 18 | /* Print every path that links to inode <ino>; returns 0 or a negative errno. */
 19 | static int ino_path_cmd(int argc, char **argv)
 20 | {
 21 | 	struct scoutfs_ioctl_ino_path_result *res;
 22 | 	struct scoutfs_ioctl_ino_path args = {0}; /* no stack garbage in _pad */
 23 | 	unsigned int result_bytes;
 24 | 	char *endptr = NULL;
 25 | 	u64 ino;
 26 | 	int ret;
 27 | 	int fd;
 28 | 
 29 | 	if (argc != 3) {
 30 | 		fprintf(stderr, "must specify ino and path\n");
 31 | 		return -EINVAL;
 32 | 	}
 33 | 
 34 | 	/* strtoull() only reports overflow through errno, so clear it first */
 35 | 	errno = 0;
 36 | 	ino = strtoull(argv[1], &endptr, 0);
 37 | 	if (endptr == argv[1] || *endptr != '\0' || errno == ERANGE) {
 38 | 		fprintf(stderr, "error parsing inode number '%s'\n",
 39 | 			argv[1]);
 40 | 		return -EINVAL;
 41 | 	}
 42 | 
 43 | 	fd = open(argv[2], O_RDONLY);
 44 | 	if (fd < 0) {
 45 | 		ret = -errno;
 46 | 		fprintf(stderr, "failed to open '%s': %s (%d)\n",
 47 | 			argv[2], strerror(errno), errno);
 48 | 		return ret;
 49 | 	}
 50 | 
 51 | 	/* room for the result header plus PATH_MAX bytes of path */
 52 | 	result_bytes = offsetof(struct scoutfs_ioctl_ino_path_result,
 53 | 				path[PATH_MAX]);
 54 | 	res = malloc(result_bytes);
 55 | 	if (!res) {
 56 | 		fprintf(stderr, "couldn't allocate %u byte buffer\n",
 57 | 			result_bytes);
 58 | 		ret = -ENOMEM;
 59 | 		goto out;
 60 | 	}
 61 | 
 62 | 	args.ino = ino;
 63 | 	args.dir_ino = 0;
 64 | 	args.dir_pos = 0;
 65 | 	args.result_ptr = (unsigned long)res; /* zero-extend, never sign-extend */
 66 | 	args.result_bytes = result_bytes;
 67 | 	for (;;) {
 68 | 		ret = ioctl(fd, SCOUTFS_IOC_INO_PATH, &args);
 69 | 		if (ret < 0) {
 70 | 			/* ENOENT just means there are no more paths */
 71 | 			ret = errno == ENOENT ? 0 : -errno;
 72 | 			break;
 73 | 		}
 74 | 
 75 | 		printf("%.*s\n", res->path_bytes, res->path);
 76 | 
 77 | 		/* resume just past the entry that was returned */
 78 | 		args.dir_ino = res->dir_ino;
 79 | 		args.dir_pos = res->dir_pos;
 80 | 		if (++args.dir_pos == 0) {
 81 | 			if (++args.dir_ino == 0)
 82 | 				break;
 83 | 		}
 84 | 	}
 85 | 
 86 | 	if (ret < 0)
 87 | 		fprintf(stderr, "ino_path ioctl failed: %s (%d)\n",
 88 | 			strerror(-ret), -ret);
 89 | out:
 90 | 	free(res);
 91 | 	close(fd);
 92 | 	return ret;
 93 | }
 94 | /* constructor attribute: registers the "ino-path" command before main() */
 95 | static void __attribute__((constructor)) ino_path_ctor(void)
 96 | {
 97 | 	cmd_register("ino-path", "<ino> <path>",
 98 | 		     "print paths that refer to inode #", ino_path_cmd);
 99 | }
100 | 


--------------------------------------------------------------------------------
/src/ioctl.h:
--------------------------------------------------------------------------------
  1 | #ifndef _SCOUTFS_IOCTL_H_
  2 | #define _SCOUTFS_IOCTL_H_
  3 | 
  4 | /*
  5 |  * We naturally align explicit width fields in the ioctl structs so that
  6 |  * userspace doesn't need to deal with padding or unaligned packing and
  7 |  * we don't have to deal with 32/64 compat.  It makes it a little
  8 |  * awkward to communicate persistent packed structs through the ioctls
  9 |  * but that happens very rarely.  An interesting special case are
 10 |  * 0length arrays that follow the structs.  We make those start at the
 11 |  * next aligned offset of the struct to be safe.
 12 |  *
 13 |  * This is enforced by pahole scripting in external build environments.
 14 |  */
 15 | 
 16 | /* XXX I have no idea how these are chosen. */
 17 | #define SCOUTFS_IOCTL_MAGIC 's'
 18 | 
 19 | /*
 20 |  * Packed scoutfs keys rarely cross the ioctl boundary so we have a
 21 |  * translation struct.
 22 |  */
 23 | struct scoutfs_ioctl_key {
 24 | 	__le64	_sk_first;
 25 | 	__le64	_sk_second;
 26 | 	__le64	_sk_third;
 27 | 	__u8	_sk_fourth;
 28 | 	__u8	sk_type;
 29 | 	__u8	sk_zone;
 30 | 	__u8	_pad[5];
 31 | };
 32 | 
 33 | struct scoutfs_ioctl_walk_inodes_entry {
 34 | 	__u64 major;
 35 | 	__u64 ino;
 36 | 	__u32 minor;
 37 | 	__u8  _pad[4];
 38 | };
 39 | 
 40 | /*
 41 |  * Walk inodes in an index that is sorted by one of their fields.
 42 |  *
 43 |  * Each index is built from generic index items that have major and
 44 |  * minor values that are set to the field being indexed.  In time
 45 |  * indices, for example, major is seconds and minor is nanoseconds.
 46 |  *
 47 |  * @first       The first index entry that can be returned.
 48 |  * @last        The last index entry that can be returned.
 49 |  * @entries_ptr Pointer to emory containing buffer for entry results.
 50 |  * @nr_entries  The number of entries that can fit in the buffer.
 51 |  * @index       Which index to walk, enumerated in _WALK_INODES_ constants.
 52 |  *
 53 |  * To start iterating first can be memset to 0 and last to 0xff.  Then
 54 |  * after each set of results first can be set to the last entry returned
 55 |  * and then the fields can be incremented in reverse sort order (ino <
 56 |  * minor < major) as each increasingly significant value wraps around to
 57 |  * 0.
 58 |  *
 59 |  * These indexes are not strictly consistent.  The items that back these
 60 |  * index entries aren't updated with cluster locks so they're not
 61 |  * guaranteed to be visible the moment you read after writing.  They're
 62 |  * only visible when the transaction that updated them is synced.
 63 |  *
 64 |  * In addition, the seq indexes will only allow walking through sequence
 65 |  * space that has been consistent.  This prevents old dirty entries from
 66 |  * becoming visible after newer stable entries are displayed.
 67 |  *
 68 |  * If first is greater than last then the walk will return 0 entries.
 69 |  *
 70 |  * XXX invalidate before reading.
 71 |  */
 72 | struct scoutfs_ioctl_walk_inodes {
 73 | 	struct scoutfs_ioctl_walk_inodes_entry first;
 74 | 	struct scoutfs_ioctl_walk_inodes_entry last;
 75 | 	__u64 entries_ptr;
 76 | 	__u32 nr_entries;
 77 | 	__u8 index;
 78 | 	__u8 _pad[11]; /* padded to align walk_inodes_entry total size */
 79 | };
 80 | 
 81 | enum scoutfs_ino_walk_seq_type {
 82 | 	SCOUTFS_IOC_WALK_INODES_META_SEQ = 0,
 83 | 	SCOUTFS_IOC_WALK_INODES_DATA_SEQ,
 84 | 	SCOUTFS_IOC_WALK_INODES_UNKNOWN,
 85 | };
 86 | 
 87 | /*
 88 |  * Adds entries to the user's buffer for each inode that is found in the
 89 |  * given index between the first and last positions.
 90 |  */
 91 | #define SCOUTFS_IOC_WALK_INODES _IOR(SCOUTFS_IOCTL_MAGIC, 1, \
 92 | 				     struct scoutfs_ioctl_walk_inodes)
 93 | 
 94 | /*
 95 |  * Fill the result buffer with the next absolute path to the target
 96 |  * inode searching from a given position in a parent directory.
 97 |  *
 98 |  * @ino: The target ino that we're finding paths to.  Constant across
 99 |  * all the calls that make up an iteration over all the inode's paths.
100 |  *
101 |  * @dir_ino: The inode number of the directory containing the entry to
102 |  * our inode to search from.  If this parent directory contains no more
103 |  * entries to our inode then we'll search through other parent directory
104 |  * inodes in inode order.
105 |  *
106 |  * @dir_pos: The position in the dir_ino parent directory of the entry
107 |  * to our inode to search from.  If there is no entry at this position
108 |  * then we'll search through other entry positions in increasing order.
109 |  * If we exhaust the parent directory then we'll search through
110 |  * additional parent directories in inode order.
111 |  *
112 |  * @result_ptr: A pointer to the buffer where the result struct and
113 |  * absolute path will be stored.
114 |  *
115 |  * @result_bytes: The size of the buffer that will contain the result
116 |  * struct and the null terminated absolute path name.
117 |  *
118 |  * To start iterating set the desired target inode, dir_ino to 0,
119 |  * dir_pos to 0, and set result_ptr and _bytes to a sufficiently large
120 |  * buffeer (sizeof(result) + PATH_MAX is a solid choice).
121 |  *
122 |  * After each returned result set the next search dir_ino and dir_pos to
123 |  * the returned dir_ino and dir_pos.  Then increment the search dir_pos,
124 |  * and if it wrapped to 0, increment dir_ino.
125 |  *
126 |  * This only walks back through full hard links.  None of the returned
127 |  * paths will reflect symlinks to components in the path.
128 |  *
129 |  * This doesn't ensure that the caller has permissions to traverse the
130 |  * returned paths to the inode.  It requires CAP_DAC_READ_SEARCH which
131 |  * bypasses permissions checking.
132 |  *
133 |  * This call is not serialized with any modification (create, rename,
134 |  * unlink) of the path components.  It will return all the paths that
135 |  * were stable both before and after the call.  It may or may not return
136 |  * paths which are created or unlinked during the call.
137 |  *
138 |  * On success 0 is returned and result struct is filled with the next
139 |  * absolute path.  The path_bytes length of the path includes a null
140 |  * terminating byte.  dir_ino and dir_pos refer to the position of the
141 |  * final component in its parent directory and can be advanced to search
142 |  * for the next terminal entry whose path is then built by walking up
143 |  * parent directories.
144 |  *
145 |  * ENOENT is returned when no paths are found.
146 |  *
147 |  * ENAMETOOLONG is returned when the result struct and path found
148 |  * doesn't fit in the result buffer.
149 |  *
150 |  * Many other errnos indicate hard failure to find the next path.
151 |  */
152 | struct scoutfs_ioctl_ino_path {
153 | 	__u64 ino;
154 | 	__u64 dir_ino;
155 | 	__u64 dir_pos;
156 | 	__u64 result_ptr;
157 | 	__u16 result_bytes;
158 | 	__u8 _pad[6];
159 | };
160 | 
161 | struct scoutfs_ioctl_ino_path_result {
162 | 	__u64 dir_ino;
163 | 	__u64 dir_pos;
164 | 	__u16 path_bytes;
165 | 	__u8  _pad[6];
166 | 	__u8  path[0];
167 | };
168 | 
169 | /* Get a single path from the root to the given inode number */
170 | #define SCOUTFS_IOC_INO_PATH _IOR(SCOUTFS_IOCTL_MAGIC, 2, \
171 | 				  struct scoutfs_ioctl_ino_path)
172 | 
173 | /*
174 |  * "Release" a contiguous range of logical blocks of file data.
175 |  * Released blocks are removed from the file system like truncation, but
176 |  * an offline record is left behind to trigger demand staging if the
177 |  * file is read.
178 |  *
179 |  * The starting block offset and number of blocks to release are in
180 |  * units 4KB blocks.
181 |  *
182 |  * The specified range can extend past i_size and can straddle sparse
183 |  * regions or blocks that are already offline.  The only change it makes
184 |  * is to free and mark offline any existing blocks that intersect with
185 |  * the region.
186 |  *
187 |  * Returns 0 if the operation succeeds.  If an error is returned then
188 |  * some partial region of the blocks in the region may have been marked
189 |  * offline.
190 |  *
191 |  * If the operation succeeds then inode metadata that reflects file data
192 |  * contents are not updated.  This is intended to be transparent to the
193 |  * presentation of the data in the file.
194 |  */
195 | struct scoutfs_ioctl_release {
196 | 	__u64 block;
197 | 	__u64 count;
198 | 	__u64 data_version;
199 | };
200 | 
201 | #define SCOUTFS_IOC_RELEASE _IOW(SCOUTFS_IOCTL_MAGIC, 3, \
202 | 				 struct scoutfs_ioctl_release)
203 | 
204 | struct scoutfs_ioctl_stage {
205 | 	__u64 data_version;
206 | 	__u64 buf_ptr;
207 | 	__u64 offset;
208 | 	__s32 count;
209 | 	__u32 _pad;
210 | };
211 | 
212 | #define SCOUTFS_IOC_STAGE _IOW(SCOUTFS_IOCTL_MAGIC, 4, \
213 | 			       struct scoutfs_ioctl_stage)
214 | 
215 | /*
216 |  * Give the user inode fields that are not otherwise visible.  statx()
217 |  * isn't always available and xattrs are relatively expensive.
218 |  *
219 |  * @valid_bytes stores the number of bytes that are valid in the
220 |  * structure.  The caller sets this to the size of the struct that they
221 |  * understand.  The kernel then fills and copies back the min of the
222 |  * size they and the user caller understand.  The user can tell if a
223 |  * field is set if all of its bytes are within the valid_bytes that the
224 |  * kernel set on return.
225 |  *
226 |  * New fields are only added to the end of the struct.
227 |  */
228 | struct scoutfs_ioctl_stat_more {
229 | 	__u64 valid_bytes;
230 | 	__u64 meta_seq;
231 | 	__u64 data_seq;
232 | 	__u64 data_version;
233 | 	__u64 online_blocks;
234 | 	__u64 offline_blocks;
235 | };
236 | 
237 | #define SCOUTFS_IOC_STAT_MORE _IOR(SCOUTFS_IOCTL_MAGIC, 5, \
238 | 				   struct scoutfs_ioctl_stat_more)
239 | 
240 | 
241 | struct scoutfs_ioctl_data_waiting_entry {
242 | 	__u64 ino;
243 | 	__u64 iblock;
244 | 	__u8 op;
245 | 	__u8 _pad[7];
246 | };
247 | 
248 | #define SCOUTFS_IOC_DWO_READ		(1 << 0)
249 | #define SCOUTFS_IOC_DWO_WRITE		(1 << 1)
250 | #define SCOUTFS_IOC_DWO_CHANGE_SIZE	(1 << 2)
251 | #define SCOUTFS_IOC_DWO_UNKNOWN		(U8_MAX << 3)
252 | 
253 | struct scoutfs_ioctl_data_waiting {
254 | 	__u64 flags;
255 | 	__u64 after_ino;
256 | 	__u64 after_iblock;
257 | 	__u64 ents_ptr;
258 | 	__u16 ents_nr;
259 | 	__u8 _pad[6];
260 | };
261 | 
262 | #define SCOUTFS_IOC_DATA_WAITING_FLAGS_UNKNOWN		(U8_MAX << 0)
263 | 
264 | #define SCOUTFS_IOC_DATA_WAITING _IOR(SCOUTFS_IOCTL_MAGIC, 6, \
265 | 				      struct scoutfs_ioctl_data_waiting)
266 | 
267 | /*
268 |  * If i_size is set then data_version must be non-zero.  If the offline
269 |  * flag is set then i_size must be set and a offline extent will be
270 |  * created from offset 0 to i_size.
271 |  */
272 | struct scoutfs_ioctl_setattr_more {
273 | 	__u64 data_version;
274 | 	__u64 i_size;
275 | 	__u64 flags;
276 | 	__u64 ctime_sec;
277 | 	__u32 ctime_nsec;
278 | 	__u8 _pad[4];
279 | };
280 | 
281 | #define SCOUTFS_IOC_SETATTR_MORE_OFFLINE		(1 << 0)
282 | #define SCOUTFS_IOC_SETATTR_MORE_UNKNOWN		(U8_MAX << 1)
283 | 
284 | #define SCOUTFS_IOC_SETATTR_MORE _IOW(SCOUTFS_IOCTL_MAGIC, 7, \
285 | 				      struct scoutfs_ioctl_setattr_more)
286 | 
287 | struct scoutfs_ioctl_listxattr_hidden {
288 | 	__u64 id_pos;
289 | 	__u64 buf_ptr;
290 | 	__u32 buf_bytes;
291 | 	__u32 hash_pos;
292 | };
293 | 
294 | #define SCOUTFS_IOC_LISTXATTR_HIDDEN _IOR(SCOUTFS_IOCTL_MAGIC, 8, \
295 | 					  struct scoutfs_ioctl_listxattr_hidden)
296 | 
297 | /*
298 |  * Return the inode numbers of inodes which might contain the given
299 |  * xattr.  The inode may not have a set xattr with that name, the caller
300 |  * must check the returned inodes to see if they match.
301 |  *
302 |  * @next_ino: The next inode number that could be returned.  Initialized
303 |  * to 0 when first searching and set to one past the last inode number
304 |  * returned to continue searching.
305 |  * @last_ino: The last inode number that could be returned.  U64_MAX to
306 |  * find all inodes.
307 |  * @name_ptr: The address of the name of the xattr to search for.  It is
308 |  * not null terminated.
309 |  * @inodes_ptr: The address of the array of uint64_t inode numbers in
310 |  * which to store inode numbers that may contain the xattr.  EFAULT may
311 |  * be returned if this address is not naturally aligned.
312 |  * @output_flags: Set as success is returned.  If an error is returned
313 |  * then this field is undefined and should not be read.
314 |  * @nr_inodes: The number of elements in the array found at inodes_ptr.
315 |  * @name_bytes: The number of non-null bytes found in the name at
316 |  * name_ptr.
317 |  *
318 |  * This requires the CAP_SYS_ADMIN capability and will return -EPERM if
319 |  * it's not granted.
320 |  *
321 |  * The number of inode numbers stored in the inodes_ptr array is
322 |  * returned.  If nr_inodes is 0 or last_ino is less than next_ino then 0
323 |  * will be immediately returned.
324 |  *
325 |  * Partial progress can be returned if an error is hit or if nr_inodes
326 |  * was larger than the internal limit on the number of inodes returned
327 |  * in a search pass.  The _END output flag is set if all the results
328 |  * including last_ino were searched in this pass.
329 |  *
330 |  * It's valuable to provide a large inodes array so that all the results
331 |  * can be found in one search pass and _END can be set.  There are
332 |  * significant constant costs for performing each search pass.
333 |  */
334 | struct scoutfs_ioctl_search_xattrs {
335 | 	__u64 next_ino;
336 | 	__u64 last_ino;
337 | 	__u64 name_ptr;
338 | 	__u64 inodes_ptr;
339 | 	__u64 output_flags;
340 | 	__u64 nr_inodes;
341 | 	__u16 name_bytes;
342 | 	__u8 _pad[6];
343 | };
344 | 
345 | /* set in output_flags if returned inodes reached last_ino */
346 | #define SCOUTFS_SEARCH_XATTRS_OFLAG_END (1ULL << 0)
347 | 
348 | #define SCOUTFS_IOC_SEARCH_XATTRS _IOR(SCOUTFS_IOCTL_MAGIC, 9, \
349 | 				     struct scoutfs_ioctl_search_xattrs)
350 | 
351 | /*
352 |  * Give the user information about the filesystem.
353 |  *
354 |  * @valid_bytes stores the number of bytes that are valid in the
355 |  * structure.  The caller sets this to the size of the struct that they
356 |  * understand.  The kernel then fills and copies back the min of the
357 |  * size they and the user caller understand.  The user can tell if a
358 |  * field is set if all of its bytes are within the valid_bytes that the
359 |  * kernel set on return.
360 |  *
361 |  * @committed_seq: All seqs up to and including this seq have been
362 |  * committed.  Can be compared with meta_seq and data_seq from inodes in
363 |  * stat_more to discover if changes have been committed to disk.
364 |  *
365 |  * New fields are only added to the end of the struct.
366 |  */
367 | struct scoutfs_ioctl_statfs_more {
368 | 	__u64 valid_bytes;
369 | 	__u64 fsid;
370 | 	__u64 rid;
371 | 	__u64 committed_seq;
372 | 	__u64 total_meta_blocks;
373 | 	__u64 total_data_blocks;
374 | };
375 | 
376 | #define SCOUTFS_IOC_STATFS_MORE _IOR(SCOUTFS_IOCTL_MAGIC, 10, \
377 | 				     struct scoutfs_ioctl_statfs_more)
378 | 
379 | /*
380 |  * Cause matching waiters to return an error.
381 |  *
382 |  * Find current waiters that match the inode, op, and block range to wake
383 |  * up and return an error.
384 |  */
385 | struct scoutfs_ioctl_data_wait_err {
386 | 	__u64 ino;
387 | 	__u64 data_version;
388 | 	__u64 offset;
389 | 	__u64 count;
390 | 	__u64 op;
391 | 	__s64 err;
392 | };
393 | 
394 | #define SCOUTFS_IOC_DATA_WAIT_ERR _IOR(SCOUTFS_IOCTL_MAGIC, 11, \
395 | 				       struct scoutfs_ioctl_data_wait_err)
396 | 
397 | 
398 | #define SCOUTFS_IOC_ALLOC_DETAIL _IOR(SCOUTFS_IOCTL_MAGIC, 12, \
399 | 				     struct scoutfs_ioctl_alloc_detail)
400 | 
401 | struct scoutfs_ioctl_alloc_detail {
402 | 	__u64 entries_ptr;
403 | 	__u64 entries_nr;
404 | };
405 | 
406 | struct scoutfs_ioctl_alloc_detail_entry {
407 | 	__u64 id;
408 | 	__u64 blocks;
409 | 	__u8 type;
410 | 	__u8 meta:1,
411 | 	     avail:1;
412 | 	__u8 __bit_pad:6;
413 | 	__u8 __pad[6];
414 | };
415 | 
416 | #endif
417 | 


--------------------------------------------------------------------------------
/src/key.h:
--------------------------------------------------------------------------------
  1 | #ifndef _SCOUTFS_KEY_H_
  2 | #define _SCOUTFS_KEY_H_
  3 | 
  4 | #include "sparse.h"
  5 | #include "util.h"
  6 | #include "format.h"
  7 | #include "cmp.h"
  8 | #include "endian_swap.h"
  9 | 
 10 | #define SK_FMT		"%u.%llu.%u.%llu.%llu.%u"
 11 | /* This does not support null keys */
 12 | #define SK_ARG(key)	(key)->sk_zone, le64_to_cpu((key)->_sk_first),	\
 13 | 			(key)->sk_type,	le64_to_cpu((key)->_sk_second),	\
 14 | 			le64_to_cpu((key)->_sk_third),			\
 15 | 			(key)->_sk_fourth
 16 | 
 17 | /*
 18 |  * copy fields between keys with the same fields but different types.
 19 |  * The destination type might have internal padding so we zero it.
 20 |  */
 21 | #define scoutfs_key_copy_types(a, b)		\
 22 | do {						\
 23 | 	__typeof__(a) _to = (a);		\
 24 | 	__typeof__(b) _from = (b);		\
 25 | 						\
 26 | 	memset(_to, 0, sizeof(*_to));		\
 27 | 	_to->sk_zone = _from->sk_zone;		\
 28 | 	_to->_sk_first = _from->_sk_first;	\
 29 | 	_to->sk_type = _from->sk_type;		\
 30 | 	_to->_sk_second = _from->_sk_second;	\
 31 | 	_to->_sk_third = _from->_sk_third;	\
 32 | 	_to->_sk_fourth = _from->_sk_fourth;	\
 33 | } while (0)
 34 | 
 35 | /* Reset all six key fields to 0. */
 36 | static inline void scoutfs_key_set_zeros(struct scoutfs_key *key)
 37 | {
 38 | 	/* the single-byte fields */
 39 | 	key->sk_zone = key->sk_type = key->_sk_fourth = 0;
 40 | 
 41 | 	/* the little-endian 64-bit fields; 0 needs no byte swap */
 42 | 	key->_sk_first = key->_sk_second = key->_sk_third = 0;
 43 | }
 44 | 
 45 | static inline void scoutfs_key_copy_or_zeros(struct scoutfs_key *dst,
 46 | 					     struct scoutfs_key *src)
 47 | {
 48 | 	if (!src)
 49 | 		scoutfs_key_set_zeros(dst);	/* no source key: fall back to zeros */
 50 | 	else
 51 | 		*dst = *src;			/* whole-struct copy */
 52 | }
 53 | 
 54 | /* Set all six key fields to their maximum values. */
 55 | static inline void scoutfs_key_set_ones(struct scoutfs_key *key)
 56 | {
 57 | 	const __le64 max64 = cpu_to_le64(U64_MAX);
 58 | 
 59 | 	/* single-byte fields first, then the little-endian 64-bit fields */
 60 | 	key->sk_zone = key->sk_type = key->_sk_fourth = U8_MAX;
 61 | 	key->_sk_first = key->_sk_second = key->_sk_third = max64;
 62 | }
 63 | 
 64 | /*
 65 |  * Return a -1/0/1 comparison of keys.  Fields are compared from most to
 66 |  * least significant: zone, first, type, second, third, then fourth.
 67 |  * These ternary chains turned out to be consistently cheaper than other
 68 |  * alternatives across keys that first differ in any of the values.  Say
 69 |  * maybe 20% faster than memcmp.
 70 |  */
 71 | static inline int scoutfs_key_compare(struct scoutfs_key *a,
 72 | 				      struct scoutfs_key *b)
 73 | {
 74 | 	return scoutfs_cmp(a->sk_zone, b->sk_zone) ?:
 75 | 	  scoutfs_cmp(le64_to_cpu(a->_sk_first), le64_to_cpu(b->_sk_first)) ?:
 76 | 	  scoutfs_cmp(a->sk_type, b->sk_type) ?:
 77 | 	  scoutfs_cmp(le64_to_cpu(a->_sk_second), le64_to_cpu(b->_sk_second)) ?:
 78 | 	  scoutfs_cmp(le64_to_cpu(a->_sk_third), le64_to_cpu(b->_sk_third)) ?:
 79 | 	  scoutfs_cmp(a->_sk_fourth, b->_sk_fourth);
 80 | }
 81 | 
 82 | /*
 83 |  * Compare key ranges, treating any overlap as equality.  Returns -1 when
 84 |  * a ends before b starts, 1 when a starts after b ends, and otherwise 0.
 85 |  */
 86 | static inline int scoutfs_key_compare_ranges(struct scoutfs_key *a_start,
 87 | 					     struct scoutfs_key *a_end,
 88 | 					     struct scoutfs_key *b_start,
 89 | 					     struct scoutfs_key *b_end)
 90 | {
 91 | 	if (scoutfs_key_compare(a_end, b_start) < 0)
 92 | 		return -1;
 93 | 	if (scoutfs_key_compare(a_start, b_end) > 0)
 94 | 		return 1;
 95 | 	return 0;
 96 | }
 97 | 
 98 | static inline void scoutfs_key_inc(struct scoutfs_key *key)
 99 | {
100 | 	if (++key->_sk_fourth != 0)
101 | 		return;
102 | 	/* treated as one big counter: _sk_fourth wrapped, carry into _sk_third */
103 | 	le64_add_cpu(&key->_sk_third, 1);
104 | 	if (key->_sk_third != 0)
105 | 		return;
106 | 	/* _sk_third wrapped to 0: carry into _sk_second */
107 | 	le64_add_cpu(&key->_sk_second, 1);
108 | 	if (key->_sk_second != 0)
109 | 		return;
110 | 	/* _sk_second wrapped to 0: carry into sk_type */
111 | 	if (++key->sk_type != 0)
112 | 		return;
113 | 	/* sk_type wrapped to 0: carry into _sk_first */
114 | 	le64_add_cpu(&key->_sk_first, 1);
115 | 	if (key->_sk_first != 0)
116 | 		return;
117 | 	/* every lower field wrapped: carry into sk_zone */
118 | 	key->sk_zone++;
119 | }
120 | 
121 | static inline void scoutfs_key_dec(struct scoutfs_key *key)
122 | {
123 | 	if (--key->_sk_fourth != U8_MAX)
124 | 		return;
125 | 	/* treated as one big counter: _sk_fourth wrapped, borrow from _sk_third */
126 | 	le64_add_cpu(&key->_sk_third, -1);
127 | 	if (key->_sk_third != cpu_to_le64(U64_MAX))
128 | 		return;
129 | 	/* _sk_third wrapped to U64_MAX: borrow from _sk_second */
130 | 	le64_add_cpu(&key->_sk_second, -1);
131 | 	if (key->_sk_second != cpu_to_le64(U64_MAX))
132 | 		return;
133 | 	/* _sk_second wrapped to U64_MAX: borrow from sk_type */
134 | 	if (--key->sk_type != U8_MAX)
135 | 		return;
136 | 	/* sk_type wrapped to U8_MAX: borrow from _sk_first */
137 | 	le64_add_cpu(&key->_sk_first, -1);
138 | 	if (key->_sk_first != cpu_to_le64(U64_MAX))
139 | 		return;
140 | 	/* every lower field wrapped: borrow from sk_zone */
141 | 	key->sk_zone--;
142 | }
143 | 
144 | #endif
145 | 


--------------------------------------------------------------------------------
/src/leaf_item_hash.c:
--------------------------------------------------------------------------------
 1 | #include "sparse.h"
 2 | #include "util.h"
 3 | #include "format.h"
 4 | #include "hash.h"
 5 | #include "leaf_item_hash.h"
 6 | 
 7 | /*
 8 |  * A minimal extraction of the leaf item hash from the kernel's btree.
 9 |  */
10 | 
11 | int leaf_item_hash_ind(struct scoutfs_key *key)
12 | {
13 | 	u32 hash = scoutfs_hash32(key, sizeof(*key));	/* hashes the raw key bytes */
14 | 	return hash % SCOUTFS_BTREE_LEAF_ITEM_HASH_NR;	/* fold into a bucket index */
15 | }
16 | 
17 | __le16 *leaf_item_hash_buckets(struct scoutfs_btree_block *bt)
18 | {
19 | 	void *end = (void *)bt + SCOUTFS_BLOCK_LG_SIZE;	/* LG_SIZE bytes past bt */
20 | 	return end - SCOUTFS_BTREE_LEAF_ITEM_HASH_BYTES; /* buckets start here */
21 | }
22 | 
23 | void leaf_item_hash_insert(struct scoutfs_btree_block *bt,
24 | 			   struct scoutfs_key *key, __le16 off)
25 | {
26 | 	__le16 *buckets = leaf_item_hash_buckets(bt);
27 | 	int ind;
28 | 
29 | 	if (bt->level > 0)	/* blocks above level 0 get no hash entries */
30 | 		return;
31 | 
32 | 	for (ind = leaf_item_hash_ind(key);
33 | 	     ind < SCOUTFS_BTREE_LEAF_ITEM_HASH_NR; ind++) {
34 | 		if (buckets[ind] != 0)	/* taken: try the next, never wrapping */
35 | 			continue;
36 | 		buckets[ind] = off;	/* first empty bucket takes the offset */
37 | 		break;
38 | 	}
39 | }
40 | 


--------------------------------------------------------------------------------
/src/leaf_item_hash.h:
--------------------------------------------------------------------------------
 1 | #ifndef _LEAF_ITEM_HASH_H_
 2 | #define _LEAF_ITEM_HASH_H_
 3 | 
 4 | int leaf_item_hash_ind(struct scoutfs_key *key);
 5 | __le16 *leaf_item_hash_buckets(struct scoutfs_btree_block *bt);
 6 | void leaf_item_hash_insert(struct scoutfs_btree_block *bt,
 7 | 			   struct scoutfs_key *key, __le16 off);
 8 | 
 9 | #endif
10 | 


--------------------------------------------------------------------------------
/src/list.h:
--------------------------------------------------------------------------------
  1 | /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
  2 |  * vim:expandtab:shiftwidth=8:tabstop=8:
  3 |  *
  4 |  *      This program is free software; you can redistribute it and/or modify
  5 |  *      it under the terms of the GNU General Public License version 2 as
  6 |  *      published by the Free Software Foundation.
  7 |  *
  8 |  *      This program is distributed in the hope that it will be useful,
  9 |  *      but WITHOUT ANY WARRANTY; without even the implied warranty of
 10 |  *      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 11 |  *      GNU General Public License for more details.
 12 |  *
 13 |  *      You should have received a copy of the GNU General Public License
 14 |  *      along with this program; if not, write to the Free Software
 15 |  *      Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 16 |  *
 17 |  *    yanked from the linux kernel..
 18 |  */
 19 | #ifndef _LIST_H_
 20 | #define _LIST_H_
 21 | /*
 22 |  * Simple doubly linked list implementation.
 23 |  *
 24 |  * Some of the internal functions ("__xxx") are useful when
 25 |  * manipulating whole lists rather than single entries, as
 26 |  * sometimes we already know the next/prev entries and we can
 27 |  * generate better code by using them directly rather than
 28 |  * using the generic single-entry routines.
 29 |  */
 30 | 
 31 | struct list_head {
 32 |         struct list_head *next, *prev;
 33 | };
 34 | 
 35 | #define LIST_HEAD_INIT(name) { &(name), &(name) }
 36 | 
 37 | #define LIST_HEAD(name) \
 38 |         struct list_head name = LIST_HEAD_INIT(name)
 39 | 
 40 | #define INIT_LIST_HEAD(ptr) do { \
 41 |         (ptr)->next = (ptr); (ptr)->prev = (ptr); \
 42 | } while (0)
 43 | 
 44 | /*
 45 |  * Insert a new entry between two known consecutive entries. 
 46 |  *
 47 |  * This is only for internal list manipulation where we know
 48 |  * the prev/next entries already!
 49 |  */
 50 | static inline void __list_add(struct list_head *new, struct list_head *prev,
 51 |                               struct list_head *next)
 52 | {
 53 |         /* aim the new entry at both neighbours, then splice it in */
 54 |         new->prev = prev;
 55 |         new->next = next;
 56 |         prev->next = new;
 57 |         next->prev = new;
 58 | }
 59 | 
 60 | /**
 61 |  * list_add - add a new entry
 62 |  * @new: new entry to be added
 63 |  * @head: list head to add it after
 64 |  * Insert a new entry right after the specified head; good for stacks.
 65 |  */
 66 | static inline void list_add(struct list_head *new, struct list_head *head)
 67 | {
 68 |         struct list_head *first = head->next;
 69 | 
 70 |         __list_add(new, head, first);
 71 | }
 72 | 
 73 | /**
 74 |  * list_add_tail - add a new entry
 75 |  * @new: new entry to be added
 76 |  * @head: list head to add it before
 77 |  * Insert a new entry right before the specified head; good for queues.
 78 |  */
 79 | static inline void list_add_tail(struct list_head *new, struct list_head *head)
 80 | {
 81 |         struct list_head *last = head->prev;
 82 | 
 83 |         __list_add(new, last, head);
 84 | }
 85 | 
 86 | /*
 87 |  * Insert a new entry between two known consecutive entries, filling in
 88 |  * the new entry's own pointers before its neighbours are aimed at it.
 89 |  * Despite the name this copy contains no barrier or RCU primitive.  Only
 90 |  * for internal list manipulation where prev/next are already known!
 91 |  */
 92 | static __inline__ void __list_add_rcu(struct list_head * new,
 93 |         struct list_head * prev,
 94 |         struct list_head * next)
 95 | {
 96 |         new->next = next;
 97 |         new->prev = prev;
 98 |         next->prev = new;
 99 |         prev->next = new;
100 | }
101 | 
102 | /*
103 |  * Unlink whatever sits between prev and next by pointing those two
104 |  * entries at each other.
105 |  *
106 |  * This is only for internal list manipulation where we know
107 |  * the prev/next entries already!
108 |  */
109 | static inline void __list_del(struct list_head * prev, struct list_head * next)
110 | {
111 |         prev->next = next;
112 |         next->prev = prev;
113 | }
114 | 
115 | /**
116 |  * list_del - unlink an entry from the list it is on
117 |  * @entry: element to remove; its own pointers stay stale (list_empty() is false)
118 |  */
119 | static inline void list_del(struct list_head *entry)
120 | {
121 |         struct list_head *before = entry->prev, *after = entry->next;
122 | 
123 |         __list_del(before, after);
124 | }
125 | 
126 | /**
127 |  * list_del_init - unlink an entry and reinitialise it
128 |  * @entry: the element to remove; reset with INIT_LIST_HEAD() afterwards
129 |  */
130 | static inline void list_del_init(struct list_head *entry)
131 | {
132 |         list_del(entry);
133 |         INIT_LIST_HEAD(entry);
134 | }
135 | 
136 | /**
137 |  * list_move - unlink an entry and re-add it at the front of another list
138 |  * @list: the entry to move
139 |  * @head: the list head that will directly precede the entry
140 |  */
141 | static inline void list_move(struct list_head *list, struct list_head *head)
142 | {
143 |         list_del(list);
144 |         list_add(list, head);
145 | }
146 | 
147 | /**
148 |  * list_move_tail - unlink an entry and re-add it at the back of another list
149 |  * @list: the entry to move
150 |  * @head: the list head that will directly follow the entry
151 |  */
152 | static inline void
153 | list_move_tail(struct list_head *list, struct list_head *head)
154 | {
155 |         list_del(list);
156 |         list_add_tail(list, head);
157 | }
158 | 
159 | /**
160 |  * list_empty - non-zero when @head's next pointer refers back to @head
161 |  * @head: the list to test; only read, so const-qualified heads are accepted
162 |  */
163 | static inline int list_empty(const struct list_head *head)
164 | {
165 |         return head->next == head;
166 | }
167 | 
168 | /* Link every entry of non-empty @list in right after @head; @list itself is left stale. */
169 | static inline void __list_splice(struct list_head *list,
170 |                                  struct list_head *head)
171 | {
172 |         struct list_head *src_first = list->next;
173 |         struct list_head *src_last = list->prev;
174 |         struct list_head *dst_next = head->next;
175 | 
176 |         src_first->prev = head;
177 |         head->next = src_first;
178 |         src_last->next = dst_next;
179 |         dst_next->prev = src_last;
180 | }
181 | 
182 | /*
183 |  * list_splice - join two lists: link every entry of @list in directly after @head.
184 |  * An empty @list is left alone; a spliced @list head is not reinitialised.
185 |  */
186 | static inline void list_splice(struct list_head *list, struct list_head *head)
187 | {
188 |         if (list_empty(list))
189 |                 return;
190 |         __list_splice(list, head);
191 | }
192 | 
193 | /**
194 |  * list_splice_init - join two lists and reinitialise the emptied list.
195 |  * @list: the list whose entries are moved; reset with INIT_LIST_HEAD()
196 |  * @head: the entries are linked in directly after this head
197 |  *
198 |  * Nothing is touched when @list is already empty.
199 |  */
200 | static inline void list_splice_init(struct list_head *list,
201 |                                     struct list_head *head)
202 | {
203 |         if (list_empty(list))
204 |                 return;
205 |         __list_splice(list, head);
206 |         INIT_LIST_HEAD(list);
207 | }
208 | 
209 | /**
210 |  * list_entry - get the containing struct by subtracting @member's offset
211 |  * @ptr:        the &struct list_head pointer.
212 |  * @type:       the type of the struct this is embedded in.
213 |  * @member:     the name of the list_head field within the struct.
214 |  */
215 | #define list_entry(ptr, type, member) \
216 |         ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
217 | 
218 | 
219 | /**
220 |  * list_for_each - walk a list forwards; do not unlink @pos inside the body
221 |  * @pos:        the &struct list_head to use as a loop cursor.
222 |  * @head:       the head for your list (expanded more than once).
223 |  */
224 | #define list_for_each(pos, head) \
225 |         for (pos = (head)->next; pos != (head); pos = pos->next)
226 | 
227 | /**
228 |  * list_for_each_prev - walk a list backwards; do not unlink @pos inside the body
229 |  * @pos:        the &struct list_head to use as a loop cursor.
230 |  * @head:       the head for your list (expanded more than once).
231 |  */
232 | #define list_for_each_prev(pos, head) \
233 |         for (pos = (head)->prev; pos != (head); pos = pos->prev)
234 |                 
235 | /**
236 |  * list_for_each_safe - iterate over a list safe against removal of list entry
237 |  * @pos:        the &struct list_head to use as a loop cursor.
238 |  * @n:          another &struct list_head; holds the successor fetched before the body runs.
239 |  * @head:       the head for your list.
240 |  */
241 | #define list_for_each_safe(pos, n, head) \
242 |         for (pos = (head)->next, n = pos->next; pos != (head); \
243 |                 pos = n, n = pos->next)
244 | 
245 | /**
246 |  * list_for_each_entry - iterate over the structs that embed a list's entries
247 |  * @pos:        the type * to use as a loop cursor; typeof(*pos) names the struct.
248 |  * @head:       the head for your list.
249 |  * @member:     the name of the list_head field within the struct.
250 |  */
251 | #define list_for_each_entry(pos, head, member)                                \
252 |         for (pos = list_entry((head)->next, typeof(*pos), member);        \
253 |              &pos->member != (head);                                         \
254 |              pos = list_entry(pos->member.next, typeof(*pos), member))
255 | 
256 | /**
257 |  * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
258 |  * @pos:	the type * to use as a loop cursor.
259 |  * @n:		another type *; holds the next struct, fetched before the body runs
260 |  * @head:	the head for your list.
261 |  * @member:	the name of the list_head field within the struct.
262 |  */
263 | #define list_for_each_entry_safe(pos, n, head, member)			\
264 | 	for (pos = list_entry((head)->next, typeof(*pos), member),	\
265 | 		n = list_entry(pos->member.next, typeof(*pos), member);	\
266 | 	     &pos->member != (head); 					\
267 | 	     pos = n, n = list_entry(n->member.next, typeof(*n), member))
268 | 
269 | /**
270 |  * list_first_entry - get the first element from a list
271 |  * @ptr:        the list head to take the element from.
272 |  * @type:       the type of the struct this is embedded in.
273 |  * @member:     the name of the list_head within the struct.
274 |  *
275 |  * Note: the list must not be empty; no check is made and NULL is never returned.
276 |  */
277 | #define list_first_entry(ptr, type, member) \
278 |         list_entry((ptr)->next, type, member)
279 | 
280 | /**
281 |  * list_last_entry - get the last element from a list
282 |  * @ptr:        the list head to take the element from.
283 |  * @type:       the type of the struct this is embedded in.
284 |  * @member:     the name of the list_head within the struct.
285 |  *
286 |  * Note: the list must not be empty; no check is made and NULL is never returned.
287 |  */
288 | #define list_last_entry(ptr, type, member) \
289 |         list_entry((ptr)->prev, type, member)
290 | 
291 | /**
292 |  * list_first_entry_or_null - get the first element from a list
293 |  * @ptr:        the list head to take the element from.
294 |  * @type:       the type of the struct this is embedded in.
295 |  * @member:     the name of the list_head within the struct.
296 |  *
297 |  * Returns NULL if the list is empty.  @ptr is expanded twice by the macro.
298 |  */
299 | #define list_first_entry_or_null(ptr, type, member) \
300 |         (!list_empty(ptr) ? list_first_entry(ptr, type, member) : NULL)
301 | 
302 | /**
303 |  * list_next_entry - get the next element in list (no check for reaching the head)
304 |  * @pos:        the type * to cursor
305 |  * @member:     the name of the list_head within the struct.
306 |  */
307 | #define list_next_entry(pos, member) \
308 |         list_entry((pos)->member.next, typeof(*(pos)), member)
309 | 
310 | #endif
311 | 


--------------------------------------------------------------------------------
/src/listxattr_hidden.c:
--------------------------------------------------------------------------------
  1 | #include <unistd.h>
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include <sys/types.h>
  5 | #include <sys/stat.h>
  6 | #include <sys/ioctl.h>
  7 | #include <fcntl.h>
  8 | #include <errno.h>
  9 | #include <string.h>
 10 | #include <getopt.h>
 11 | #include <ctype.h>
 12 | 
 13 | #include "sparse.h"
 14 | #include "util.h"
 15 | #include "format.h"
 16 | #include "ioctl.h"
 17 | #include "cmd.h"
 18 | 
 19 | static struct option long_ops[] = {	/* getopt_long() table; the NULL entry ends it */
 20 | 	{ .name = "file", .has_arg = required_argument, .flag = NULL, .val = 'f' },
 21 | 	{ .name = NULL, .has_arg = no_argument, .flag = NULL, .val = 0 },
 22 | };
 23 | 
 24 | /*
 25 |  * Print the names of the hidden xattrs on the -f file, one per line, calling
 26 |  * the listxattr_hidden ioctl until it returns 0.  Returns 0 or a negative errno.
 27 |  */
 28 | static int listxattr_hidden_cmd(int argc, char **argv)
 29 | {
 30 | 	struct scoutfs_ioctl_listxattr_hidden lxh;
 31 | 	char *path = NULL;
 32 | 	char *buf = NULL;
 33 | 	char *name;
 34 | 	int fd = -1;
 35 | 	int bytes;
 36 | 	int len;
 37 | 	int ret;
 38 | 	int c;
 39 | 	int i;
 40 | 
 41 | 	while ((c = getopt_long(argc, argv, "f:", long_ops, NULL)) != -1) {
 42 | 		switch (c) {
 43 | 		case 'f':
 44 | 			free(path); /* a repeated -f replaces the earlier path */
 45 | 			path = strdup(optarg);
 46 | 			if (!path) {
 47 | 				fprintf(stderr, "path mem alloc failed\n");
 48 | 				ret = -ENOMEM;
 49 | 				goto out;
 50 | 			}
 51 | 			break;
 52 | 		case '?':
 53 | 		default:
 54 | 			ret = -EINVAL;
 55 | 			goto out;
 56 | 		}
 57 | 	}
 58 | 
 59 | 	if (path == NULL) {
 60 | 		fprintf(stderr, "must specify -f path to file\n");
 61 | 		ret = -EINVAL;
 62 | 		goto out;
 63 | 	}
 64 | 
 65 | 	memset(&lxh, 0, sizeof(lxh));
 66 | 	lxh.id_pos = 0;
 67 | 	lxh.hash_pos = 0;
 68 | 	lxh.buf_bytes = 256 * 1024;
 69 | 
 70 | 	buf = malloc(lxh.buf_bytes);
 71 | 	if (!buf) {
 72 | 		fprintf(stderr, "xattr name buf alloc failed\n");
 73 | 		ret = -ENOMEM;
 74 | 		goto out;
 75 | 	}
 76 | 	lxh.buf_ptr = (unsigned long)buf;
 77 | 
 78 | 	fd = open(path, O_RDONLY);
 79 | 	if (fd < 0) {
 80 | 		ret = -errno;
 81 | 		fprintf(stderr, "failed to open '%s': %s (%d)\n",
 82 | 			path, strerror(errno), errno);
 83 | 		goto out;
 84 | 	}
 85 | 
 86 | 	for (;;) {
 87 | 		ret = ioctl(fd, SCOUTFS_IOC_LISTXATTR_HIDDEN, &lxh);
 88 | 		if (ret == 0)
 89 | 			break;
 90 | 		if (ret < 0) {
 91 | 			ret = -errno;
 92 | 			fprintf(stderr, "listxattr_hidden ioctl failed: "
 93 | 				"%s (%d)\n", strerror(errno), errno);
 94 | 			goto out;
 95 | 		}
 96 | 
 97 | 		bytes = ret; /* a positive return is treated as the bytes filled in buf */
 98 | 		if (bytes > lxh.buf_bytes) {
 99 | 			fprintf(stderr, "listxattr_hidden overflowed\n");
100 | 			ret = -EFAULT;
101 | 			goto out;
102 | 		}
103 | 		if (buf[bytes - 1] != '\0') {
104 | 			fprintf(stderr, "listxattr_hidden didn't term\n");
105 | 			ret = -EINVAL;
106 | 			goto out;
107 | 		}
108 | 
109 | 		name = buf; /* walk the packed, null-terminated names */
110 | 		do {
111 | 			len = strlen(name);
112 | 			if (len == 0) {
113 | 				fprintf(stderr, "listxattr_hidden empty name\n");
114 | 				ret = -EINVAL;
115 | 				goto out;
116 | 			}
117 | 			if (len > SCOUTFS_XATTR_MAX_NAME_LEN) {
118 | 				fprintf(stderr, "listxattr_hidden long name\n");
119 | 				ret = -EINVAL;
120 | 				goto out;
121 | 			}
122 | 
123 | 			for (i = 0; i < len; i++) {
124 | 				if (!isprint((unsigned char)name[i])) /* negative char is undefined for isprint() */
125 | 					name[i] = '?';
126 | 			}
127 | 			printf("%s\n", name);
128 | 			name += len + 1;
129 | 			bytes -= len + 1;
130 | 		} while (bytes > 0);
131 | 	}
132 | 
133 | 	ret = 0;
134 | out:
135 | 	if (fd >= 0)
136 | 		close(fd);
137 | 	free(buf);
138 | 	free(path);
139 | 	return ret;
140 | }
141 | 
142 | static void __attribute__((constructor)) listxattr_hidden_ctor(void)
143 | {	/* constructor attribute: runs at load time to register the subcommand */
144 | 	cmd_register("listxattr-hidden", "-f <path>",	/* name, argument usage */
145 | 		     "print the names of hidden xattrs on the file",
146 | 		     listxattr_hidden_cmd);
147 | }
148 | 


--------------------------------------------------------------------------------
/src/main.c:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h>
 2 | #include <stdio.h>
 3 | #include <string.h>
 4 | #include <stdbool.h>
 5 | #include <string.h>
 6 | #include <assert.h>
 7 | 
 8 | #include "cmd.h"
 9 | #include "util.h"
10 | 
11 | /* Program entry point: hand the whole command line to cmd_execute(). */
12 | int main(int argc, char **argv)
13 | {
14 | 	/* XXX parse global options, env, configs, etc. */
15 | 	int status = cmd_execute(argc, argv);
16 | 
17 | 	return status;
18 | }
19 | 


--------------------------------------------------------------------------------
/src/mkfs.c:
--------------------------------------------------------------------------------
  1 | #include <unistd.h>
  2 | #include <stdbool.h>
  3 | #include <stdlib.h>
  4 | #include <stdio.h>
  5 | #include <string.h>
  6 | #include <errno.h>
  7 | #include <sys/time.h>
  8 | #include <uuid/uuid.h>
  9 | #include <fcntl.h>
 10 | #include <sys/types.h>
 11 | #include <sys/stat.h>
 12 | #include <unistd.h>
 13 | #include <assert.h>
 14 | #include <getopt.h>
 15 | #include <sys/socket.h>
 16 | #include <netinet/in.h>
 17 | #include <arpa/inet.h>
 18 | #include <ctype.h>
 19 | #include <inttypes.h>
 20 | 
 21 | #include "sparse.h"
 22 | #include "cmd.h"
 23 | #include "util.h"
 24 | #include "format.h"
 25 | #include "parse.h"
 26 | #include "crc.h"
 27 | #include "rand.h"
 28 | #include "dev.h"
 29 | #include "key.h"
 30 | #include "bitops.h"
 31 | #include "btree.h"
 32 | #include "leaf_item_hash.h"
 33 | 
 34 | /* Write one (1 << shift)-byte block at blkno; returns 0 or -errno (-EIO if short). */
 35 | static int write_raw_block(int fd, u64 blkno, int shift, void *blk)
 36 | {
 37 | 	size_t size = 1ULL << shift;
 38 | 	ssize_t ret = pwrite(fd, blk, size, blkno << shift);
 39 | 	int err = ret < 0 ? errno : EIO; /* saved now: a short write leaves errno unset */
 40 | 
 41 | 	if (ret != size) {
 42 | 		fprintf(stderr, "write to blkno %llu returned %zd: %s (%d)\n",
 43 | 			blkno, ret, strerror(err), err);
 44 | 		return -err;
 45 | 	}
 46 | 	return 0;
 47 | }
 48 | 
 49 | /*
 50 |  * Stamp @hdr with @blkno and a fresh crc over the (1 << shift)-byte block,
 51 |  * first copying @super's header over it when @super is given, then write it.
 52 |  */
 53 | static int write_block(int fd, u64 blkno, int shift,
 54 | 		       struct scoutfs_super_block *super,
 55 | 		       struct scoutfs_block_header *hdr)
 56 | {
 57 | 	if (super != NULL)
 58 | 		*hdr = super->hdr;
 59 | 
 60 | 	hdr->blkno = cpu_to_le64(blkno);
 61 | 	/* the crc is computed only after blkno has been set */
 62 | 	hdr->crc = cpu_to_le32(crc_block(hdr, (size_t)(1ULL << shift)));
 63 | 	return write_raw_block(fd, blkno, shift, hdr);
 64 | }
 65 | 
 66 | /*
 67 |  * Write the single btree block holding the blkno- and len-indexed items that
 68 |  * describe the free extent [start, start + len), and point @root at it.
 69 |  */
 70 | static int write_alloc_root(struct scoutfs_super_block *super, int fd,
 71 | 			    struct scoutfs_alloc_root *root,
 72 | 			    struct scoutfs_btree_block *bt,
 73 | 			    u64 blkno, u64 start, u64 len)
 74 | {
 75 | 	struct scoutfs_key key;
 76 | 
 77 | 	btree_init_root_single(&root->root, bt, blkno, 1, super->hdr.fsid);
 78 | 	root->total_len = cpu_to_le64(len);	/* the one extent is the whole total */
 79 | 
 80 | 	memset(&key, 0, sizeof(key));
 81 | 	key.sk_zone = SCOUTFS_FREE_EXTENT_ZONE;
 82 | 	key.sk_type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE;
 83 | 	key.skii_ino = cpu_to_le64(SCOUTFS_ROOT_INO);	/* NOTE(review): inode-index field on an extent key - confirm intent */
 84 | 	key.skfb_end = cpu_to_le64(start + len - 1);	/* last blkno in the extent */
 85 | 	key.skfb_len = cpu_to_le64(len);
 86 | 	btree_append_item(bt, &key, NULL, 0);	/* key only, no value */
 87 | 
 88 | 	memset(&key, 0, sizeof(key));
 89 | 	key.sk_zone = SCOUTFS_FREE_EXTENT_ZONE;
 90 | 	key.sk_type = SCOUTFS_FREE_EXTENT_LEN_TYPE;
 91 | 	key.skii_ino = cpu_to_le64(SCOUTFS_ROOT_INO);
 92 | 	key.skfl_neglen = cpu_to_le64(-len);	/* stored negated; presumably so longer extents sort first - confirm */
 93 | 	key.skfl_blkno = cpu_to_le64(start);
 94 | 	btree_append_item(bt, &key, NULL, 0);
 95 | 
 96 | 	bt->hdr.crc = cpu_to_le32(crc_block(&bt->hdr,	/* after both items are appended */
 97 | 					    SCOUTFS_BLOCK_LG_SIZE));
 98 | 
 99 | 	return write_raw_block(fd, blkno, SCOUTFS_BLOCK_LG_SHIFT, bt);	/* 0 or -errno */
100 | }
101 | 
102 | /*
103 |  * Make a new file system by writing, in order:
104 |  *  - the fs root btree block holding the root inode and its index item
105 |  *  - an alloc list block of metadata blknos for the first server transaction
106 |  *  - one btree block each for the data allocator and the meta allocators
107 |  *  - zeroed quorum blocks, then the super block
108 |  *
109 |  * Superblock is written to both metadata and data devices, everything else is
110 |  * written only to the metadata device.  Returns 0 or a negative errno.
111 |  */
112 | static int write_new_fs(char *meta_path, char *data_path,
113 | 			int meta_fd, int data_fd,
114 | 			u8 quorum_count,
115 | 			u64 max_meta_size, u64 max_data_size)
116 | {
117 | 	struct scoutfs_super_block *super;
118 | 	struct scoutfs_inode inode;
119 | 	struct scoutfs_alloc_list_block *lblk;
120 | 	struct scoutfs_btree_block *bt;
121 | 	struct scoutfs_key key;
122 | 	struct timeval tv;
123 | 	char uuid_str[37];
124 | 	void *zeros;
125 | 	u64 blkno;
126 | 	u64 meta_size;
127 | 	u64 data_size;
128 | 	u64 next_meta;
129 | 	u64 last_meta;
130 | 	u64 first_data;
131 | 	u64 last_data;
132 | 	u64 meta_start;
133 | 	u64 meta_len;
134 | 	int ret;
135 | 	int i;
136 | 
137 | 	gettimeofday(&tv, NULL);
138 | 
139 | 	super = calloc(1, SCOUTFS_BLOCK_SM_SIZE);
140 | 	bt = calloc(1, SCOUTFS_BLOCK_LG_SIZE);
141 | 	zeros = calloc(1, SCOUTFS_BLOCK_SM_SIZE);
142 | 	if (!super || !bt || !zeros) {
143 | 		ret = -errno;
144 | 		fprintf(stderr, "failed to allocate block mem: %s (%d)\n",
145 | 			strerror(errno), errno);
146 | 		goto out;
147 | 	}
148 | 
149 | 	ret = device_size(meta_path, meta_fd, 2ULL * (1024 * 1024 * 1024),
150 | 			  max_meta_size, "meta", &meta_size);
151 | 	if (ret)
152 | 		goto out;
153 | 
154 | 	ret = device_size(data_path, data_fd, 8ULL * (1024 * 1024 * 1024),
155 | 			  max_data_size, "data", &data_size);
156 | 	if (ret)
157 | 		goto out;
158 | 
159 | 	/* metadata blocks start after the quorum blocks */
160 | 	next_meta = (SCOUTFS_QUORUM_BLKNO + SCOUTFS_QUORUM_BLOCKS) >>
161 | 		    SCOUTFS_BLOCK_SM_LG_SHIFT;
162 | 	/* rest of meta dev is available for metadata blocks */
163 | 	last_meta = (meta_size >> SCOUTFS_BLOCK_LG_SHIFT) - 1;
164 | 	/* Data blocks go on the data dev */
165 | 	first_data = SCOUTFS_DATA_DEV_START_BLKNO;
166 | 	last_data = (data_size >> SCOUTFS_BLOCK_SM_SHIFT) - 1;
167 | 
168 | 	/* partially initialize the super so we can use it to init others */
169 | 	memset(super, 0, SCOUTFS_BLOCK_SM_SIZE);
170 | 	pseudo_random_bytes(&super->hdr.fsid, sizeof(super->hdr.fsid));
171 | 	super->hdr.magic = cpu_to_le32(SCOUTFS_BLOCK_MAGIC_SUPER);
172 | 	super->hdr.seq = cpu_to_le64(1);
173 | 	super->format_hash = cpu_to_le64(SCOUTFS_FORMAT_HASH);
174 | 	uuid_generate(super->uuid);
175 | 	super->next_ino = cpu_to_le64(SCOUTFS_ROOT_INO + 1);
176 | 	super->next_trans_seq = cpu_to_le64(1);
177 | 	super->total_meta_blocks = cpu_to_le64(last_meta + 1);
178 | 	super->first_meta_blkno = cpu_to_le64(next_meta);
179 | 	super->last_meta_blkno = cpu_to_le64(last_meta);
180 | 	super->total_data_blocks = cpu_to_le64(last_data - first_data + 1);
181 | 	super->first_data_blkno = cpu_to_le64(first_data);
182 | 	super->last_data_blkno = cpu_to_le64(last_data);
183 | 	super->quorum_count = quorum_count;
184 | 
185 | 	/* fs root starts with root inode and its index items */
186 | 	blkno = next_meta++;
187 | 	btree_init_root_single(&super->fs_root, bt, blkno, 1, super->hdr.fsid);
188 | 
189 | 	memset(&key, 0, sizeof(key));
190 | 	key.sk_zone = SCOUTFS_INODE_INDEX_ZONE;
191 | 	key.sk_type = SCOUTFS_INODE_INDEX_META_SEQ_TYPE;
192 | 	key.skii_ino = cpu_to_le64(SCOUTFS_ROOT_INO);
193 | 	btree_append_item(bt, &key, NULL, 0);
194 | 
195 | 	memset(&key, 0, sizeof(key));
196 | 	key.sk_zone = SCOUTFS_FS_ZONE;
197 | 	key.ski_ino = cpu_to_le64(SCOUTFS_ROOT_INO);
198 | 	key.sk_type = SCOUTFS_INODE_TYPE;
199 | 
200 | 	memset(&inode, 0, sizeof(inode));
201 | 	/* an empty directory has 2 links; the dirent constant belongs to the readdir pos */
202 | 	inode.next_readdir_pos = cpu_to_le64(SCOUTFS_DIRENT_FIRST_POS);
203 | 	inode.nlink = cpu_to_le32(2);
204 | 	inode.mode = cpu_to_le32(0755 | 0040000);
205 | 	inode.atime.sec = cpu_to_le64(tv.tv_sec);
206 | 	inode.atime.nsec = cpu_to_le32(tv.tv_usec * 1000);
207 | 	inode.ctime.sec = inode.atime.sec;
208 | 	inode.ctime.nsec = inode.atime.nsec;
209 | 	inode.mtime.sec = inode.atime.sec;
210 | 	inode.mtime.nsec = inode.atime.nsec;
211 | 	btree_append_item(bt, &key, &inode, sizeof(inode));
212 | 
213 | 	bt->hdr.crc = cpu_to_le32(crc_block(&bt->hdr,
214 | 					    SCOUTFS_BLOCK_LG_SIZE));
215 | 
216 | 	ret = write_raw_block(meta_fd, blkno, SCOUTFS_BLOCK_LG_SHIFT, bt);
217 | 	if (ret)
218 | 		goto out;
219 | 
220 | 	/* fill an avail list block for the first server transaction */
221 | 	blkno = next_meta++;
222 | 	lblk = (void *)bt;
223 | 	memset(lblk, 0, SCOUTFS_BLOCK_LG_SIZE);
224 | 
225 | 	lblk->hdr.magic = cpu_to_le32(SCOUTFS_BLOCK_MAGIC_ALLOC_LIST);
226 | 	lblk->hdr.fsid = super->hdr.fsid;
227 | 	lblk->hdr.blkno = cpu_to_le64(blkno);
228 | 	lblk->hdr.seq = cpu_to_le64(1);
229 | 
230 | 	meta_len = (64 * 1024 * 1024) >> SCOUTFS_BLOCK_LG_SHIFT;
231 | 	for (i = 0; i < meta_len; i++) {
232 | 		lblk->blknos[i] = cpu_to_le64(next_meta);
233 | 		next_meta++;
234 | 	}
235 | 	lblk->nr = cpu_to_le32(i);
236 | 
237 | 	super->server_meta_avail[0].ref.blkno = lblk->hdr.blkno;
238 | 	super->server_meta_avail[0].ref.seq = lblk->hdr.seq;
239 | 	super->server_meta_avail[0].total_nr = le32_to_le64(lblk->nr);
240 | 	super->server_meta_avail[0].first_nr = lblk->nr;
241 | 
242 | 	lblk->hdr.crc = cpu_to_le32(crc_block(&lblk->hdr, SCOUTFS_BLOCK_LG_SIZE));
243 | 	ret = write_raw_block(meta_fd, blkno, SCOUTFS_BLOCK_LG_SHIFT, lblk);
244 | 	if (ret)
245 | 		goto out;
246 | 
247 | 	/* the data allocator has a single extent */
248 | 	blkno = next_meta++;
249 | 	ret = write_alloc_root(super, meta_fd, &super->data_alloc, bt,
250 | 			       blkno, first_data,
251 | 			       le64_to_cpu(super->total_data_blocks));
252 | 	if (ret < 0)
253 | 		goto out;
254 | 
255 | 	/*
256 | 	 * Initialize all the meta_alloc roots with an equal portion of
257 | 	 * the free metadata extents, excluding the blocks we're going
258 | 	 * to use for the allocators.
259 | 	 */
260 | 	meta_start = next_meta + array_size(super->meta_alloc);
261 | 	meta_len = DIV_ROUND_UP(last_meta - meta_start + 1,
262 | 			        array_size(super->meta_alloc));
263 | 
264 | 	/* each meta alloc root contains a portion of free metadata extents */
265 | 	for (i = 0; i < array_size(super->meta_alloc); i++) {
266 | 		blkno = next_meta++;
267 | 		ret = write_alloc_root(super, meta_fd, &super->meta_alloc[i], bt,
268 | 				       blkno, meta_start,
269 | 				       min(meta_len,
270 | 					   last_meta - meta_start + 1));
271 | 		if (ret < 0)
272 | 			goto out;
273 | 
274 | 		meta_start += meta_len;
275 | 	}
276 | 
277 | 	/* zero out quorum blocks */
278 | 	for (i = 0; i < SCOUTFS_QUORUM_BLOCKS; i++) {
279 | 		ret = write_raw_block(meta_fd, SCOUTFS_QUORUM_BLKNO + i,
280 | 				      SCOUTFS_BLOCK_SM_SHIFT, zeros);
281 | 		if (ret < 0) {
282 | 			/* ret already holds the negative errno from the write */
283 | 			fprintf(stderr, "error zeroing quorum block: %s (%d)\n",
284 | 				strerror(-ret), -ret);
285 | 			goto out;
286 | 		}
287 | 	}
288 | 
289 | 	/* write the super block to data dev and meta dev*/
290 | 	super->hdr.seq = cpu_to_le64(1);
291 | 	ret = write_block(data_fd, SCOUTFS_SUPER_BLKNO, SCOUTFS_BLOCK_SM_SHIFT,
292 | 			  NULL, &super->hdr);
293 | 	if (ret)
294 | 		goto out;
295 | 
296 | 	if (fsync(data_fd)) {
297 | 		ret = -errno;
298 | 		fprintf(stderr, "failed to fsync '%s': %s (%d)\n",
299 | 			data_path, strerror(errno), errno);
300 | 		goto out;
301 | 	}
302 | 
303 | 	super->flags |= cpu_to_le64(SCOUTFS_FLAG_IS_META_BDEV);
304 | 	ret = write_block(meta_fd, SCOUTFS_SUPER_BLKNO, SCOUTFS_BLOCK_SM_SHIFT,
305 | 			  NULL, &super->hdr);
306 | 	if (ret)
307 | 		goto out;
308 | 
309 | 	if (fsync(meta_fd)) {
310 | 		ret = -errno;
311 | 		fprintf(stderr, "failed to fsync '%s': %s (%d)\n",
312 | 			meta_path, strerror(errno), errno);
313 | 		goto out;
314 | 	}
315 | 
316 | 	uuid_unparse(super->uuid, uuid_str);
317 | 
318 | 	printf("Created scoutfs filesystem:\n"
319 | 	       "  meta device path:     %s\n"
320 | 	       "  data device path:     %s\n"
321 | 	       "  fsid:                 %llx\n"
322 | 	       "  format hash:          %llx\n"
323 | 	       "  uuid:                 %s\n"
324 | 	       "  64KB metadata blocks: "SIZE_FMT"\n"
325 | 	       "  4KB data blocks:      "SIZE_FMT"\n"
326 | 	       "  quorum count:         %u\n",
327 | 		meta_path,
328 | 		data_path,
329 | 		le64_to_cpu(super->hdr.fsid),
330 | 		le64_to_cpu(super->format_hash),
331 | 		uuid_str,
332 | 		SIZE_ARGS(le64_to_cpu(super->total_meta_blocks),
333 | 			  SCOUTFS_BLOCK_LG_SIZE),
334 | 		SIZE_ARGS(le64_to_cpu(super->total_data_blocks),
335 | 			  SCOUTFS_BLOCK_SM_SIZE),
336 | 		super->quorum_count);
337 | 
338 | 	ret = 0;
339 | out:
340 | 	free(super);	/* free(NULL) is a no-op, so no checks are needed */
341 | 	free(bt);
342 | 	free(zeros);
343 | 	return ret;
344 | }
345 | 
346 | static struct option long_ops[] = {	/* only -Q has a long spelling; -D and -M are short-only */
347 | 	{ .name = "quorum_count", .has_arg = required_argument, .flag = NULL, .val = 'Q' },
348 | 	{ .name = NULL, .has_arg = no_argument, .flag = NULL, .val = 0 },
349 | };
350 | 
351 | /* mkfs command: parse -Q/-D/-M and the two device paths, then write the fs. */
352 | static int mkfs_func(int argc, char *argv[])
353 | {
354 | 	unsigned long long ull;
355 | 	u8 quorum_count = 0;
356 | 	u64 max_data_size = 0;
357 | 	u64 max_meta_size = 0;
358 | 	char *end = NULL;
359 | 	char *meta_path;
360 | 	char *data_path;
361 | 	int meta_fd;
362 | 	int data_fd;
363 | 	int ret;
364 | 	int c;
365 | 
366 | 	while ((c = getopt_long(argc, argv, "Q:D:M:", long_ops, NULL)) != -1) {
367 | 		switch (c) {
368 | 		case 'Q':
369 | 			ull = strtoull(optarg, &end, 0);
370 | 			if (*end != '\0' || ull == 0 ||
371 | 			    ull > SCOUTFS_QUORUM_MAX_COUNT) {
372 | 				printf("scoutfs: invalid quorum count '%s'\n",
373 | 					optarg);
374 | 				return -EINVAL;
375 | 			}
376 | 			quorum_count = ull;
377 | 			break;
378 | 		case 'D':
379 | 			ret = parse_human(optarg, &max_data_size);
380 | 			if (ret < 0) {
381 | 				printf("scoutfs: invalid data device size '%s'\n",
382 | 					optarg);
383 | 				return ret;
384 | 			}
385 | 			break;
386 | 		case 'M':
387 | 			ret = parse_human(optarg, &max_meta_size);
388 | 			if (ret < 0) {
389 | 				printf("scoutfs: invalid meta device size '%s'\n",
390 | 					optarg);
391 | 				return ret;
392 | 			}
393 | 			break;
394 | 		case '?':
395 | 		default:
396 | 			return -EINVAL;
397 | 		}
398 | 	}
399 | 
400 | 	if (optind + 2 != argc) {
401 | 		printf("scoutfs: mkfs: paths to metadata and data devices are required\n");
402 | 		return -EINVAL;
403 | 	}
404 | 	if (!quorum_count) {
405 | 		printf("provide quorum count with --quorum_count|-Q option\n");
406 | 		return -EINVAL;
407 | 	}
408 | 
409 | 	meta_path = argv[optind];
410 | 	data_path = argv[optind + 1];
411 | 	meta_fd = open(meta_path, O_RDWR | O_EXCL);
412 | 	if (meta_fd < 0) {
413 | 		ret = -errno;
414 | 		fprintf(stderr, "failed to open metadata device '%s': %s (%d)\n",
415 | 			meta_path, strerror(errno), errno);
416 | 		return ret;
417 | 	}
418 | 
419 | 	data_fd = open(data_path, O_RDWR | O_EXCL);
420 | 	if (data_fd < 0) {
421 | 		ret = -errno;
422 | 		fprintf(stderr, "failed to open data device '%s': %s (%d)\n",
423 | 			data_path, strerror(errno), errno);
424 | 		close(meta_fd); /* don't leak the descriptor opened above */
425 | 		return ret;
426 | 	}
427 | 
428 | 	ret = write_new_fs(meta_path, data_path, meta_fd, data_fd,
429 | 			   quorum_count, max_meta_size, max_data_size);
430 | 	close(meta_fd);
431 | 	close(data_fd);
432 | 
433 | 	return ret;
434 | }
435 | 
436 | static void __attribute__((constructor)) mkfs_ctor(void)
437 | {
438 | 	/* usage mirrors mkfs_func(): -Q is required, then both device paths */
439 | 	cmd_register("mkfs", "-Q <nr> [-M <size>] [-D <size>] <meta_dev> <data_dev>",
440 | 		     "write a new file system", mkfs_func);
441 | 	build_assert(sizeof(uuid_t) == SCOUTFS_UUID_BYTES); /* for lack of some other place */
442 | }
443 | 


--------------------------------------------------------------------------------
/src/parse.c:
--------------------------------------------------------------------------------
  1 | #include <errno.h>
  2 | #include <string.h>
  3 | #include <stdlib.h>
  4 | #include <limits.h>
  5 | #include <stdio.h>
  6 | 
  7 | #include "sparse.h"
  8 | #include "util.h"
  9 | #include "format.h"
 10 | 
 11 | #include "parse.h"
 12 | 
 13 | /*
 14 |  * Convert size with multiplicative suffix to bytes.
 15 |  * e.g. "40M", "10G", "4T"
 16 |  *
 17 |  * These are powers-of-two prefixes - K means 1024 not 1000.
 18 |  *
 19 |  * One can go pretty far with variations but keeping relatively simple for
 20 |  * now: commas, decimals, and multichar suffixes not handled.
 21 |  *
 22 |  * Returns 0 with the byte count in @val_ret, or -EINVAL / -ERANGE with a
 23 |  * message on stderr.
 24 |  */
 25 | int parse_human(char* str, u64 *val_ret)
 26 | {
 27 | 	unsigned long long ull;
 28 | 	char *endptr = NULL;
 29 | 	int sh;
 30 | 
 31 | 	errno = 0; /* strtoull only sets errno on failure, it never clears it */
 32 | 	ull = strtoull(str, &endptr, 0);
 33 | 	if (endptr == str || (ull == ULLONG_MAX && errno == ERANGE)) {
 34 | 		fprintf(stderr, "invalid 64bit value: '%s'\n", str);
 35 | 		*val_ret = 0;
 36 | 		return -EINVAL;
 37 | 	}
 38 | 
 39 | 	switch (*endptr) {
 40 | 	case 'K':
 41 | 		sh = 10;
 42 | 		break;
 43 | 	case 'M':
 44 | 		sh = 20;
 45 | 		break;
 46 | 	case 'G':
 47 | 		sh = 30;
 48 | 		break;
 49 | 	case 'T':
 50 | 		sh = 40;
 51 | 		break;
 52 | 	case 'P':
 53 | 		sh = 50;
 54 | 		break;
 55 | 	case '\0':
 56 | 		sh = 0;
 57 | 		break;
 58 | 	default:
 59 | 		fprintf(stderr, "unknown suffix: '%s'\n", endptr);
 60 | 		return -ERANGE;
 61 | 	}
 62 | 
 63 | 	if (sh && endptr[1] != '\0') { /* only single-char suffixes are handled */
 64 | 		fprintf(stderr, "unknown suffix: '%s'\n", endptr);
 65 | 		return -ERANGE;
 66 | 	}
 67 | 
 68 | 	if (ull > (ULLONG_MAX >> sh)) { /* the shift below would lose high bits */
 69 | 		fprintf(stderr, "size too big: '%s'\n", str);
 70 | 		return -ERANGE;
 71 | 	}
 72 | 
 73 | 	*val_ret = ull << sh;
 74 | 	return 0;
 75 | }
 76 | 
 77 | /* Parse all of @str as an unsigned 64bit value; rejects empty input, trailing junk, overflow. */
 78 | int parse_u64(char *str, u64 *val_ret)
 79 | {
 80 | 	unsigned long long ull;
 81 | 	char *endptr = NULL;
 82 | 
 83 | 	errno = 0; /* so a stale ERANGE can't be mistaken for overflow */
 84 | 	ull = strtoull(str, &endptr, 0);
 85 | 	if (endptr == str || *endptr != '\0' ||
 86 | 	    (ull == ULLONG_MAX && errno == ERANGE)) {
 87 | 		fprintf(stderr, "invalid 64bit value: '%s'\n", str);
 88 | 		*val_ret = 0;
 89 | 		return -EINVAL;
 90 | 	}
 91 | 
 92 | 	*val_ret = ull;
 93 | 	return 0;
 94 | }
 95 | 
 96 | /* Parse all of @str as a signed 64bit value; rejects empty input, trailing junk, overflow. */
 97 | int parse_s64(char *str, s64 *val_ret)
 98 | {
 99 | 	long long ll;
100 | 	char *endptr = NULL;
101 | 
102 | 	errno = 0; /* so a stale ERANGE can't be mistaken for overflow */
103 | 	ll = strtoll(str, &endptr, 0);
104 | 	if (endptr == str || *endptr != '\0' ||
105 | 	    ((ll == LLONG_MIN || ll == LLONG_MAX) && errno == ERANGE)) {
106 | 		fprintf(stderr, "invalid 64bit value: '%s'\n", str);
107 | 		*val_ret = 0;
108 | 		return -EINVAL;
109 | 	}
110 | 
111 | 	*val_ret = ll;
112 | 	return 0;
113 | }
114 | 
115 | /* Parse all of @str as an unsigned 32bit value; 0 or -EINVAL with a message. */
116 | int parse_u32(char *str, u32 *val_ret)
117 | {
118 | 	u64 val;
119 | 	int ret = parse_u64(str, &val);
120 | 
121 | 	if (ret)
122 | 		return ret;
123 | 	if (val > UINT_MAX) {
124 | 		fprintf(stderr, "invalid 32bit value: '%s'\n", str);
125 | 		return -EINVAL;
126 | 	}
127 | 	*val_ret = val;
128 | 	return 0;
129 | }
130 | 
131 | /*
132 |  * Parse "<sec>.<nsec>" into @ts; the part after the dot is a count of
133 |  * nanoseconds.  @ts is zeroed first.  Returns 0 or -EINVAL with a message.
134 |  */
135 | int parse_timespec(char *str, struct timespec *ts)
136 | {
137 | 	unsigned long long sec;
138 | 	unsigned int nsec;
139 | 
140 | 	memset(ts, 0, sizeof(struct timespec));
141 | 	if (sscanf(str, "%llu.%u", &sec, &nsec) != 2) {
142 | 		fprintf(stderr, "invalid timespec string: '%s'\n", str);
143 | 		return -EINVAL;
144 | 	}
145 | 	/* a full second's worth of nanoseconds is already out of range */
146 | 	if (nsec >= 1000000000) {
147 | 		fprintf(stderr, "invalid timespec nsec value: '%s'\n", str);
148 | 		return -EINVAL;
149 | 	}
150 | 
151 | 	ts->tv_sec = sec;
152 | 	ts->tv_nsec = nsec;
153 | 	return 0;
154 | }
155 | 


--------------------------------------------------------------------------------
/src/parse.h:
--------------------------------------------------------------------------------
 1 | #ifndef _PARSE_H_
 2 | #define _PARSE_H_
 3 | 
 4 | #include <sys/time.h>
 5 | 
 6 | int parse_human(char* str, u64 *val_ret);	/* size with optional K/M/G/T/P suffix, e.g. "10G" */
 7 | int parse_u64(char *str, u64 *val_ret);		/* whole string as unsigned 64bit */
 8 | int parse_s64(char *str, s64 *val_ret);		/* whole string as signed 64bit */
 9 | int parse_u32(char *str, u32 *val_ret);		/* as parse_u64, limited to UINT_MAX */
10 | int parse_timespec(char *str, struct timespec *ts);	/* "<sec>.<nsec>" */
11 | 
12 | #endif
13 | 


--------------------------------------------------------------------------------
/src/rand.c:
--------------------------------------------------------------------------------
 1 | #include <string.h>
 2 | 
 3 | #include "rand.h"
 4 | #include "sparse.h"
 5 | #include "util.h"
 6 | 
 7 | #include <openssl/rand.h>
 8 | 
 9 | void pseudo_random_bytes(void *data, unsigned int len)
10 | {	/* fills @len bytes at @data via OpenSSL */
11 | 	RAND_bytes(data, len);	/* NOTE(review): return value ignored, so a failed fill goes unnoticed */
12 | }
13 | 


--------------------------------------------------------------------------------
/src/rand.h:
--------------------------------------------------------------------------------
 1 | #ifndef _RAND_H_
 2 | #define _RAND_H_
 3 | 
 4 | /*
 5 |  * Fill @len bytes at @data with random bytes.  We could play around a bit with
 6 |  * some macros to get aligned constant word sized buffers filled by single instructions.
 7 |  */
 8 | void pseudo_random_bytes(void *data, unsigned int len);
 9 | 
10 | #endif
11 | 


--------------------------------------------------------------------------------
/src/search_xattrs.c:
--------------------------------------------------------------------------------
  1 | #include <unistd.h>
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include <sys/types.h>
  5 | #include <sys/stat.h>
  6 | #include <sys/ioctl.h>
  7 | #include <fcntl.h>
  8 | #include <errno.h>
  9 | #include <string.h>
 10 | #include <getopt.h>
 11 | 
 12 | #include "sparse.h"
 13 | #include "util.h"
 14 | #include "format.h"
 15 | #include "ioctl.h"
 16 | #include "cmd.h"
 17 | 
 18 | static struct option long_ops[] = {
 19 | 	{ "name", 1, NULL, 'n' },
 20 | 	{ "file", 1, NULL, 'f' },
 21 | 	{ NULL, 0, NULL, 0}
 22 | };
 23 | 
/*
 * There are significant constant costs to each search call, we
 * want to get the inodes in as few calls as possible.
 *
 * This is the number of u64 inode numbers in the result buffer that is
 * handed to each search ioctl call.
 */
#define BATCH_SIZE 1000000
 29 | 
 30 | static int search_xattrs_cmd(int argc, char **argv)
 31 | {
 32 | 	struct scoutfs_ioctl_search_xattrs sx;
 33 | 	char *path = NULL;
 34 | 	char *name = NULL;
 35 | 	u64 *inos = NULL;
 36 | 	int fd = -1;
 37 | 	int ret;
 38 | 	int c;
 39 | 	int i;
 40 | 
 41 | 	memset(&sx, 0, sizeof(sx));
 42 | 	inos = malloc(BATCH_SIZE * sizeof(inos[0]));
 43 | 	if (!inos) {
 44 | 		fprintf(stderr, "inos mem alloc failed\n");
 45 | 		ret = -ENOMEM;
 46 | 		goto out;
 47 | 	}
 48 | 
 49 | 	while ((c = getopt_long(argc, argv, "f:n:", long_ops, NULL)) != -1) {
 50 | 		switch (c) {
 51 | 		case 'f':
 52 | 			path = strdup(optarg);
 53 | 			if (!path) {
 54 | 				fprintf(stderr, "path mem alloc failed\n");
 55 | 				ret = -ENOMEM;
 56 | 				goto out;
 57 | 			}
 58 | 			break;
 59 | 		case 'n':
 60 | 			name = strdup(optarg);
 61 | 			if (!name) {
 62 | 				fprintf(stderr, "name mem alloc failed\n");
 63 | 				ret = -ENOMEM;
 64 | 				goto out;
 65 | 			}
 66 | 			break;
 67 | 		case '?':
 68 | 		default:
 69 | 			ret = -EINVAL;
 70 | 			goto out;
 71 | 		}
 72 | 	}
 73 | 
 74 | 	if (path == NULL) {
 75 | 		fprintf(stderr, "must specify -f path to file\n");
 76 | 		ret = -EINVAL;
 77 | 		goto out;
 78 | 	}
 79 | 
 80 | 	if (name == NULL) {
 81 | 		fprintf(stderr, "must specify -n xattr name to search for\n");
 82 | 		ret = -EINVAL;
 83 | 		goto out;
 84 | 	}
 85 | 
 86 | 	fd = open(path, O_RDONLY);
 87 | 	if (fd < 0) {
 88 | 		ret = -errno;
 89 | 		fprintf(stderr, "failed to open '%s': %s (%d)\n",
 90 | 			path, strerror(errno), errno);
 91 | 		goto out;
 92 | 	}
 93 | 
 94 | 	sx.next_ino = 0;
 95 | 	sx.last_ino = U64_MAX;
 96 | 	sx.name_ptr = (unsigned long)name;
 97 | 	sx.inodes_ptr = (unsigned long)inos;
 98 | 	sx.name_bytes = strlen(name);
 99 | 	sx.nr_inodes = BATCH_SIZE;
100 | 
101 | 	do {
102 | 		ret = ioctl(fd, SCOUTFS_IOC_SEARCH_XATTRS, &sx);
103 | 		if (ret == 0)
104 | 			break;
105 | 		if (ret < 0) {
106 | 			ret = -errno;
107 | 			fprintf(stderr, "search_xattrs ioctl failed: "
108 | 				"%s (%d)\n", strerror(errno), errno);
109 | 			goto out;
110 | 		}
111 | 
112 | 		for (i = 0; i < ret; i++)
113 | 			printf("%llu\n", inos[i]);
114 | 
115 | 		sx.next_ino = inos[ret - 1] + 1;
116 | 	} while (!(sx.output_flags & SCOUTFS_SEARCH_XATTRS_OFLAG_END));
117 | 
118 | 	ret = 0;
119 | out:
120 | 	if (fd >= 0)
121 | 		close(fd);
122 | 	free(path);
123 | 	free(name);
124 | 	free(inos);
125 | 
126 | 	return ret;
127 | };
128 | 
/*
 * Constructor, so it runs before main(): hands the "search-xattrs"
 * command name, its argument synopsis, its one line description and
 * its handler to cmd_register().
 */
static void __attribute__((constructor)) search_xattrs_ctor(void)
{
	cmd_register("search-xattrs", "-n name -f <path>",
		     "print inode numbers of inodes which may have given xattr",
		     search_xattrs_cmd);
}
135 | 


--------------------------------------------------------------------------------
/src/setattr.c:
--------------------------------------------------------------------------------
  1 | #include <unistd.h>
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include <sys/types.h>
  5 | #include <sys/stat.h>
  6 | #include <sys/ioctl.h>
  7 | #include <fcntl.h>
  8 | #include <errno.h>
  9 | #include <string.h>
 10 | #include <getopt.h>
 11 | #include <assert.h>
 12 | 
 13 | #include "sparse.h"
 14 | #include "util.h"
 15 | #include "format.h"
 16 | #include "ioctl.h"
 17 | #include "parse.h"
 18 | #include "cmd.h"
 19 | 
 20 | static struct option long_ops[] = {
 21 | 	{ "ctime", 1, NULL, 'c' },
 22 | 	{ "data_version", 1, NULL, 'd' },
 23 | 	{ "file", 1, NULL, 'f' },
 24 | 	{ "offline", 0, NULL, 'o' },
 25 | 	{ "i_size", 1, NULL, 's' },
 26 | 	{ NULL, 0, NULL, 0}
 27 | };
 28 | 
 29 | static int setattr_more_cmd(int argc, char **argv)
 30 | {
 31 | 	struct scoutfs_ioctl_setattr_more sm;
 32 | 	struct timespec ctime;
 33 | 	char *path = NULL;
 34 | 	int ret;
 35 | 	int fd = -1;
 36 | 	int c;
 37 | 
 38 | 	memset(&sm, 0, sizeof(sm));
 39 | 
 40 | 	while ((c = getopt_long(argc, argv, "c:d:f:os:", long_ops, NULL)) != -1) {
 41 | 		switch (c) {
 42 | 		case 'c':
 43 | 			ret = parse_timespec(optarg, &ctime);
 44 | 			if (ret)
 45 | 				goto out;
 46 | 			break;
 47 | 		case 'd':
 48 | 			ret = parse_u64(optarg, &sm.data_version);
 49 | 			if (ret)
 50 | 				goto out;
 51 | 			break;
 52 | 		case 'f':
 53 | 			path = strdup(optarg);
 54 | 			if (!path) {
 55 | 				fprintf(stderr, "path mem alloc failed\n");
 56 | 				ret = -ENOMEM;
 57 | 				goto out;
 58 | 			}
 59 | 			break;
 60 | 		case 'o':
 61 | 			sm.flags |= SCOUTFS_IOC_SETATTR_MORE_OFFLINE;
 62 | 			break;
 63 | 		case 's':
 64 | 			ret = parse_u64(optarg, &sm.i_size);
 65 | 			if (ret)
 66 | 				goto out;
 67 | 			break;
 68 | 		case '?':
 69 | 		default:
 70 | 			ret = -EINVAL;
 71 | 			goto out;
 72 | 		}
 73 | 	}
 74 | 
 75 | 	if (path == NULL) {
 76 | 		fprintf(stderr, "must specify -f path to file\n");
 77 | 		ret = -EINVAL;
 78 | 		goto out;
 79 | 	}
 80 | 
 81 | 	fd = open(path, O_WRONLY);
 82 | 	if (fd < 0) {
 83 | 		ret = -errno;
 84 | 		fprintf(stderr, "failed to open '%s': %s (%d)\n",
 85 | 			path, strerror(errno), errno);
 86 | 		goto out;
 87 | 	}
 88 | 
 89 | 	sm.ctime_sec = ctime.tv_sec;
 90 | 	sm.ctime_nsec = ctime.tv_nsec;
 91 | 
 92 | 	ret = ioctl(fd, SCOUTFS_IOC_SETATTR_MORE, &sm);
 93 | 	if (ret < 0) {
 94 | 		ret = -errno;
 95 | 		fprintf(stderr, "setattr_more ioctl failed on '%s': "
 96 | 			"%s (%d)\n", path, strerror(errno), errno);
 97 | 		goto out;
 98 | 	}
 99 | 
100 | 	ret = 0;
101 | out:
102 | 	if (fd >= 0)
103 | 		close(fd);
104 | 	return ret;
105 | }
106 | 
/*
 * Constructor, so it runs before main(): hands the "setattr" command
 * name, its option synopsis, its one line description and its handler
 * to cmd_register().
 */
static void __attribute__((constructor)) setattr_more_ctor(void)
{
	cmd_register("setattr", "-c ctime -d data_version -o -s i_size -f <path>",
		     "set attributes on file with no data",  
		     setattr_more_cmd);
}
113 | 


--------------------------------------------------------------------------------
/src/sparse.h:
--------------------------------------------------------------------------------
  1 | #ifndef _SPARSE_H_
  2 | #define _SPARSE_H_
  3 | 
  4 | #include <endian.h>
  5 | #include <stdint.h>
  6 | #include <linux/types.h>
  7 | 
/*
 * __force is only a real attribute when __CHECKER__ is defined
 * (presumably by the sparse checker, see sparse.sh -- confirm); in
 * normal builds it expands to nothing so the casts that use it are
 * ordinary casts.
 */
#ifdef __CHECKER__
# undef __force
# define __force		__attribute__((force))
/* sparse seems to get confused by some builtins */
extern int __builtin_ia32_rdrand64_step(unsigned long long *);
extern unsigned int __builtin_ia32_crc32di(unsigned int, unsigned long long);
extern unsigned int __builtin_ia32_crc32si(unsigned int, unsigned int);
extern unsigned int __builtin_ia32_crc32hi(unsigned int, unsigned short);
extern unsigned int __builtin_ia32_crc32qi(unsigned int, unsigned char);

#else
# define __force
#endif
 21 | 
/* short fixed width integer names used throughout the utilities */
typedef unsigned char u8;
typedef unsigned short u16;
typedef unsigned int u32;
typedef int s32;
typedef unsigned long long u64;
typedef signed long long s64;

/* double underscore spellings of the same types */
typedef u8 __u8;
typedef u16 __u16;
typedef u32 __u32;
typedef s32 __s32;
typedef u64 __u64;
typedef s64 __s64;
 35 | 
 36 | static inline u16 ___swab16(u16 x)
 37 | {
 38 | 	return	((x & (u16)0x00ffU) << 8) |
 39 | 		((x & (u16)0xff00U) >> 8);
 40 | }
 41 | 
 42 | static inline u32 ___swab32(u32 x)
 43 | {
 44 | 	return	((x & (u32)0x000000ffUL) << 24) |
 45 | 		((x & (u32)0x0000ff00UL) << 8) |
 46 | 		((x & (u32)0x00ff0000UL) >> 8) |
 47 | 		((x & (u32)0xff000000UL) >> 24);
 48 | }
 49 | 
 50 | static inline u64 ___swab64(u64 x)
 51 | {
 52 | 	return  (u64)((x & (u64)0x00000000000000ffULL) << 56) |
 53 | 		(u64)((x & (u64)0x000000000000ff00ULL) << 40) |
 54 | 		(u64)((x & (u64)0x0000000000ff0000ULL) << 24) |
 55 | 		(u64)((x & (u64)0x00000000ff000000ULL) << 8) |
 56 | 		(u64)((x & (u64)0x000000ff00000000ULL) >> 8) |
 57 | 		(u64)((x & (u64)0x0000ff0000000000ULL) >> 24) |
 58 | 		(u64)((x & (u64)0x00ff000000000000ULL) >> 40) |
 59 | 		(u64)((x & (u64)0xff00000000000000ULL) >> 56);
 60 | }
 61 | 
/*
 * Emit cpu_to_<end><size>() and <end><size>_to_cpu() for the case where
 * the stored endianness matches the host: the value is only cast
 * between u<size> and __<end><size>, no bytes move.
 */
#define __gen_cast_tofrom(end, size)					\
static inline __##end##size cpu_to_##end##size(u##size x)	\
{									\
	return (__force __##end##size)x;				\
}									\
static inline u##size end##size##_to_cpu(__##end##size x)	\
{									\
	return (__force u##size)x;				\
}

/*
 * Emit the same pair of functions for the case where the stored
 * endianness differs from the host: the bytes are reversed with
 * ___swab<size>() in both directions.
 */
#define __gen_swap_tofrom(end, size)					\
static inline __##end##size cpu_to_##end##size(u##size x)	\
{									\
	return (__force __##end##size)___swab##size(x);		\
}									\
static inline u##size end##size##_to_cpu(__##end##size x)	\
{									\
	return ___swab##size((__force u##size) x);		\
}

/* instantiate the 'which' (cast or swap) flavour for 16, 32 and 64 bits */
#define __gen_functions(which, end)	\
	__gen_##which##_tofrom(end, 16)	\
	__gen_##which##_tofrom(end, 32)	\
	__gen_##which##_tofrom(end, 64)

/*
 * Pick the flavours from the host byte order: the host's own order only
 * casts, the other order swaps.  The matching *_ENDIAN_BITFIELD macro
 * is defined alongside.
 */
#if __BYTE_ORDER == __LITTLE_ENDIAN
#define __LITTLE_ENDIAN_BITFIELD
__gen_functions(swap, be)
__gen_functions(cast, le)
#elif __BYTE_ORDER == __BIG_ENDIAN
#define __BIG_ENDIAN_BITFIELD
__gen_functions(swap, le)
__gen_functions(cast, be)
#else
#error "machine is neither BIG_ENDIAN nor LITTLE_ENDIAN"
#endif
 98 | 
/*
 * Emit <end><size>_add_cpu(): add a host order delta to a value stored
 * in the given endianness, in place.  The stored value is converted to
 * host order, added to, and converted back.
 */
#define __gen_add_funcs(end, size)					  \
static inline void end##size##_add_cpu(__##end##size *val, u##size delta) \
{									  \
	*val = cpu_to_##end##size(end##size##_to_cpu(*val) + delta);	  \
}

__gen_add_funcs(le, 16)
__gen_add_funcs(le, 32)
__gen_add_funcs(le, 64)
__gen_add_funcs(be, 16)
__gen_add_funcs(be, 32)
__gen_add_funcs(be, 64)
111 | 
112 | #endif
113 | 


--------------------------------------------------------------------------------
/src/srch.c:
--------------------------------------------------------------------------------
 1 | #include <errno.h>
 2 | 
 3 | #include "sparse.h"
 4 | #include "util.h"
 5 | #include "format.h"
 6 | #include "srch.h"
 7 | 
 8 | /* shifting by width is undefined :/ */
 9 | #define BYTE_MASK(b) ((1ULL << (b << 3)) - 1)
10 | static u64 byte_masks[] = {
11 | 	0, BYTE_MASK(1), BYTE_MASK(2), BYTE_MASK(3),
12 | 	BYTE_MASK(4), BYTE_MASK(5), BYTE_MASK(6), BYTE_MASK(7), U64_MAX,
13 | };
14 | 
15 | static u64 decode_u64(void *buf, int bytes)
16 | {
17 | 	u64 val = get_unaligned_le64(buf) & byte_masks[bytes];
18 | 
19 | 	return (val >> 1) ^ (-(val & 1));
20 | }
21 | 
22 | int srch_decode_entry(void *buf, struct scoutfs_srch_entry *sre,
23 | 		      struct scoutfs_srch_entry *prev)
24 | {
25 | 	u64 diffs[3];
26 | 	u16 lengths;
27 | 	int bytes;
28 | 	int tot;
29 | 	int i;
30 | 
31 | 	lengths = get_unaligned_le16(buf);
32 | 	tot = 2;
33 | 
34 | 	for (i = 0; i < array_size(diffs); i++) {
35 | 		bytes = min(8, lengths & 15);
36 | 		diffs[i] = decode_u64(buf + tot, bytes);
37 | 		tot += bytes;
38 | 		lengths >>= 4;
39 | 	}
40 | 
41 | 	sre->hash = cpu_to_le64(le64_to_cpu(prev->hash) + diffs[0]);
42 | 	sre->ino = cpu_to_le64(le64_to_cpu(prev->ino) + diffs[1]);
43 | 	sre->id = cpu_to_le64(le64_to_cpu(prev->id) + diffs[2]);
44 | 
45 | 	return tot;
46 | }
47 | 


--------------------------------------------------------------------------------
/src/srch.h:
--------------------------------------------------------------------------------
1 | #ifndef _SRCH_H_
2 | #define _SRCH_H_
3 | 
/*
 * Decode the encoded entry at buf into *sre, as differences from *prev.
 * Returns the number of bytes of buf that the entry occupied.
 */
int srch_decode_entry(void *buf, struct scoutfs_srch_entry *sre,
		      struct scoutfs_srch_entry *prev);
6 | 
7 | #endif
8 | 


--------------------------------------------------------------------------------
/src/stage_release.c:
--------------------------------------------------------------------------------
  1 | #include <unistd.h>
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include <sys/types.h>
  5 | #include <sys/stat.h>
  6 | #include <sys/ioctl.h>
  7 | #include <fcntl.h>
  8 | #include <errno.h>
  9 | #include <string.h>
 10 | #include <limits.h>
 11 | 
 12 | #include "sparse.h"
 13 | #include "util.h"
 14 | #include "format.h"
 15 | #include "ioctl.h"
 16 | #include "cmd.h"
 17 | 
 18 | static int stage_cmd(int argc, char **argv)
 19 | {
 20 | 	struct scoutfs_ioctl_stage args;
 21 | 	unsigned int buf_len = 1024 * 1024;
 22 | 	unsigned int bytes;
 23 | 	char *endptr = NULL;
 24 | 	char *buf = NULL;
 25 | 	int afd = -1;
 26 | 	int fd = -1;
 27 | 	u64 offset;
 28 | 	u64 count;
 29 | 	u64 vers;
 30 | 	int ret;
 31 | 
 32 | 	if (argc != 6) {
 33 | 		fprintf(stderr, "must specify moar args\n");
 34 | 		return -EINVAL;
 35 | 	}
 36 | 
 37 | 	fd = open(argv[1], O_RDWR);
 38 | 	if (fd < 0) {
 39 | 		ret = -errno;
 40 | 		fprintf(stderr, "failed to open '%s': %s (%d)\n",
 41 | 			argv[1], strerror(errno), errno);
 42 | 		return ret;
 43 | 	}
 44 | 
 45 | 	vers = strtoull(argv[2], &endptr, 0);
 46 | 	if (*endptr != '\0' ||
 47 | 	    ((vers == LLONG_MIN || vers == LLONG_MAX) && errno == ERANGE)) {
 48 | 		fprintf(stderr, "error parsing data version '%s'\n",
 49 | 			argv[2]);
 50 | 		ret = -EINVAL;
 51 | 		goto out;
 52 | 	}
 53 | 
 54 | 	offset = strtoull(argv[3], &endptr, 0);
 55 | 	if (*endptr != '\0' ||
 56 | 	    ((offset == LLONG_MIN || offset == LLONG_MAX) && errno == ERANGE)) {
 57 | 		fprintf(stderr, "error parsing offset '%s'\n",
 58 | 			argv[3]);
 59 | 		ret = -EINVAL;
 60 | 		goto out;
 61 | 	}
 62 | 
 63 | 	count = strtoull(argv[4], &endptr, 0);
 64 | 	if (*endptr != '\0' ||
 65 | 	    ((count == LLONG_MIN || count == LLONG_MAX) && errno == ERANGE)) {
 66 | 		fprintf(stderr, "error parsing count '%s'\n",
 67 | 			argv[4]);
 68 | 		ret = -EINVAL;
 69 | 		goto out;
 70 | 	}
 71 | 
 72 | 	if (count > INT_MAX) {
 73 | 		fprintf(stderr, "count %llu too large, limited to %d\n",
 74 | 			count, INT_MAX);
 75 | 		ret = -EINVAL;
 76 | 		goto out;
 77 | 	}
 78 | 
 79 | 	afd = open(argv[5], O_RDONLY);
 80 | 	if (afd < 0) {
 81 | 		ret = -errno;
 82 | 		fprintf(stderr, "failed to open '%s': %s (%d)\n",
 83 | 			argv[5], strerror(errno), errno);
 84 | 		goto out;
 85 | 	}
 86 | 
 87 | 	buf = malloc(buf_len);
 88 | 	if (!buf) {
 89 | 		fprintf(stderr, "couldn't allocate %u byte buffer\n", buf_len);
 90 | 		ret = -ENOMEM;
 91 | 		goto out;
 92 | 	}
 93 | 
 94 | 	while (count) {
 95 | 
 96 | 		bytes = min(count, buf_len);
 97 | 
 98 | 		ret = read(afd, buf, bytes);
 99 | 		if (ret <= 0) {
100 | 			fprintf(stderr, "archive read returned %d: error %s (%d)\n",
101 | 				ret, strerror(errno), errno);
102 | 			ret = -EIO;
103 | 			goto out;
104 | 		}
105 | 
106 | 		bytes = ret;
107 | 
108 | 		args.data_version = vers;
109 | 		args.buf_ptr = (unsigned long)buf;
110 | 		args.offset = offset;
111 | 		args.count = bytes;
112 | 
113 | 		count -= bytes;
114 | 		offset += bytes;
115 | 
116 | 		ret = ioctl(fd, SCOUTFS_IOC_STAGE, &args);
117 | 		if (ret != bytes) {
118 | 			fprintf(stderr, "stage returned %d, not %u: error %s (%d)\n",
119 | 				ret, bytes, strerror(errno), errno);
120 | 			ret = -EIO;
121 | 			goto out;
122 | 		}
123 | 	}
124 | 
125 | 	ret = 0;
126 | out:
127 | 	free(buf);
128 | 	if (fd > -1)
129 | 		close(fd);
130 | 	if (afd > -1)
131 | 		close(afd);
132 | 	return ret;
133 | };
134 | 
/*
 * Constructor, so it runs before main(): hands the "stage" command
 * name, its argument synopsis, its one line description and its handler
 * to cmd_register().
 */
static void __attribute__((constructor)) stage_ctor(void)
{
	cmd_register("stage", "<file> <vers> <offset> <count> <archive file>",
		     "write archive file contents to offline region", stage_cmd);
}
140 | 
141 | static int release_cmd(int argc, char **argv)
142 | {
143 | 	struct scoutfs_ioctl_release args;
144 | 	char *endptr = NULL;
145 | 	u64 block;
146 | 	u64 count;
147 | 	u64 vers;
148 | 	int ret;
149 | 	int fd;
150 | 
151 | 	if (argc != 5) {
152 | 		fprintf(stderr, "must specify path, data version, offset, and count\n");
153 | 		return -EINVAL;
154 | 	}
155 | 
156 | 	fd = open(argv[1], O_RDWR);
157 | 	if (fd < 0) {
158 | 		ret = -errno;
159 | 		fprintf(stderr, "failed to open '%s': %s (%d)\n",
160 | 			argv[1], strerror(errno), errno);
161 | 		return ret;
162 | 	}
163 | 
164 | 	vers = strtoull(argv[2], &endptr, 0);
165 | 	if (*endptr != '\0' ||
166 | 	    ((vers == LLONG_MIN || vers == LLONG_MAX) && errno == ERANGE)) {
167 | 		fprintf(stderr, "error parsing data version '%s'\n",
168 | 			argv[2]);
169 | 		ret = -EINVAL;
170 | 		goto out;
171 | 	}
172 | 
173 | 	block = strtoull(argv[3], &endptr, 0);
174 | 	if (*endptr != '\0' ||
175 | 	    ((block == LLONG_MIN || block == LLONG_MAX) && errno == ERANGE)) {
176 | 		fprintf(stderr, "error parsing starting 4K block offset '%s'\n",
177 | 			argv[3]);
178 | 		ret = -EINVAL;
179 | 		goto out;
180 | 	}
181 | 
182 | 	count = strtoull(argv[4], &endptr, 0);
183 | 	if (*endptr != '\0' ||
184 | 	    ((count == LLONG_MIN || count == LLONG_MAX) && errno == ERANGE)) {
185 | 		fprintf(stderr, "error parsing length '%s'\n",
186 | 			argv[4]);
187 | 		ret = -EINVAL;
188 | 		goto out;
189 | 	}
190 | 
191 | 	args.block = block;
192 | 	args.count = count;
193 | 	args.data_version = vers;
194 | 
195 | 	ret = ioctl(fd, SCOUTFS_IOC_RELEASE, &args);
196 | 	if (ret < 0) {
197 | 		ret = -errno;
198 | 		fprintf(stderr, "release ioctl failed: %s (%d)\n",
199 | 			strerror(errno), errno);
200 | 	}
201 | out:
202 | 	close(fd);
203 | 	return ret;
204 | };
205 | 
/*
 * Constructor, so it runs before main(): hands the "release" command
 * name, its argument synopsis, its one line description and its handler
 * to cmd_register().
 */
static void __attribute__((constructor)) release_ctor(void)
{
	cmd_register("release", "<path> <vers> <4K block offset> <block count>",
		     "mark file region offline and free extents", release_cmd);
}
211 | 


--------------------------------------------------------------------------------
/src/stat.c:
--------------------------------------------------------------------------------
  1 | #include <unistd.h>
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include <sys/types.h>
  5 | #include <sys/stat.h>
  6 | #include <sys/ioctl.h>
  7 | #include <fcntl.h>
  8 | #include <errno.h>
  9 | #include <string.h>
 10 | #include <getopt.h>
 11 | #include <assert.h>
 12 | 
 13 | #include "sparse.h"
 14 | #include "util.h"
 15 | #include "format.h"
 16 | #include "ioctl.h"
 17 | #include "cmd.h"
 18 | 
/*
 * Describes one printable field of an ioctl result struct: the name it
 * is printed and selected by, and its byte offset in the struct.  The
 * offset is what the print functions switch on.
 */
struct stat_more_field {
	char *name;
	size_t offset;
};

/* initializer for a stat_more_field, the name is the stringified field */
#define FIELD(f, o) {	\
	.name = #f,	\
	.offset = o,	\
}

#define INODE_FIELD_OFF(f) offsetof(struct scoutfs_ioctl_stat_more, f)
#define INODE_FIELD(f) FIELD(f, INODE_FIELD_OFF(f))

/* fields printed by the stat command, ended by a NULL name */
static struct stat_more_field inode_fields[] = {
	INODE_FIELD(meta_seq),
	INODE_FIELD(data_seq),
	INODE_FIELD(data_version),
	INODE_FIELD(online_blocks),
	INODE_FIELD(offline_blocks),
	{ NULL, }
};
 40 | 
 41 | static void print_inode_field(void *st, size_t off)
 42 | {
 43 | 	struct scoutfs_ioctl_stat_more *stm = st;
 44 | 
 45 | 	switch(off) {
 46 | 		case INODE_FIELD_OFF(meta_seq):
 47 | 			printf("%llu", stm->meta_seq);
 48 | 			break;
 49 | 		case INODE_FIELD_OFF(data_seq):
 50 | 			printf("%llu", stm->data_seq);
 51 | 			break;
 52 | 		case INODE_FIELD_OFF(data_version):
 53 | 			printf("%llu", stm->data_version);
 54 | 			break;
 55 | 		case INODE_FIELD_OFF(online_blocks):
 56 | 			printf("%llu", stm->online_blocks);
 57 | 			break;
 58 | 		case INODE_FIELD_OFF(offline_blocks):
 59 | 			printf("%llu", stm->offline_blocks);
 60 | 			break;
 61 | 	};
 62 | }
 63 | 
/* same offset/initializer helpers, for struct scoutfs_ioctl_statfs_more */
#define FS_FIELD_OFF(f) offsetof(struct scoutfs_ioctl_statfs_more, f)
#define FS_FIELD(f) FIELD(f, FS_FIELD_OFF(f))

/* fields printed by the statfs command, ended by a NULL name */
static struct stat_more_field fs_fields[] = {
	FS_FIELD(fsid),
	FS_FIELD(rid),
	FS_FIELD(committed_seq),
	FS_FIELD(total_meta_blocks),
	FS_FIELD(total_data_blocks),
	{ NULL, }
};
 75 | 
 76 | static void print_fs_field(void *st, size_t off)
 77 | {
 78 | 	struct scoutfs_ioctl_statfs_more *sfm = st;
 79 | 
 80 | 	switch(off) {
 81 | 		case FS_FIELD_OFF(fsid):
 82 | 			printf("%016llx", sfm->fsid);
 83 | 			break;
 84 | 		case FS_FIELD_OFF(rid):
 85 | 			printf("%016llx", sfm->rid);
 86 | 			break;
 87 | 		case FS_FIELD_OFF(committed_seq):
 88 | 			printf("%llu", sfm->committed_seq);
 89 | 			break;
 90 | 		case FS_FIELD_OFF(total_meta_blocks):
 91 | 			printf("%llu", sfm->total_meta_blocks);
 92 | 			break;
 93 | 		case FS_FIELD_OFF(total_data_blocks):
 94 | 			printf("%llu", sfm->total_data_blocks);
 95 | 			break;
 96 | 	};
 97 | }
 98 | 
/* walk a stat_more_field array up to its terminating NULL name */
#define for_each_field(f, fields) \
	for (f = fields; f->name; f++)

/* prints the field at byte offset 'off' of the result struct at 'st' */
typedef void (*print_field_t)(void *st, size_t off);

/* long spelling of -s, which takes the name of a field as its argument */
static struct option long_ops[] = {
	{ "single_field", 1, NULL, 's' },
	{ NULL, 0, NULL, 0}
};
108 | 
109 | static int do_stat(int argc, char **argv, int is_inode)
110 | {
111 | 	union {
112 | 		struct scoutfs_ioctl_stat_more stm;
113 | 		struct scoutfs_ioctl_statfs_more sfm;
114 | 	} st;
115 | 	struct stat_more_field *single = NULL;
116 | 	struct stat_more_field *fields;
117 | 	struct stat_more_field *fi;
118 | 	char *single_name = NULL;
119 | 	print_field_t pr = NULL;
120 | 	char *path;
121 | 	int cmd;
122 | 	int ret;
123 | 	int fd;
124 | 	int i;
125 | 	int c;
126 | 
127 | 	memset(&st, 0, sizeof(st));
128 | 	if (is_inode) {
129 | 		cmd = SCOUTFS_IOC_STAT_MORE;
130 | 		fields = inode_fields;
131 | 		st.stm.valid_bytes = sizeof(struct scoutfs_ioctl_stat_more);
132 | 		pr = print_inode_field;
133 | 	} else {
134 | 		cmd = SCOUTFS_IOC_STATFS_MORE;
135 | 		fields = fs_fields;
136 | 		st.sfm.valid_bytes = sizeof(struct scoutfs_ioctl_statfs_more);
137 | 		pr = print_fs_field;
138 | 	}
139 | 
140 | 	while ((c = getopt_long(argc, argv, "s:", long_ops, NULL)) != -1) {
141 | 		switch (c) {
142 | 		case 's':
143 | 			single_name = strdup(optarg);
144 | 			assert(single_name);
145 | 			break;
146 | 		case '?':
147 | 		default:
148 | 			return -EINVAL;
149 | 		}
150 | 	}
151 | 
152 | 	if (single_name) {
153 | 		for_each_field(fi, fields) {
154 | 			if (strcmp(fi->name, single_name) == 0) {
155 | 				single = fi;
156 | 				break;
157 | 			}
158 | 		}
159 | 		if (!single) {
160 | 			fprintf(stderr, "unknown field: '%s'\n", single_name);
161 | 			return -EINVAL;
162 | 		}
163 | 	}
164 | 
165 | 	if (optind >= argc) {
166 | 		fprintf(stderr, "must specify at least one path argument\n");
167 | 		return -EINVAL;
168 | 	}
169 | 
170 | 	for (i = optind; i < argc; i++) {
171 | 		path = argv[i];
172 | 
173 | 		fd = open(path, O_RDONLY);
174 | 		if (fd < 0) {
175 | 			ret = -errno;
176 | 			fprintf(stderr, "failed to open '%s': %s (%d)\n",
177 | 				path, strerror(errno), errno);
178 | 			continue;
179 | 		}
180 | 
181 | 		ret = ioctl(fd, cmd, &st);
182 | 		if (ret < 0) {
183 | 			ret = -errno;
184 | 			fprintf(stderr, "ioctl failed on '%s': "
185 | 				"%s (%d)\n", path, strerror(errno), errno);
186 | 
187 | 		} else if (single) {
188 | 			pr(&st, single->offset);
189 | 			printf("\n");
190 | 		} else {
191 | 			printf("%-17s %s\n", "path", path);
192 | 			for_each_field(fi, fields) {
193 | 				printf("%-17s ", fi->name);
194 | 				pr(&st, fi->offset);
195 | 				printf("\n");
196 | 			}
197 | 		}
198 | 
199 | 		close(fd);
200 | 	}
201 | 
202 | 	return 0;
203 | }
204 | 
/* stat command handler: do_stat() in its per-inode mode */
static int stat_more_cmd(int argc, char **argv)
{
	return do_stat(argc, argv, 1);
}
209 | 
/* statfs command handler: do_stat() in its file system mode */
static int statfs_more_cmd(int argc, char **argv)
{
	return do_stat(argc, argv, 0);
}
214 | 
/*
 * Constructor, so it runs before main(): hands the "stat" command name,
 * its argument synopsis, its one line description and its handler to
 * cmd_register().
 */
static void __attribute__((constructor)) stat_more_ctor(void)
{
	cmd_register("stat", "<path>",
		     "show scoutfs inode information", stat_more_cmd);
}
220 | 
/*
 * Constructor, so it runs before main(): hands the "statfs" command
 * name, its argument synopsis, its one line description and its handler
 * to cmd_register().
 */
static void __attribute__((constructor)) statfs_more_ctor(void)
{
	cmd_register("statfs", "<path>",
		     "show scoutfs file system information", statfs_more_cmd);
}
226 | 


--------------------------------------------------------------------------------
/src/util.h:
--------------------------------------------------------------------------------
  1 | #ifndef _UTIL_H_
  2 | #define _UTIL_H_
  3 | 
  4 | #include <unistd.h>
  5 | #include <stdlib.h>
  6 | #include <string.h>
  7 | #include <math.h>
  8 | 
  9 | #include "sparse.h"
 10 | 
/*
 * Fail the build if the constant condition is false: the array type
 * then has a negative size, which the compiler rejects.  When the
 * condition is true this is only an unevaluated sizeof and generates
 * no code at run time.
 */
#define build_assert(cond) ((void)sizeof(char[1 - 2*!(cond)]))
 16 | 
/*
 * Smaller of a and b.  Each argument is evaluated exactly once into a
 * temporary of its own type, so arguments with side effects are safe.
 */
#define min(a, b) 		\
({				\
	__typeof__(a) _a = (a);	\
	__typeof__(b) _b = (b);	\
				\
	_a < _b ? _a : _b;	\
})

/* Larger of a and b, with the same single evaluation of each argument. */
#define max(a, b) 		\
({				\
	__typeof__(a) _a = (a);	\
	__typeof__(b) _b = (b);	\
				\
	_a > _b ? _a : _b;	\
})

/* Exchange the values of the lvalues a and b through a temporary. */
#define swap(a, b) 		\
do {				\
	__typeof__(a) _t = (a);	\
 	(a) = (b);		\
 	(b) = (_t);		\
} while (0)
 39 | 
/* number of elements in a true array, not valid for pointers */
#define array_size(arr) (sizeof(arr) / sizeof(arr[0]))

/* shorthand for the packed attribute */
#define __packed __attribute__((packed))
 43 | 
/*
 * Round the 'a' value up to the next 'b' power of two boundary.  It
 * casts the mask to the value type before masking to avoid truncation
 * problems.
 */
#define round_up(a, b)			\
({					\
	__typeof__(a) _b = (b);		\
					\
	((a) + _b - 1) & ~(_b - 1);	\
})
/* Round the 'a' value down to a 'b' power of two boundary, likewise. */
#define round_down(a, b)		\
({					\
	__typeof__(a) _b = (b);		\
					\
	((a) & ~(_b - 1));		\
})

/* x divided by y, rounded up; both arguments are evaluated more than once */
#define DIV_ROUND_UP(x, y)  (((x) + (y) - 1) / (y))
/* x rounded up to a multiple of the power of two y, with no type fixups */
#define ALIGN(x, y)  (((x) + (y) - 1) & ~((y) - 1))
 64 | 
/* fallback for when no header has provided offsetof() */
#ifndef offsetof
#define offsetof(type, memb) ((unsigned long)&((type *)0)->memb)
#endif

/*
 * Given a pointer to the member 'memb' of a 'type', return a pointer to
 * the 'type' that contains it.
 */
#define container_of(ptr, type, memb) \
	((type *)((void *)(ptr) - offsetof(type, memb)))
 71 | 
/* bits in a long on the machine being built for */
#define BITS_PER_LONG (sizeof(long) * 8)
/* largest values of the unsigned types: all ones truncated to the width */
#define U8_MAX ((u8)~0ULL)
#define U16_MAX ((u16)~0ULL)
#define U32_MAX ((u32)~0ULL)
#define U64_MAX ((u64)~0ULL)
 77 | 
 78 | #define flsll(x)					\
 79 | ({							\
 80 | 	unsigned long long _x = (x);			\
 81 | 							\
 82 | 	(_x == 0 ? 0 : 64 - __builtin_clzll(_x));	\
 83 | })
 84 | 
 85 | #define ilog2(x)					\
 86 | ({							\
 87 | 	((unsigned long)log2l((long double)x));		\
 88 | })
 89 | 
/*
 * Emit get_unaligned_le<nr>(): read a little-endian value of 'nr' bits
 * from a buffer with no alignment requirement and return it in host
 * order.  The bytes are copied out with memcpy() before converting.
 */
#define emit_get_unaligned_le(nr)			\
static inline __u##nr get_unaligned_le##nr(void *buf)	\
{							\
	__le##nr x;					\
	memcpy(&x, buf, sizeof(x));			\
	return le##nr##_to_cpu(x);			\
}
emit_get_unaligned_le(16)
emit_get_unaligned_le(32)
emit_get_unaligned_le(64)
100 | 
/*
 * return -1,0,+1 based on the memcmp comparison of the minimum of their
 * two lengths.  If their min shared bytes are equal but the lengths
 * are not then the larger length is considered greater.
 *
 * memcmp() only promises the sign of its result, so the result is
 * reduced to exactly -1, 0 or +1 here, and the lengths are compared
 * rather than subtracted.
 */
static inline int memcmp_lens(const void *a, int a_len,
			      const void *b, int b_len)
{
	unsigned int len = a_len < b_len ? a_len : b_len;
	int cmp = memcmp(a, b, len);

	if (cmp == 0)
		return (a_len > b_len) - (a_len < b_len);

	return cmp < 0 ? -1 : 1;
}
113 | 
114 | #endif
115 | 


--------------------------------------------------------------------------------
/src/waiting.c:
--------------------------------------------------------------------------------
  1 | #include <unistd.h>
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include <sys/types.h>
  5 | #include <sys/stat.h>
  6 | #include <sys/ioctl.h>
  7 | #include <fcntl.h>
  8 | #include <errno.h>
  9 | #include <string.h>
 10 | #include <limits.h>
 11 | 
 12 | #include "sparse.h"
 13 | #include "util.h"
 14 | #include "format.h"
 15 | #include "ioctl.h"
 16 | #include "cmd.h"
 17 | #include "parse.h"
 18 | 
/* largest errno magnitude that data-wait-err accepts as its <err> */
#ifndef MAX_ERRNO
#define MAX_ERRNO 4095
#endif

/* printf format for one op_str() use, which expands to two strings */
#define OP_FMT "%s%s"

/*
 * Print the caller's string for the bit if it's set, and if it's set
 * and there are more significant bits coming then we also print a
 * separating comma.
 */
#define op_str(ops, bit, str)				\
	(((ops) & (bit)) ? (str) : ""),			\
	(((ops) & (bit)) && ((ops) & ~(((bit) << 1) - 1)) ? "," : "")
 33 | 
 34 | static int waiting_cmd(int argc, char **argv)
 35 | {
 36 | 	struct scoutfs_ioctl_data_waiting_entry dwe[16];
 37 | 	struct scoutfs_ioctl_data_waiting idw;
 38 | 	int ret;
 39 | 	int fd;
 40 | 	int i;
 41 | 
 42 | 	if (argc != 4) {
 43 | 		fprintf(stderr, "must specify ino, iblock, and path\n");
 44 | 		return -EINVAL;
 45 | 	}
 46 | 
 47 | 	ret = parse_u64(argv[1], &idw.after_ino) ?:
 48 | 	      parse_u64(argv[2], &idw.after_iblock);
 49 | 	if (ret)
 50 | 		return ret;
 51 | 
 52 | 	fd = open(argv[3], O_RDONLY);
 53 | 	if (fd < 0) {
 54 | 		ret = -errno;
 55 | 		fprintf(stderr, "failed to open '%s': %s (%d)\n",
 56 | 			argv[3], strerror(errno), errno);
 57 | 		return ret;
 58 | 	}
 59 | 
 60 | 	idw.flags = 0;
 61 | 	idw.ents_ptr = (unsigned long)dwe;
 62 | 	idw.ents_nr = array_size(dwe);
 63 | 
 64 | 	for (;;) {
 65 | 		ret = ioctl(fd, SCOUTFS_IOC_DATA_WAITING, &idw);
 66 | 		if (ret < 0) {
 67 | 			ret = -errno;
 68 | 			fprintf(stderr, "waiting ioctl failed: %s (%d)\n",
 69 | 				strerror(errno), errno);
 70 | 			break;
 71 | 		} else if (ret == 0) {
 72 | 			break;
 73 | 		}
 74 | 
 75 | 		for (i = 0; i < ret; i++)
 76 | 			printf("ino %llu iblock %llu ops "
 77 | 			       OP_FMT OP_FMT OP_FMT"\n",
 78 | 			       dwe[i].ino, dwe[i].iblock,
 79 | 			       op_str(dwe[i].op, SCOUTFS_IOC_DWO_READ,
 80 | 				      "read"),
 81 | 			       op_str(dwe[i].op, SCOUTFS_IOC_DWO_WRITE,
 82 | 				      "write"),
 83 | 			       op_str(dwe[i].op, SCOUTFS_IOC_DWO_CHANGE_SIZE,
 84 | 				      "change_size"));
 85 | 
 86 | 		idw.after_ino = dwe[i - 1].ino;
 87 | 		idw.after_iblock = dwe[i - 1].iblock;
 88 | 	}
 89 | 
 90 | 	close(fd);
 91 | 	return ret;
 92 | };
 93 | 
/*
 * Constructor, so it runs before main(): hands the "data-waiting"
 * command name, its argument synopsis, its one line description and its
 * handler to cmd_register().
 */
static void __attribute__((constructor)) waiting_ctor(void)
{
	cmd_register("data-waiting", "<ino> <iblock> <path>",
		     "print ops waiting for data blocks", waiting_cmd);
}
 99 | 
100 | static int data_wait_err_cmd(int argc, char **argv)
101 | {
102 | 	struct scoutfs_ioctl_data_wait_err args;
103 | 	int fd = -1;
104 | 	int ret;
105 | 
106 | 	memset(&args, 0, sizeof(args));
107 | 
108 | 	if (argc != 8) {
109 | 		fprintf(stderr, "must specify path, ino, version, offset, count,op, and err\n");
110 | 		return -EINVAL;
111 | 	}
112 | 
113 | 	ret = parse_u64(argv[2], &args.ino) ?:
114 | 	      parse_u64(argv[3], &args.data_version) ?:
115 | 	      parse_u64(argv[4], &args.offset) ?:
116 | 	      parse_u64(argv[5], &args.count) ?:
117 | 	      parse_s64(argv[7], &args.err);
118 | 	if (ret)
119 | 		return ret;
120 | 
121 | 	if ((args.err >= 0) || (args.err < -MAX_ERRNO)) {
122 | 		fprintf(stderr, "err %lld invalid\n", args.err);
123 | 		ret = -EINVAL;
124 | 		goto out;
125 | 	}
126 | 
127 | 	if (!strcmp(argv[6], "read")) {
128 | 		args.op = SCOUTFS_IOC_DWO_READ;
129 | 	} else if (!strcmp(argv[6], "write")) {
130 | 		args.op = SCOUTFS_IOC_DWO_WRITE;
131 | 	} else if (!strcmp(argv[6], "change_size")) {
132 | 		args.op = SCOUTFS_IOC_DWO_CHANGE_SIZE;
133 | 	} else {
134 | 		fprintf(stderr, "invalid data wait op: '%s'\n", argv[6]);
135 | 		return -EINVAL;
136 | 	}
137 | 
138 | 	fd = open(argv[1], O_RDONLY);
139 | 	if (fd < 0) {
140 | 		ret = -errno;
141 | 		fprintf(stderr, "failed to open '%s': %s (%d)\n",
142 | 			argv[1], strerror(errno), errno);
143 | 		return ret;
144 | 	}
145 | 
146 | 	ret = ioctl(fd, SCOUTFS_IOC_DATA_WAIT_ERR, &args);
147 | 	if (ret < 0) {
148 | 		fprintf(stderr, "data_wait_err returned %d: error %s (%d)\n",
149 | 			ret, strerror(errno), errno);
150 | 		ret = -EIO;
151 | 		goto out;
152 | 	}
153 | 	printf("data_wait_err found %d waiters.\n", ret);
154 | 
155 | out:
156 | 	if (fd > -1)
157 | 		close(fd);
158 | 	return ret;
159 | };
160 | 
161 | static void __attribute__((constructor)) data_wait_err_ctor(void)
162 | {
163 | 	cmd_register("data-wait-err", "<path> <ino> <vers> <offset> <count> <op> <err>",
164 | 		     "return error from matching waiters",
165 | 		     data_wait_err_cmd);
166 | }
167 | 


--------------------------------------------------------------------------------
/src/walk_inodes.c:
--------------------------------------------------------------------------------
  1 | #include <unistd.h>
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include <sys/types.h>
  5 | #include <sys/stat.h>
  6 | #include <sys/ioctl.h>
  7 | #include <fcntl.h>
  8 | #include <errno.h>
  9 | #include <string.h>
 10 | #include <limits.h>
 11 | 
 12 | #include "sparse.h"
 13 | #include "util.h"
 14 | #include "format.h"
 15 | #include "ioctl.h"
 16 | #include "cmd.h"
 17 | 
 18 | /*
 19 |  * Parse the command line specification of a walk inodes entry of the
 20 |  * form "major.minor.ino".  At least one value must be given, the rest
 21 |  * default to 0.
 22 |  */
 23 | static int parse_walk_entry(struct scoutfs_ioctl_walk_inodes_entry *ent,
 24 | 			    char *str)
 25 | {
 26 | 	char *endptr;
 27 | 	char *c;
 28 | 	u64 ull;
 29 | 	u64 minor = 0;
 30 | 	u64 *val;
 31 | 
 32 | 	memset(ent, 0, sizeof(*ent));
 33 | 	val = &ent->major;
 34 | 
 35 | 	for (;;) {
 36 | 		c = index(str, '.');
 37 | 		if (c)
 38 | 			*c = '\0';
 39 | 
 40 | 		endptr = NULL;
 41 | 		ull = strtoull(str, &endptr, 0);
 42 | 		if (*endptr != '\0' ||
 43 | 		    ((ull == LLONG_MIN || ull == LLONG_MAX) &&
 44 | 		     errno == ERANGE) ||
 45 | 		    (val == &minor && (*val < INT_MIN || *val > INT_MAX))) {
 46 | 			fprintf(stderr, "bad index pos at '%s'\n", str);
 47 | 			return -EINVAL;
 48 | 		}
 49 | 
 50 | 		*val = ull;
 51 | 
 52 | 		if (val == &ent->major)
 53 | 			val = &minor;
 54 | 		else if (val == &minor)
 55 | 			val = &ent->ino;
 56 | 		else
 57 | 			break;
 58 | 
 59 | 		if (c)
 60 | 			str = c + 1;
 61 | 		else
 62 | 			break;
 63 | 	}
 64 | 
 65 | 	ent->minor = minor;
 66 | 	return 0;
 67 | }
 68 | 
 69 | static int walk_inodes_cmd(int argc, char **argv)
 70 | {
 71 | 	struct scoutfs_ioctl_walk_inodes_entry ents[128];
 72 | 	struct scoutfs_ioctl_walk_inodes walk;
 73 | 	u64 total = 0;
 74 | 	int ret;
 75 | 	int fd;
 76 | 	int i;
 77 | 
 78 | 	if (argc != 5) {
 79 | 		fprintf(stderr, "must specify seq and path\n");
 80 | 		return -EINVAL;
 81 | 	}
 82 | 
 83 | 	if (!strcasecmp(argv[1], "meta_seq"))
 84 | 		walk.index = SCOUTFS_IOC_WALK_INODES_META_SEQ;
 85 | 	else if (!strcasecmp(argv[1], "data_seq"))
 86 | 		walk.index = SCOUTFS_IOC_WALK_INODES_DATA_SEQ;
 87 | 	else {
 88 | 		fprintf(stderr, "unknown index '%s', try 'meta_seq' or "
 89 | 				"'data_seq'\n", argv[1]);
 90 | 		return -EINVAL;
 91 | 	}
 92 | 
 93 | 	ret = parse_walk_entry(&walk.first, argv[2]);
 94 | 	if (ret) {
 95 | 		fprintf(stderr, "invalid first position '%s', try '1.2.3' or "
 96 | 			"'-1'\n", argv[2]);
 97 | 		return -EINVAL;
 98 | 
 99 | 	}
100 | 
101 | 	ret = parse_walk_entry(&walk.last, argv[3]);
102 | 	if (ret) {
103 | 		fprintf(stderr, "invalid last position '%s', try '1.2.3' or "
104 | 			"'-1'\n", argv[3]);
105 | 		return -EINVAL;
106 | 
107 | 	}
108 | 
109 | 	fd = open(argv[4], O_RDONLY);
110 | 	if (fd < 0) {
111 | 		ret = -errno;
112 | 		fprintf(stderr, "failed to open '%s': %s (%d)\n",
113 | 			argv[4], strerror(errno), errno);
114 | 		return ret;
115 | 	}
116 | 
117 | 	walk.entries_ptr = (unsigned long)ents;
118 | 	walk.nr_entries = array_size(ents);
119 | 
120 | 	for (;;) {
121 | 		ret = ioctl(fd, SCOUTFS_IOC_WALK_INODES, &walk);
122 | 		if (ret < 0) {
123 | 			ret = -errno;
124 | 			fprintf(stderr, "walk_inodes ioctl failed: %s (%d)\n",
125 | 				strerror(errno), errno);
126 | 			break;
127 | 		} else if (ret == 0) {
128 | 			break;
129 | 		}
130 | 
131 | 		for (i = 0; i < ret; i++) {
132 | 			if ((total + i) % 25 == 0)
133 | 				printf("%-20s %-20s %-10s %-20s\n",
134 | 				       "#", "major", "minor", "ino");
135 | 
136 | 			printf("%-20llu %-20llu %-10u %-20llu\n",
137 | 			       total + i, ents[i].major, ents[i].minor,
138 | 			       ents[i].ino);
139 | 		}
140 | 
141 | 		total += i;
142 | 
143 | 		walk.first = ents[i - 1];
144 | 		if (++walk.first.ino == 0 && ++walk.first.minor == 0)
145 | 			walk.first.major++;
146 | 	}
147 | 
148 | 	close(fd);
149 | 	return ret;
150 | };
151 | 
152 | static void __attribute__((constructor)) walk_inodes_ctor(void)
153 | {
154 | 	cmd_register("walk-inodes", "<index> <first> <last> <path>",
155 | 		     "print range of indexed inodes", walk_inodes_cmd);
156 | }
157 | 


--------------------------------------------------------------------------------
/tex/.gitignore:
--------------------------------------------------------------------------------
1 | missfont.log
2 | *.fls
3 | *.aux
4 | *.d
5 | *.d
6 | *.fdb_latexmk
7 | *.log
8 | *.pdf
9 | 


--------------------------------------------------------------------------------
/tex/Makefile:
--------------------------------------------------------------------------------
 1 | #
 2 | # # dnf install latexmk texlive
 3 | # # make
 4 | #
 5 | # Tools
 6 | LATEXMK = latexmk
 7 | RM = rm -f
 8 | 
 9 | # Project-specific settings
10 | DOCNAME = scoutfs
11 | 
12 | # Targets
13 | all: doc
14 | doc: pdf
15 | pdf: $(DOCNAME).pdf
16 | 
17 | # Rules
18 | %.pdf: %.tex
19 | 	$(LATEXMK) -pdf -M -MP -MF $*.d $*
20 | 
21 | mostlyclean:
22 | 	$(LATEXMK) -silent -c
23 | 	$(RM) *.bbl
24 | 
25 | clean: mostlyclean
26 | 	$(LATEXMK) -silent -C
27 | 	$(RM) *.run.xml *.synctex.gz
28 | 	$(RM) *.d
29 | 
30 | .PHONY: all clean doc mostlyclean pdf
31 | 
32 | # Include auto-generated dependencies
33 | -include *.d
34 | 


--------------------------------------------------------------------------------
/tex/scoutfs.tex:
--------------------------------------------------------------------------------
  1 | % This was derived from the usenix templates, whose introductory
  2 | % comment is as follows:
  3 | %
  4 | % TEMPLATE for Usenix papers, specifically to meet requirements of
  5 | %  USENIX '05
  6 | % originally a template for producing IEEE-format articles using LaTeX.
  7 | %   written by Matthew Ward, CS Department, Worcester Polytechnic Institute.
  8 | % adapted by David Beazley for his excellent SWIG paper in Proceedings,
  9 | %   Tcl 96
 10 | % turned into a smartass generic template by De Clarke, with thanks to
 11 | %   both the above pioneers
 12 | % use at your own risk.  Complaints to /dev/null.
 13 | % make it two column with no page numbering, default is 10 point
 14 | 
 15 | % Munged by Fred Douglis <douglis@research.att.com> 10/97 to separate
 16 | % the .sty file from the LaTeX source template, so that people can
 17 | % more easily include the .sty file into an existing document.  Also
 18 | % changed to more closely follow the style guidelines as represented
 19 | % by the Word sample file. 
 20 | 
 21 | % Note that since 2010, USENIX does not require endnotes. If you want
 22 | % foot of page notes, don't include the endnotes package in the 
 23 | % usepackage command, below.
 24 | 
 25 | % This version uses the latex2e styles, not the very ancient 2.09 stuff.
 26 | \documentclass[letterpaper,twocolumn,10pt]{article}
 27 | \usepackage{usenix2019,epsfig}
 28 | \begin{document}
 29 | 
 30 | %don't want date printed
 31 | \date{}
 32 | 
 33 | %make title bold and 14 pt font (Latex default is non-bold, 16 pt)
 34 | \title{\Large \bf scoutfs : A Scalable Archival Filesystem}
 35 | 
 36 | %for single author (just remove % characters)
 37 | \author{
 38 | {\rm Zach Brown}\\
 39 | Versity Software, Inc.
 40 | }
 41 | 
 42 | \maketitle
 43 | 
 44 | % Use the following at camera-ready time to suppress page numbers.
 45 | % Comment it out when you first submit the paper for review.
 46 | % \thispagestyle{empty}
 47 | 
 48 | \section{Metadata Items}
 49 | 
 50 | scoutfs stores filesystem metadata in items that are identified by a
 51 | key and contain a variable length value payload.\\
 52 | 
 53 | Every key uses a generic structure with a fixed number of fields.
 54 | 
 55 | {\tt \small
 56 | \begin{verbatim}
 57 | struct scoutfs_key {
 58 |         __u8    sk_zone;
 59 |         __le64  _sk_first;
 60 |         __u8    sk_type;
 61 |         __le64  _sk_second;
 62 |         __le64  _sk_third;
 63 |         __u8    _sk_fourth;
 64 | };
 65 | \end{verbatim}
 66 | }
 67 | 
 68 | Using a shared key struct lets us sort all the metadata items in the
 69 | filesystem in one key space regardless of their form or function.  The
 70 | generic keys are displayed, sorted, and computed (incrementing, finding
 71 | difference) without needing to know the specific fields of each item
 72 | type.
 73 | 
 74 | Different structures are identified by their zone and type pair.  They
 75 | then map their type's fields to the remaining generic fields to
 76 | determine the sorting of the item keys within their type.
 77 | 
 78 | For example, when storing inodes we use the {\tt SCOUTFS\_FS\_ZONE} and
 79 | {\tt SCOUTFS\_INODE\_TYPE} and put the inode number in the first generic
 80 | key field.
 81 | 
 82 | {\tt \small
 83 | \begin{verbatim}
 84 |         #define ski_ino _sk_first
 85 | \end{verbatim}
 86 | }
 87 | 
 88 | {\tt \small
 89 | \begin{verbatim}
 90 |         key.sk_zone = SCOUTFS_FS_ZONE;
 91 |         key.ski_ino = ino;
 92 |         key.sk_type = SCOUTFS_INODE_TYPE;
 93 | \end{verbatim}
 94 | }
 95 | 
 96 | Continuing this example, metadata that is associated with inodes also
 97 | uses the {\tt SCOUTFS\_FS\_ZONE} and stores the inode number in {\tt
 98 | \_sk\_first} but then has different type values.  For example {\tt
 99 | SCOUTFS\_XATTR\_TYPE} or {\tt SCOUTFS\_SYMLINK\_TYPE}.  When the items'
100 | keys are sorted we end up with all the items for a given inode stored
101 | near each other.
102 | 
103 | \subsection{Directory Entries}
104 | 
105 | A directory entry is stored in three different metadata items, each with
106 | a different key and used for a different purpose.  Each item shares the
107 | same key format and directory entry value payload, however.
108 | 
109 | The key stores the entry's directory inode number and major and minor
110 | values associated with the type of directory entry being stored.
111 | 
112 | {\tt \small
113 | \begin{verbatim}
114 |         #define skd_ino         _sk_first
115 |         #define skd_major       _sk_second
116 |         #define skd_minor       _sk_third
117 | \end{verbatim}
118 | }
119 | 
120 | The value contains a directory entry struct with all the metadata
121 | associated with a directory entry, including the full entry name.
122 | 
123 | {\tt \small
124 | \begin{verbatim}
125 | struct scoutfs_dirent {
126 |         __le64 ino;
127 |         __le64 hash;
128 |         __le64 pos;
129 |         __u8 type;
130 |         __u8 name[0];
131 | };
132 | \end{verbatim}
133 | }
134 | 
135 | Each item contains a full copy of the item value.  This duplicates
136 | storage across each item type but also lets each operation be satisfied
137 | by one item lookup.  Once the item value is obtained its fields can be
138 | used to construct the keys for each of the items associated with the
139 | entry.
140 | 
141 | \subsubsection{Directory Entry Lookup Items}
142 | 
143 | {\tt \small
144 | \begin{verbatim}
145 |         key.sk_zone = SCOUTFS_FS_ZONE;
146 |         key.skd_ino = dir_ino;
147 |         key.sk_type = SCOUTFS_DIRENT_TYPE;
148 |         key.skd_major = hash(entry_name);
149 |         key.skd_minor = dir_pos;
150 | \end{verbatim}
151 | }
152 | 
153 | Lookup entries are stored in the parent directory at the hash of the
154 | name of the entry.  These entries are used to map names to inode numbers
155 | during path traversal.
156 | 
157 | The major key value is set to a 64bit hash of the file name.  These hash
158 | values can collide so the minor key value is set to the readdir position
159 | in the directory of the entry.  This readdir position is unique for
160 | every entry and ensures that keys are unique when hash values collide.
161 | 
162 | A name lookup is performed by iterating over all the keys with the major
163 | that matches the hashed name.  The full name in the dirent value struct
164 | is compared to the search name.  It will be very rare to have more than
165 | one item with a given hash value.
166 | 
167 | \subsubsection{Directory Entry Readdir Items}
168 | 
169 | {\tt \small
170 | \begin{verbatim}
171 |         key.sk_zone = SCOUTFS_FS_ZONE;
172 |         key.skd_ino = dir_ino;
173 |         key.sk_type = SCOUTFS_READDIR_TYPE;
174 |         key.skd_major = dir_pos;
175 |         key.skd_minor = 0;
176 | \end{verbatim}
177 | }
178 | 
179 | Readdir entries are used to iterate over entries for the readdir()
180 | call.  By providing a unique 64bit {\tt dir\_pos} for each entry we avoid
181 | having to track multiple entries for a given readdir position value.
182 | 
183 | readdir() returns entries in {\tt dir\_pos} order which depends on entry
184 | creation order and matches inode allocation order.  Accessing the inodes
185 | that are referenced by the entries returned from readdir() will result
186 | in efficient forward iteration over the readdir and inode items,
187 | assuming that files were simply created.
188 | 
189 | Renaming files or creating hard links to existing files creates a new
190 | entry but can't reassign the inode number and can result in mismatched
191 | access patterns of the readdir entry items and the inode items.
192 | 
193 | \subsubsection{Directory Entry Link Backref Items}
194 | 
195 | {\tt \small
196 | \begin{verbatim}
197 |         key.sk_zone = SCOUTFS_FS_ZONE;
198 |         key.skd_ino = target_ino;
199 |         key.sk_type = SCOUTFS_LINK_BACKREF_TYPE;
200 |         key.skd_major = dir_ino;
201 |         key.skd_minor = dir_pos;
202 | \end{verbatim}
203 | }
204 | 
205 | Link backref entry items are stored with the target inode number and the
206 | inode number and readdir position of the entry in its directory.
207 | They're used to iterate over all the entries that refer to a given
208 | inode.  Full relative paths from the root directory to a target inode
209 | can be constructed by walking up through each parent entry as it's
210 | discovered.
211 | 
212 | Both inode numbers and readdir positions are allocated by strictly
213 | increasing the next free number.  Old inode numbers or readdir positions
214 | are never reused.  This means that resolving paths for existing inodes
215 | will always walk keys that are strictly sorted less than the keys that
216 | will be created as new files are created.  This tends to isolate read
217 | access patterns during background archival policy processing from write
218 | access patterns during new file creation and increases performance by
219 | reducing contention.
220 | 
221 | \end{document}
222 | 


--------------------------------------------------------------------------------
/tex/usenix2019.sty:
--------------------------------------------------------------------------------
 1 | % usenix.sty - to be used with latex2e for USENIX.
 2 | % To use this style file, look at the template usenix_template.tex
 3 | %
 4 | % $Id: usenix.sty,v 1.2 2005/02/16 22:30:47 maniatis Exp $
 5 | %
 6 | % The following definitions are modifications of standard article.sty
 7 | % definitions, arranged to do a better job of matching the USENIX
 8 | % guidelines.
 9 | % It will automatically select two-column mode and the Times-Roman
10 | % font.
11 | 
12 | %
13 | % USENIX papers are two-column.
14 | % Times-Roman font is nice if you can get it (requires NFSS,
15 | % which is in latex2e).
16 | 
17 | \if@twocolumn\else\input twocolumn.sty\fi
18 | \usepackage{mathptmx}  % times roman, including math (where possible)
19 | 
20 | %
21 | % USENIX wants margins of: 0.75" sides, 1" bottom, and 1" top.
22 | % 0.33" gutter between columns.
23 | % Gives active areas of 7" x 9"
24 | %
25 | \setlength{\textheight}{9.0in}
26 | \setlength{\columnsep}{0.33in}
27 | \setlength{\textwidth}{7.00in}
28 | 
29 | \setlength{\topmargin}{0.0in}
30 | 
31 | \setlength{\headheight}{0.0in}
32 | 
33 | \setlength{\headsep}{0.0in}
34 | 
35 | \addtolength{\oddsidemargin}{-0.25in}
36 | \addtolength{\evensidemargin}{-0.25in}
37 | 
38 | % Usenix wants no page numbers for camera-ready papers, so that they can
39 | % number them themselves.  But submitted papers should have page numbers
40 | % for the reviewers' convenience.
41 | % 
42 | %
43 | % \pagestyle{empty}
44 | 
45 | %
46 | % Usenix titles are in 14-point bold type, with no date, and with no
47 | % change in the empty page headers.  The whole author section is 12 point
48 | % italic--- you must use {\rm } around the actual author names to get
49 | % them in roman.
50 | %
51 | \def\maketitle{\par
52 |  \begingroup
53 |    \renewcommand\thefootnote{\fnsymbol{footnote}}%
54 |    \def\@makefnmark{\hbox to\z@{$\m@th^{\@thefnmark}$\hss}}%
55 |     \long\def\@makefntext##1{\parindent 1em\noindent
56 |             \hbox to1.8em{\hss$\m@th^{\@thefnmark}$}##1}%
57 |    \if@twocolumn
58 |      \twocolumn[\@maketitle]%
59 |      \else \newpage
60 |      \global\@topnum\z@
61 |      \@maketitle \fi\@thanks
62 |  \endgroup
63 |  \setcounter{footnote}{0}%
64 |  \let\maketitle\relax
65 |  \let\@maketitle\relax
66 |  \gdef\@thanks{}\gdef\@author{}\gdef\@title{}\let\thanks\relax}
67 | 
68 | \def\@maketitle{\newpage
69 |  \vbox to 2.5in{
70 |  \vspace*{\fill}
71 |  \vskip 2em
72 |  \begin{center}%
73 |   {\Large\bf \@title \par}%
74 |   \vskip 0.375in minus 0.300in
75 |   {\large\it
76 |    \lineskip .5em
77 |    \begin{tabular}[t]{c}\@author
78 |    \end{tabular}\par}%
79 |  \end{center}%
80 |  \par
81 |  \vspace*{\fill}
82 | % \vskip 1.5em
83 |  }
84 | }
85 | 
86 | %
87 | % The abstract is preceded by a 12-pt bold centered heading
88 | \def\abstract{\begin{center}%
89 | {\large\bf \abstractname\vspace{-.5em}\vspace{\z@}}%
90 | \end{center}}
91 | \def\endabstract{}
92 | 
93 | %
94 | % Main section titles are 12-pt bold.  Others can be same or smaller.
95 | %
96 | \def\section{\@startsection {section}{1}{\z@}{-3.5ex plus-1ex minus
97 |     -.2ex}{2.3ex plus.2ex}{\reset@font\large\bf}}
98 | 


--------------------------------------------------------------------------------
/tex/usenix2019.tex:
--------------------------------------------------------------------------------
  1 | % TEMPLATE for Usenix papers, specifically to meet requirements of
  2 | %  USENIX '05
  3 | % originally a template for producing IEEE-format articles using LaTeX.
  4 | %   written by Matthew Ward, CS Department, Worcester Polytechnic Institute.
  5 | % adapted by David Beazley for his excellent SWIG paper in Proceedings,
  6 | %   Tcl 96
  7 | % turned into a smartass generic template by De Clarke, with thanks to
  8 | %   both the above pioneers
  9 | % use at your own risk.  Complaints to /dev/null.
 10 | % make it two column with no page numbering, default is 10 point
 11 | 
 12 | % Munged by Fred Douglis <douglis@research.att.com> 10/97 to separate
 13 | % the .sty file from the LaTeX source template, so that people can
 14 | % more easily include the .sty file into an existing document.  Also
 15 | % changed to more closely follow the style guidelines as represented
 16 | % by the Word sample file. 
 17 | 
 18 | % Note that since 2010, USENIX does not require endnotes. If you want
 19 | % foot of page notes, don't include the endnotes package in the 
 20 | % usepackage command, below.
 21 | 
 22 | % This version uses the latex2e styles, not the very ancient 2.09 stuff.
 23 | \documentclass[letterpaper,twocolumn,10pt]{article}
 24 | \usepackage{usenix2019,epsfig,endnotes}
 25 | \begin{document}
 26 | 
 27 | %don't want date printed
 28 | \date{}
 29 | 
 30 | %make title bold and 14 pt font (Latex default is non-bold, 16 pt)
 31 | \title{\Large \bf Wonderful : A Terrific Application and Fascinating Paper}
 32 | 
 33 | %for single author (just remove % characters)
 34 | \author{
 35 | {\rm Your N.\ Here}\\
 36 | Your Institution
 37 | \and
 38 | {\rm Second Name}\\
 39 | Second Institution
 40 | % copy the following lines to add more authors
 41 | % \and
 42 | % {\rm Name}\\
 43 | %Name Institution
 44 | } % end author
 45 | 
 46 | \maketitle
 47 | 
 48 | % Use the following at camera-ready time to suppress page numbers.
 49 | % Comment it out when you first submit the paper for review.
 50 | \thispagestyle{empty}
 51 | 
 52 | 
 53 | \subsection*{Abstract}
 54 | Your Abstract Text Goes Here.  Just a few facts.
 55 | Whet our appetites.
 56 | 
 57 | \section{Introduction}
 58 | 
 59 | A paragraph of text goes here.  Lots of text.  Plenty of interesting
 60 | text. \\
 61 | 
 62 | More fascinating text. Features\endnote{Remember to use endnotes, not footnotes!} galore, plethora of promises.\\
 63 | 
 64 | \section{This is Another Section}
 65 | 
 66 | Some embedded literal typeset code might 
 67 | look like the following :
 68 | 
 69 | {\tt \small
 70 | \begin{verbatim}
 71 | int wrap_fact(ClientData clientData,
 72 |               Tcl_Interp *interp,
 73 |               int argc, char *argv[]) {
 74 |     int result;
 75 |     int arg0;
 76 |     if (argc != 2) {
 77 |         interp->result = "wrong # args";
 78 |         return TCL_ERROR;
 79 |     }
 80 |     arg0 = atoi(argv[1]);
 81 |     result = fact(arg0);
 82 |     sprintf(interp->result,"%d",result);
 83 |     return TCL_OK;
 84 | }
 85 | \end{verbatim}
 86 | }
 87 | 
 88 | Now we're going to cite somebody.  Watch for the cite tag.
 89 | Here it comes~\cite{Chaum1981,Diffie1976}.  The tilde character (\~{})
 90 | in the source means a non-breaking space.  This way, your reference will
 91 | always be attached to the word that preceded it, instead of going to the
 92 | next line.
 93 | 
 94 | \section{This Section has SubSections}
 95 | \subsection{First SubSection}
 96 | 
 97 | Here's a typical figure reference.  The figure is centered at the
 98 | top of the column.  It's scaled.  It's explicitly placed.  You'll
 99 | have to tweak the numbers to get what you want.\\
100 | 
101 | % you can also use the wonderful epsfig package...
102 | \begin{figure}[t]
103 | \begin{center}
104 | \begin{picture}(300,150)(0,200)
105 | \put(-15,-30){\special{psfile = fig1.ps hscale = 50 vscale = 50}}
106 | \end{picture}\\
107 | \end{center}
108 | \caption{Wonderful Flowchart}
109 | \end{figure}
110 | 
111 | This text came after the figure, so we'll casually refer to Figure 1
112 | as we go on our merry way.
113 | 
114 | \subsection{New Subsection}
115 | 
116 | It can get tricky typesetting Tcl and C code in LaTeX because they share
117 | a lot of mystical feelings about certain magic characters.  You
118 | will have to do a lot of escaping to typeset curly braces and percent
119 | signs, for example, like this:
120 | ``The {\tt \%module} directive
121 | sets the name of the initialization function.  This is optional, but is
122 | recommended if building a Tcl 7.5 module.
123 | Everything inside the {\tt \%\{, \%\}}
124 | block is copied directly into the output, allowing the inclusion of
125 | header files and additional C code.'' \\
126 | 
127 | Sometimes you want to really call attention to a piece of text.  You
128 | can center it in the column like this:
129 | \begin{center}
130 | {\tt \_1008e614\_Vector\_p}
131 | \end{center}
132 | and people will really notice it.\\
133 | 
134 | \noindent
135 | The noindent at the start of this paragraph makes it clear that it's
136 | a continuation of the preceding text, not a new para in its own right.
137 | 
138 | 
139 | Now this is an ingenious way to get a forced space.
140 | {\tt Real~$*$} and {\tt double~$*$} are equivalent. 
141 | 
142 | Now here is another way to call attention to a line of code, but instead
143 | of centering it, we noindent and bold it.\\
144 | 
145 | \noindent
146 | {\bf \tt size\_t : fread ptr size nobj stream } \\
147 | 
148 | And here we have made an indented para like a definition tag (dt)
149 | in HTML.  You don't need a surrounding list macro pair.
150 | \begin{itemize}
151 | \item[]  {\tt fread} reads from {\tt stream} into the array {\tt ptr} at
152 | most {\tt nobj} objects of size {\tt size}.   {\tt fread} returns
153 | the number of objects read. 
154 | \end{itemize}
155 | This concludes the definitions tag.
156 | 
157 | \subsection{How to Build Your Paper}
158 | 
159 | You have to run {\tt latex} once to prepare your references for
160 | munging.  Then run {\tt bibtex} to build your bibliography metadata.
161 | Then run {\tt latex} twice to ensure all references have been resolved.
162 | If your source file is called {\tt usenixTemplate.tex} and your {\tt
163 |   bibtex} file is called {\tt usenixTemplate.bib}, here's what you do:
164 | {\tt \small
165 | \begin{verbatim}
166 | latex usenixTemplate
167 | bibtex usenixTemplate
168 | latex usenixTemplate
169 | latex usenixTemplate
170 | \end{verbatim}
171 | }
172 | 
173 | 
174 | \subsection{Last SubSection}
175 | 
176 | Well, it's getting boring isn't it.  This is the last subsection
177 | before we wrap it up.
178 | 
179 | \section{Acknowledgments}
180 | 
181 | A polite author always includes acknowledgments.  Thank everyone,
182 | especially those who funded the work. 
183 | 
184 | \section{Availability}
185 | 
186 | It's great when this section says that MyWonderfulApp is free software, 
187 | available via anonymous FTP from
188 | 
189 | \begin{center}
190 | {\tt ftp.site.dom/pub/myname/Wonderful}\\
191 | \end{center}
192 | 
193 | Also, it's even greater when you can write that information is also 
194 | available on the Wonderful homepage at 
195 | 
196 | \begin{center}
197 | {\tt http://www.site.dom/\~{}myname/SWIG}
198 | \end{center}
199 | 
200 | Now we get serious and fill in those references.  Remember you will
201 | have to run latex twice on the document in order to resolve those
202 | cite tags you met earlier.  This is where they get resolved.
203 | We've preserved some real ones in addition to the template-speak.
204 | After the bibliography you are DONE.
205 | 
206 | {\footnotesize \bibliographystyle{acm}
207 | \bibliography{../common/bibliography}}
208 | 
209 | 
210 | \theendnotes
211 | 
212 | \end{document}
213 | 
214 | 
215 | 
216 | 
217 | 
218 | 
219 | 
220 | 


--------------------------------------------------------------------------------