├── setup-pmfs.sh ├── remount-pmfs.sh ├── intel-setup-pmfs.sh ├── intel-remount-pmfs.sh ├── Makefile ├── xip.h ├── Kconfig ├── pmfs_test.c ├── pmfs_stats.c ├── wprotect.c ├── README.md ├── symlink.c ├── journal.h ├── wprotect.h ├── ioctl.c ├── balloc.c ├── dir.c ├── file.c ├── pmfs_def.h ├── bbuild.c ├── xip.c ├── pmfs.h ├── namei.c ├── journal.c └── super.c /setup-pmfs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | umount /mnt/ramdisk 4 | umount /mnt/scratch 5 | rmmod pmfs 6 | insmod pmfs.ko measure_timing=0 7 | 8 | sleep 1 9 | 10 | mount -t pmfs -o init /dev/pmem0 /mnt/ramdisk 11 | mount -t pmfs -o init /dev/pmem1 /mnt/scratch 12 | 13 | -------------------------------------------------------------------------------- /remount-pmfs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | umount /mnt/ramdisk 4 | rmmod pmfs 5 | insmod pmfs.ko measure_timing=0 6 | 7 | sleep 1 8 | 9 | mount -t pmfs /dev/pmem0 /mnt/ramdisk 10 | 11 | #cp test1 /mnt/ramdisk/ 12 | #dd if=/dev/zero of=/mnt/ramdisk/test1 bs=1M count=1024 oflag=direct 13 | -------------------------------------------------------------------------------- /intel-setup-pmfs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | umount /mnt/ramdisk 4 | rmmod pmfs 5 | rmmod pmem 6 | insmod pmfs.ko measure_timing=0 7 | 8 | sleep 1 9 | 10 | mount -t pmfs -o physaddr=0x10000000000,init=64G none /mnt/ramdisk 11 | 12 | #cp test1 /mnt/ramdisk/ 13 | #dd if=/dev/zero of=/mnt/ramdisk/test1 bs=1M count=1024 oflag=direct 14 | -------------------------------------------------------------------------------- /intel-remount-pmfs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | umount /mnt/ramdisk 4 | rmmod pmfs 5 | insmod pmfs.ko measure_timing=0 6 | 7 | sleep 1 8 | 9 | #mount -t pmfs -o physaddr=0x100000000 none 
/mnt/ramdisk 10 | mount -t pmfs -o physaddr=0x10000000000 none /mnt/ramdisk 11 | 12 | #cp test1 /mnt/ramdisk/ 13 | #dd if=/dev/zero of=/mnt/ramdisk/test1 bs=1M count=1024 oflag=direct 14 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Makefile for the linux pmfs-filesystem routines. 3 | # 4 | 5 | obj-m += pmfs.o 6 | 7 | pmfs-y := bbuild.o balloc.o dir.o file.o inode.o namei.o super.o symlink.o ioctl.o pmfs_stats.o journal.o xip.o wprotect.o 8 | 9 | all: 10 | make -C /lib/modules/$(shell uname -r)/build M=`pwd` 11 | 12 | clean: 13 | make -C /lib/modules/$(shell uname -r)/build M=`pwd` clean 14 | -------------------------------------------------------------------------------- /xip.h: -------------------------------------------------------------------------------- 1 | /* 2 | * BRIEF DESCRIPTION 3 | * 4 | * XIP operations. 5 | * 6 | * Copyright 2012-2013 Intel Corporation 7 | * Copyright 2009-2011 Marco Stornelli 8 | * This file is licensed under the terms of the GNU General Public 9 | * License version 2. This program is licensed "as is" without any 10 | * warranty of any kind, whether express or implied. 
11 | */ 12 | 13 | int pmfs_get_xip_mem(struct address_space *, pgoff_t, int, void **, 14 | unsigned long *); 15 | ssize_t pmfs_xip_file_read(struct file *filp, char __user *buf, size_t len, 16 | loff_t *ppos); 17 | ssize_t pmfs_xip_file_write(struct file *filp, const char __user *buf, 18 | size_t len, loff_t *ppos); 19 | int pmfs_xip_file_mmap(struct file *file, struct vm_area_struct *vma); 20 | 21 | static inline int pmfs_use_xip(struct super_block *sb) 22 | { 23 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 24 | 25 | return sbi->s_mount_opt & PMFS_MOUNT_XIP; 26 | } 27 | 28 | #define mapping_is_xip(map) (map->a_ops->get_xip_mem) 29 | -------------------------------------------------------------------------------- /Kconfig: -------------------------------------------------------------------------------- 1 | config PMFS 2 | tristate "Persistent and Protected PM file system support" 3 | depends on HAS_IOMEM 4 | select CRC16 5 | help 6 | If your system has a block of fast (comparable in access speed to 7 | system memory) and non-volatile byte-addressable memory and you wish to 8 | mount a light-weight, full-featured, and space-efficient filesystem over 9 | it, say Y here, and read . 10 | 11 | To compile this as a module, choose M here: the module will be 12 | called pmfs. 13 | 14 | config PMFS_XIP 15 | bool "Execute-in-place in PMFS" 16 | depends on PMFS && BLOCK 17 | help 18 | Say Y here to enable XIP feature of PMFS. 19 | 20 | config PMFS_WRITE_PROTECT 21 | bool "PMFS write protection" 22 | depends on PMFS && MMU && HAVE_SET_MEMORY_RO 23 | default y 24 | help 25 | Say Y here to enable the write protect feature of PMFS. 26 | 27 | config PMFS_TEST 28 | boolean 29 | depends on PMFS 30 | 31 | config PMFS_TEST_MODULE 32 | tristate "PMFS Test" 33 | depends on PMFS && PMFS_WRITE_PROTECT && m 34 | select PMFS_TEST 35 | help 36 | Say Y here to build a simple module to test the protection of 37 | PMFS. The module will be called pmfs_test. 
38 | -------------------------------------------------------------------------------- /pmfs_test.c: -------------------------------------------------------------------------------- 1 | /* 2 | * BRIEF DESCRIPTION 3 | * 4 | * pmfs test module. 5 | * 6 | * Copyright 2012-2013 Intel Corporation 7 | * Copyright 2009-2011 Marco Stornelli 8 | * Copyright 2003 Sony Corporation 9 | * Copyright 2003 Matsushita Electric Industrial Co., Ltd. 10 | * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam 11 | * This file is licensed under the terms of the GNU General Public 12 | * License version 2. This program is licensed "as is" without any 13 | * warranty of any kind, whether express or implied. 14 | */ 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include "pmfs.h" 20 | 21 | int __init test_pmfs_write(void) 22 | { 23 | struct pmfs_super_block *psb; 24 | 25 | psb = get_pmfs_super(); 26 | if (!psb) { 27 | printk(KERN_ERR 28 | "%s: PMFS super block not found (not mounted?)\n", 29 | __func__); 30 | return 1; 31 | } 32 | 33 | /* 34 | * Attempt an unprotected clear of checksum information in the 35 | * superblock, this should cause a kernel page protection fault. 
36 | */ 37 | printk("%s: writing to kernel VA %p\n", __func__, psb); 38 | psb->s_sum = 0; 39 | 40 | return 0; 41 | } 42 | 43 | void test_pmfs_write_cleanup(void) 44 | { 45 | } 46 | 47 | /* Module information */ 48 | MODULE_LICENSE("GPL"); 49 | module_init(test_pmfs_write); 50 | module_exit(test_pmfs_write_cleanup); 51 | -------------------------------------------------------------------------------- /pmfs_stats.c: -------------------------------------------------------------------------------- 1 | #include "pmfs.h" 2 | 3 | const char *Timingstring[TIMING_NUM] = 4 | { 5 | "create", 6 | "unlink", 7 | "readdir", 8 | "xip_read", 9 | "xip_write", 10 | "xip_write_fast", 11 | "internal_write", 12 | "memcpy_read", 13 | "memcpy_write", 14 | "alloc_blocks", 15 | "new_trans", 16 | "add_logentry", 17 | "commit_trans", 18 | "mmap_fault", 19 | "fsync", 20 | "free_tree", 21 | "evict_inode", 22 | "recovery", 23 | }; 24 | 25 | unsigned long long Timingstats[TIMING_NUM]; 26 | u64 Countstats[TIMING_NUM]; 27 | 28 | atomic64_t fsync_pages = ATOMIC_INIT(0); 29 | 30 | void pmfs_print_IO_stats(void) 31 | { 32 | printk("=========== PMFS I/O stats ===========\n"); 33 | printk("Fsync %ld pages\n", atomic64_read(&fsync_pages)); 34 | } 35 | 36 | void pmfs_print_timing_stats(void) 37 | { 38 | int i; 39 | 40 | printk("======== PMFS kernel timing stats ========\n"); 41 | for (i = 0; i < TIMING_NUM; i++) { 42 | if (measure_timing || Timingstats[i]) { 43 | printk("%s: count %llu, timing %llu, average %llu\n", 44 | Timingstring[i], 45 | Countstats[i], 46 | Timingstats[i], 47 | Countstats[i] ? 
48 | Timingstats[i] / Countstats[i] : 0); 49 | } else { 50 | printk("%s: count %llu\n", 51 | Timingstring[i], 52 | Countstats[i]); 53 | } 54 | } 55 | 56 | pmfs_print_IO_stats(); 57 | } 58 | 59 | void pmfs_clear_stats(void) 60 | { 61 | int i; 62 | 63 | printk("======== Clear PMFS kernel timing stats ========\n"); 64 | for (i = 0; i < TIMING_NUM; i++) { 65 | Countstats[i] = 0; 66 | Timingstats[i] = 0; 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /wprotect.c: -------------------------------------------------------------------------------- 1 | /* 2 | * BRIEF DESCRIPTION 3 | * 4 | * Write protection for the filesystem pages. 5 | * 6 | * Copyright 2012-2013 Intel Corporation 7 | * Copyright 2009-2011 Marco Stornelli 8 | * Copyright 2003 Sony Corporation 9 | * Copyright 2003 Matsushita Electric Industrial Co., Ltd. 10 | * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam 11 | * This file is licensed under the terms of the GNU General Public 12 | * License version 2. This program is licensed "as is" without any 13 | * warranty of any kind, whether express or implied. 14 | */ 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include "pmfs.h" 21 | 22 | static inline void wprotect_disable(void) 23 | { 24 | unsigned long cr0_val; 25 | 26 | cr0_val = read_cr0(); 27 | cr0_val &= (~X86_CR0_WP); 28 | write_cr0(cr0_val); 29 | } 30 | 31 | static inline void wprotect_enable(void) 32 | { 33 | unsigned long cr0_val; 34 | 35 | cr0_val = read_cr0(); 36 | cr0_val |= X86_CR0_WP; 37 | write_cr0(cr0_val); 38 | } 39 | 40 | /* FIXME: Assumes that we are always called in the right order. 
41 | * pmfs_writeable(vaddr, size, 1); 42 | * pmfs_writeable(vaddr, size, 0); 43 | */ 44 | int pmfs_writeable(void *vaddr, unsigned long size, int rw) 45 | { 46 | static unsigned long flags; 47 | if (rw) { 48 | local_irq_save(flags); 49 | wprotect_disable(); 50 | } else { 51 | wprotect_enable(); 52 | local_irq_restore(flags); 53 | } 54 | return 0; 55 | } 56 | 57 | int pmfs_xip_mem_protect(struct super_block *sb, void *vaddr, 58 | unsigned long size, int rw) 59 | { 60 | if (!pmfs_is_wprotected(sb)) 61 | return 0; 62 | return pmfs_writeable(vaddr, size, rw); 63 | } 64 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Porting PMFS to the latest Linux kernel 2 | 3 | ## Introduction 4 | 5 | PMFS is a file system for persistent memory, developed by Intel. 6 | For more details about PMFS, please check the git repository: 7 | 8 | https://github.com/linux-pmfs/pmfs 9 | 10 | This project ports PMFS to the latest Linux kernel so developers can compare PMFS to other file systems on the new kernel. 11 | 12 | ## Building PMFS 13 | The master branch works on the 4.15 version of x86-64 Linux kernel. 14 | 15 | To build PMFS, simply run a 16 | 17 | ~~~ 18 | #make 19 | ~~~ 20 | 21 | command. 22 | 23 | ## Running PMFS 24 | PMFS runs on a physically contiguous memory region that is not used by the Linux kernel, and relies on the kernel NVDIMM support. 25 | 26 | To run PMFS, first build up your kernel with NVDIMM support enabled (`CONFIG_BLK_DEV_PMEM`), and then you can 27 | reserve the memory space by booting the kernel with `memmap` command line option. 28 | 29 | For instance, adding `memmap=16G!8G` to the kernel boot parameters will reserve 16GB memory starting from 8GB address, and the kernel will create a `pmem0` block device under the `/dev` directory. 
30 | 31 | After the OS has booted, you can initialize a PMFS instance with the following commands: 32 | 33 | 34 | ~~~ 35 | #insmod pmfs.ko 36 | #mount -t pmfs -o init /dev/pmem0 /mnt/ramdisk 37 | ~~~ 38 | 39 | The above commands create a PMFS instance on pmem0 device, and mount on `/mnt/ramdisk`. 40 | 41 | To recover an existing PMFS instance, mount PMFS without the init option, for example: 42 | 43 | ~~~ 44 | #mount -t pmfs /dev/pmem0 /mnt/ramdisk 45 | ~~~ 46 | 47 | There are two scripts provided in the source code, `setup-pmfs.sh` and `remount-pmfs.sh` to help setup PMFS. 48 | 49 | ## Current limitations 50 | 51 | * PMFS only works on x86-64 kernels. 52 | * PMFS does not currently support extended attributes or ACL. 53 | * PMFS requires the underlying block device to support DAX (Direct Access) feature. 54 | * This project cuts some features of the original PMFS, such as memory protection and huge mmap support. If you need these features, please turn to the original PMFS. 55 | -------------------------------------------------------------------------------- /symlink.c: -------------------------------------------------------------------------------- 1 | /* 2 | * BRIEF DESCRIPTION 3 | * 4 | * Symlink operations 5 | * 6 | * Copyright 2012-2013 Intel Corporation 7 | * Copyright 2009-2011 Marco Stornelli 8 | * Copyright 2003 Sony Corporation 9 | * Copyright 2003 Matsushita Electric Industrial Co., Ltd. 10 | * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam 11 | * This file is licensed under the terms of the GNU General Public 12 | * License version 2. This program is licensed "as is" without any 13 | * warranty of any kind, whether express or implied. 
14 | */ 15 | 16 | #include 17 | #include 18 | #include "pmfs.h" 19 | 20 | int pmfs_block_symlink(struct inode *inode, const char *symname, int len) 21 | { 22 | struct super_block *sb = inode->i_sb; 23 | u64 block; 24 | char *blockp; 25 | int err; 26 | 27 | err = pmfs_alloc_blocks(NULL, inode, 0, 1, false); 28 | if (err) 29 | return err; 30 | 31 | block = pmfs_find_data_block(inode, 0); 32 | blockp = pmfs_get_block(sb, block); 33 | 34 | pmfs_memunlock_block(sb, blockp); 35 | memcpy(blockp, symname, len); 36 | blockp[len] = '\0'; 37 | pmfs_memlock_block(sb, blockp); 38 | pmfs_flush_buffer(blockp, len+1, false); 39 | return 0; 40 | } 41 | 42 | /* FIXME: Temporary workaround */ 43 | static int pmfs_readlink_copy(char __user *buffer, int buflen, const char *link) 44 | { 45 | int len = PTR_ERR(link); 46 | if (IS_ERR(link)) 47 | goto out; 48 | 49 | len = strlen(link); 50 | if (len > (unsigned) buflen) 51 | len = buflen; 52 | if (copy_to_user(buffer, link, len)) 53 | len = -EFAULT; 54 | out: 55 | return len; 56 | } 57 | 58 | static int pmfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) 59 | { 60 | struct inode *inode = dentry->d_inode; 61 | struct super_block *sb = inode->i_sb; 62 | u64 block; 63 | char *blockp; 64 | 65 | block = pmfs_find_data_block(inode, 0); 66 | blockp = pmfs_get_block(sb, block); 67 | return pmfs_readlink_copy(buffer, buflen, blockp); 68 | } 69 | 70 | static const char *pmfs_get_link(struct dentry *dentry, struct inode *inode, 71 | struct delayed_call *done) 72 | { 73 | struct super_block *sb = inode->i_sb; 74 | off_t block; 75 | char *blockp; 76 | 77 | block = pmfs_find_data_block(inode, 0); 78 | blockp = pmfs_get_block(sb, block); 79 | return blockp; 80 | } 81 | 82 | const struct inode_operations pmfs_symlink_inode_operations = { 83 | .readlink = pmfs_readlink, 84 | .get_link = pmfs_get_link, 85 | .setattr = pmfs_notify_change, 86 | }; 87 | -------------------------------------------------------------------------------- 
/journal.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Persistent Memory File System 3 | * Copyright (c) 2012-2013, Intel Corporation. 4 | * 5 | * This program is free software; you can redistribute it and/or modify it 6 | * under the terms and conditions of the GNU General Public License, 7 | * version 2, as published by the Free Software Foundation. 8 | * 9 | * This program is distributed in the hope it will be useful, but WITHOUT 10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12 | * more details. 13 | * 14 | * You should have received a copy of the GNU General Public License along with 15 | * this program; if not, write to the Free Software Foundation, Inc., 16 | * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 17 | */ 18 | #ifndef __PMFS_JOURNAL_H__ 19 | #define __PMFS_JOURNAL_H__ 20 | #include 21 | 22 | /* default pmfs journal size 4MB */ 23 | #define PMFS_DEFAULT_JOURNAL_SIZE (4 << 20) 24 | /* minimum pmfs journal size 64KB */ 25 | #define PMFS_MINIMUM_JOURNAL_SIZE (1 << 16) 26 | 27 | #define CACHELINE_SIZE (64) 28 | #define CLINE_SHIFT (6) 29 | #define CACHELINE_MASK (~(CACHELINE_SIZE - 1)) 30 | #define CACHELINE_ALIGN(addr) (((addr)+CACHELINE_SIZE-1) & CACHELINE_MASK) 31 | 32 | #define LOGENTRY_SIZE CACHELINE_SIZE 33 | #define LESIZE_SHIFT CLINE_SHIFT 34 | 35 | #define MAX_INODE_LENTRIES (2) 36 | #define MAX_SB_LENTRIES (2) 37 | /* 1 le for dir entry and 1 le for potentially allocating a new dir block */ 38 | #define MAX_DIRENTRY_LENTRIES (2) 39 | /* 2 le for adding or removing the inode from truncate list. 
used to log 40 | * potential changes to inode table's i_next_truncate and i_sum */ 41 | #define MAX_TRUNCATE_LENTRIES (2) 42 | #define MAX_DATA_PER_LENTRY 48 43 | /* blocksize * max_btree_height */ 44 | #define MAX_METABLOCK_LENTRIES \ 45 | ((PMFS_DEF_BLOCK_SIZE_4K * 3)/MAX_DATA_PER_LENTRY) 46 | 47 | #define MAX_PTRS_PER_LENTRY (MAX_DATA_PER_LENTRY / sizeof(u64)) 48 | 49 | #define TRANS_RUNNING 1 50 | #define TRANS_COMMITTED 2 51 | #define TRANS_ABORTED 3 52 | 53 | #define LE_DATA 0 54 | #define LE_START 1 55 | #define LE_COMMIT 2 56 | #define LE_ABORT 4 57 | 58 | #define MAX_GEN_ID ((uint16_t)-1) 59 | 60 | /* persistent data structure to describe a single log-entry */ 61 | /* every log entry is max CACHELINE_SIZE bytes in size */ 62 | typedef struct { 63 | __le64 addr_offset; 64 | __le32 transaction_id; 65 | __le16 gen_id; 66 | u8 type; /* normal, commit, or abort */ 67 | u8 size; 68 | char data[48]; 69 | } pmfs_logentry_t; 70 | 71 | /* volatile data structure to describe a transaction */ 72 | typedef struct pmfs_transaction { 73 | u32 transaction_id; 74 | u16 num_entries; 75 | u16 num_used; 76 | u16 gen_id; 77 | u16 status; 78 | pmfs_journal_t *t_journal; 79 | pmfs_logentry_t *start_addr; 80 | struct pmfs_transaction *parent; 81 | } pmfs_transaction_t; 82 | 83 | extern pmfs_transaction_t *pmfs_alloc_transaction(void); 84 | extern void pmfs_free_transaction(pmfs_transaction_t *trans); 85 | 86 | extern int pmfs_journal_soft_init(struct super_block *sb); 87 | extern int pmfs_journal_hard_init(struct super_block *sb, 88 | uint64_t base, uint32_t size); 89 | extern int pmfs_journal_uninit(struct super_block *sb); 90 | extern pmfs_transaction_t *pmfs_new_transaction(struct super_block *sb, 91 | int nclines); 92 | extern pmfs_transaction_t *pmfs_current_transaction(void); 93 | extern int pmfs_add_logentry(struct super_block *sb, 94 | pmfs_transaction_t *trans, void *addr, uint16_t size, u8 type); 95 | extern int pmfs_commit_transaction(struct super_block *sb, 96 | 
pmfs_transaction_t *trans); 97 | extern int pmfs_abort_transaction(struct super_block *sb, 98 | pmfs_transaction_t *trans); 99 | extern int pmfs_recover_journal(struct super_block *sb); 100 | 101 | #endif /* __PMFS_JOURNAL_H__ */ 102 | -------------------------------------------------------------------------------- /wprotect.h: -------------------------------------------------------------------------------- 1 | /* 2 | * BRIEF DESCRIPTION 3 | * 4 | * Memory protection definitions for the PMFS filesystem. 5 | * 6 | * Copyright 2012-2013 Intel Corporation 7 | * Copyright 2010-2011 Marco Stornelli 8 | * This file is licensed under the terms of the GNU General Public 9 | * License version 2. This program is licensed "as is" without any 10 | * warranty of any kind, whether express or implied. 11 | */ 12 | 13 | #ifndef __WPROTECT_H 14 | #define __WPROTECT_H 15 | 16 | #include 17 | #include "pmfs_def.h" 18 | 19 | /* pmfs_memunlock_super() before calling! */ 20 | static inline void pmfs_sync_super(struct pmfs_super_block *ps) 21 | { 22 | u16 crc = 0; 23 | 24 | ps->s_wtime = cpu_to_le32(get_seconds()); 25 | ps->s_sum = 0; 26 | crc = crc16(~0, (__u8 *)ps + sizeof(__le16), 27 | PMFS_SB_STATIC_SIZE(ps) - sizeof(__le16)); 28 | ps->s_sum = cpu_to_le16(crc); 29 | /* Keep sync redundant super block */ 30 | memcpy((void *)ps + PMFS_SB_SIZE, (void *)ps, 31 | sizeof(struct pmfs_super_block)); 32 | } 33 | 34 | #if 0 35 | /* pmfs_memunlock_inode() before calling! 
*/ 36 | static inline void pmfs_sync_inode(struct pmfs_inode *pi) 37 | { 38 | u16 crc = 0; 39 | 40 | pi->i_sum = 0; 41 | crc = crc16(~0, (__u8 *)pi + sizeof(__le16), PMFS_INODE_SIZE - 42 | sizeof(__le16)); 43 | pi->i_sum = cpu_to_le16(crc); 44 | } 45 | #endif 46 | 47 | extern int pmfs_writeable(void *vaddr, unsigned long size, int rw); 48 | extern int pmfs_xip_mem_protect(struct super_block *sb, 49 | void *vaddr, unsigned long size, int rw); 50 | 51 | static inline int pmfs_is_protected(struct super_block *sb) 52 | { 53 | struct pmfs_sb_info *sbi = (struct pmfs_sb_info *)sb->s_fs_info; 54 | 55 | return sbi->s_mount_opt & PMFS_MOUNT_PROTECT; 56 | } 57 | 58 | static inline int pmfs_is_wprotected(struct super_block *sb) 59 | { 60 | return pmfs_is_protected(sb); 61 | } 62 | 63 | static inline void 64 | __pmfs_memunlock_range(void *p, unsigned long len) 65 | { 66 | /* 67 | * NOTE: Ideally we should lock all the kernel to be memory safe 68 | * and avoid to write in the protected memory, 69 | * obviously it's not possible, so we only serialize 70 | * the operations at fs level. We can't disable the interrupts 71 | * because we could have a deadlock in this path. 
72 | */ 73 | pmfs_writeable(p, len, 1); 74 | } 75 | 76 | static inline void 77 | __pmfs_memlock_range(void *p, unsigned long len) 78 | { 79 | pmfs_writeable(p, len, 0); 80 | } 81 | 82 | static inline void pmfs_memunlock_range(struct super_block *sb, void *p, 83 | unsigned long len) 84 | { 85 | if (pmfs_is_protected(sb)) 86 | __pmfs_memunlock_range(p, len); 87 | } 88 | 89 | static inline void pmfs_memlock_range(struct super_block *sb, void *p, 90 | unsigned long len) 91 | { 92 | if (pmfs_is_protected(sb)) 93 | __pmfs_memlock_range(p, len); 94 | } 95 | 96 | static inline void pmfs_memunlock_super(struct super_block *sb, 97 | struct pmfs_super_block *ps) 98 | { 99 | if (pmfs_is_protected(sb)) 100 | __pmfs_memunlock_range(ps, PMFS_SB_SIZE); 101 | } 102 | 103 | static inline void pmfs_memlock_super(struct super_block *sb, 104 | struct pmfs_super_block *ps) 105 | { 106 | pmfs_sync_super(ps); 107 | if (pmfs_is_protected(sb)) 108 | __pmfs_memlock_range(ps, PMFS_SB_SIZE); 109 | } 110 | 111 | static inline void pmfs_memunlock_inode(struct super_block *sb, 112 | struct pmfs_inode *pi) 113 | { 114 | if (pmfs_is_protected(sb)) 115 | __pmfs_memunlock_range(pi, PMFS_SB_SIZE); 116 | } 117 | 118 | static inline void pmfs_memlock_inode(struct super_block *sb, 119 | struct pmfs_inode *pi) 120 | { 121 | /* pmfs_sync_inode(pi); */ 122 | if (pmfs_is_protected(sb)) 123 | __pmfs_memlock_range(pi, PMFS_SB_SIZE); 124 | } 125 | 126 | static inline void pmfs_memunlock_block(struct super_block *sb, void *bp) 127 | { 128 | if (pmfs_is_protected(sb)) 129 | __pmfs_memunlock_range(bp, sb->s_blocksize); 130 | } 131 | 132 | static inline void pmfs_memlock_block(struct super_block *sb, void *bp) 133 | { 134 | if (pmfs_is_protected(sb)) 135 | __pmfs_memlock_range(bp, sb->s_blocksize); 136 | } 137 | 138 | #endif 139 | -------------------------------------------------------------------------------- /ioctl.c: -------------------------------------------------------------------------------- 1 | /* 2 | * 
BRIEF DESCRIPTION 3 | * 4 | * Ioctl operations. 5 | * 6 | * Copyright 2012-2013 Intel Corporation 7 | * Copyright 2010-2011 Marco Stornelli 8 | * 9 | * This file is licensed under the terms of the GNU General Public 10 | * License version 2. This program is licensed "as is" without any 11 | * warranty of any kind, whether express or implied. 12 | */ 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include "pmfs.h" 20 | 21 | #define FS_PMFS_FSYNC 0xBCD0000E 22 | 23 | struct sync_range 24 | { 25 | off_t offset; 26 | size_t length; 27 | }; 28 | 29 | long pmfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 30 | { 31 | struct address_space *mapping = filp->f_mapping; 32 | struct inode *inode = mapping->host; 33 | struct pmfs_inode *pi; 34 | struct super_block *sb = inode->i_sb; 35 | unsigned int flags; 36 | int ret; 37 | pmfs_transaction_t *trans; 38 | 39 | pi = pmfs_get_inode(sb, inode->i_ino); 40 | if (!pi) 41 | return -EACCES; 42 | 43 | switch (cmd) { 44 | case FS_IOC_GETFLAGS: 45 | flags = le32_to_cpu(pi->i_flags) & PMFS_FL_USER_VISIBLE; 46 | return put_user(flags, (int __user *)arg); 47 | case FS_IOC_SETFLAGS: { 48 | unsigned int oldflags; 49 | 50 | ret = mnt_want_write_file(filp); 51 | if (ret) 52 | return ret; 53 | 54 | if (!inode_owner_or_capable(inode)) { 55 | ret = -EPERM; 56 | goto flags_out; 57 | } 58 | 59 | if (get_user(flags, (int __user *)arg)) { 60 | ret = -EFAULT; 61 | goto flags_out; 62 | } 63 | 64 | inode_lock(inode); 65 | oldflags = le32_to_cpu(pi->i_flags); 66 | 67 | if ((flags ^ oldflags) & 68 | (FS_APPEND_FL | FS_IMMUTABLE_FL)) { 69 | if (!capable(CAP_LINUX_IMMUTABLE)) { 70 | inode_unlock(inode); 71 | ret = -EPERM; 72 | goto flags_out; 73 | } 74 | } 75 | 76 | if (!S_ISDIR(inode->i_mode)) 77 | flags &= ~FS_DIRSYNC_FL; 78 | 79 | flags = flags & FS_FL_USER_MODIFIABLE; 80 | flags |= oldflags & ~FS_FL_USER_MODIFIABLE; 81 | inode->i_ctime = current_time(inode); 82 | trans = pmfs_new_transaction(sb, 
MAX_INODE_LENTRIES); 83 | if (IS_ERR(trans)) { 84 | ret = PTR_ERR(trans); 85 | goto out; 86 | } 87 | pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA); 88 | pmfs_memunlock_inode(sb, pi); 89 | pi->i_flags = cpu_to_le32(flags); 90 | pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); 91 | pmfs_set_inode_flags(inode, pi); 92 | pmfs_memlock_inode(sb, pi); 93 | pmfs_commit_transaction(sb, trans); 94 | out: 95 | inode_unlock(inode); 96 | flags_out: 97 | mnt_drop_write_file(filp); 98 | return ret; 99 | } 100 | case FS_IOC_GETVERSION: 101 | return put_user(inode->i_generation, (int __user *)arg); 102 | case FS_IOC_SETVERSION: { 103 | __u32 generation; 104 | if (!inode_owner_or_capable(inode)) 105 | return -EPERM; 106 | ret = mnt_want_write_file(filp); 107 | if (ret) 108 | return ret; 109 | if (get_user(generation, (int __user *)arg)) { 110 | ret = -EFAULT; 111 | goto setversion_out; 112 | } 113 | inode_lock(inode); 114 | trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES); 115 | if (IS_ERR(trans)) { 116 | ret = PTR_ERR(trans); 117 | goto out; 118 | } 119 | pmfs_add_logentry(sb, trans, pi, sizeof(*pi), LE_DATA); 120 | inode->i_ctime = current_time(inode); 121 | inode->i_generation = generation; 122 | pmfs_memunlock_inode(sb, pi); 123 | pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); 124 | pi->i_generation = cpu_to_le32(inode->i_generation); 125 | pmfs_memlock_inode(sb, pi); 126 | pmfs_commit_transaction(sb, trans); 127 | inode_unlock(inode); 128 | setversion_out: 129 | mnt_drop_write_file(filp); 130 | return ret; 131 | } 132 | case FS_PMFS_FSYNC: { 133 | struct sync_range packet; 134 | copy_from_user(&packet, (void *)arg, sizeof(struct sync_range)); 135 | pmfs_fsync(filp, packet.offset, packet.offset + packet.length, 1); 136 | return 0; 137 | } 138 | case PMFS_PRINT_TIMING: { 139 | pmfs_print_timing_stats(); 140 | return 0; 141 | } 142 | case PMFS_CLEAR_STATS: { 143 | pmfs_clear_stats(); 144 | return 0; 145 | } 146 | default: 147 | return -ENOTTY; 148 | } 149 
| } 150 | 151 | #ifdef CONFIG_COMPAT 152 | long pmfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 153 | { 154 | switch (cmd) { 155 | case FS_IOC32_GETFLAGS: 156 | cmd = FS_IOC_GETFLAGS; 157 | break; 158 | case FS_IOC32_SETFLAGS: 159 | cmd = FS_IOC_SETFLAGS; 160 | break; 161 | case FS_IOC32_GETVERSION: 162 | cmd = FS_IOC_GETVERSION; 163 | break; 164 | case FS_IOC32_SETVERSION: 165 | cmd = FS_IOC_SETVERSION; 166 | break; 167 | default: 168 | return -ENOIOCTLCMD; 169 | } 170 | return pmfs_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); 171 | } 172 | #endif 173 | -------------------------------------------------------------------------------- /balloc.c: -------------------------------------------------------------------------------- 1 | /* 2 | * PMFS emulated persistence. This file contains code to 3 | * handle data blocks of various sizes efficiently. 4 | * 5 | * Persistent Memory File System 6 | * Copyright (c) 2012-2013, Intel Corporation. 7 | * 8 | * This program is free software; you can redistribute it and/or modify it 9 | * under the terms and conditions of the GNU General Public License, 10 | * version 2, as published by the Free Software Foundation. 11 | * 12 | * This program is distributed in the hope it will be useful, but WITHOUT 13 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 15 | * more details. 16 | * 17 | * You should have received a copy of the GNU General Public License along with 18 | * this program; if not, write to the Free Software Foundation, Inc., 19 | * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
20 | */ 21 | 22 | #include 23 | #include 24 | #include "pmfs.h" 25 | 26 | void pmfs_init_blockmap(struct super_block *sb, unsigned long init_used_size) 27 | { 28 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 29 | unsigned long num_used_block; 30 | struct pmfs_blocknode *blknode; 31 | 32 | num_used_block = (init_used_size + sb->s_blocksize - 1) >> 33 | sb->s_blocksize_bits; 34 | 35 | blknode = pmfs_alloc_blocknode(sb); 36 | if (blknode == NULL) 37 | PMFS_ASSERT(0); 38 | blknode->block_low = sbi->block_start; 39 | blknode->block_high = sbi->block_start + num_used_block - 1; 40 | sbi->num_free_blocks -= num_used_block; 41 | list_add(&blknode->link, &sbi->block_inuse_head); 42 | } 43 | 44 | static struct pmfs_blocknode *pmfs_next_blocknode(struct pmfs_blocknode *i, 45 | struct list_head *head) 46 | { 47 | if (list_is_last(&i->link, head)) 48 | return NULL; 49 | return list_first_entry(&i->link, typeof(*i), link); 50 | } 51 | 52 | /* Caller must hold the super_block lock. If start_hint is provided, it is 53 | * only valid until the caller releases the super_block lock. 
*/ 54 | void __pmfs_free_block(struct super_block *sb, unsigned long blocknr, 55 | unsigned short btype, struct pmfs_blocknode **start_hint) 56 | { 57 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 58 | struct list_head *head = &(sbi->block_inuse_head); 59 | unsigned long new_block_low; 60 | unsigned long new_block_high; 61 | unsigned long num_blocks = 0; 62 | struct pmfs_blocknode *i; 63 | struct pmfs_blocknode *free_blocknode= NULL; 64 | struct pmfs_blocknode *curr_node; 65 | 66 | num_blocks = pmfs_get_numblocks(btype); 67 | new_block_low = blocknr; 68 | new_block_high = blocknr + num_blocks - 1; 69 | 70 | BUG_ON(list_empty(head)); 71 | 72 | if (start_hint && *start_hint && 73 | new_block_low >= (*start_hint)->block_low) 74 | i = *start_hint; 75 | else 76 | i = list_first_entry(head, typeof(*i), link); 77 | 78 | list_for_each_entry_from(i, head, link) { 79 | 80 | if (new_block_low > i->block_high) { 81 | /* skip to next blocknode */ 82 | continue; 83 | } 84 | 85 | if ((new_block_low == i->block_low) && 86 | (new_block_high == i->block_high)) { 87 | /* fits entire datablock */ 88 | if (start_hint) 89 | *start_hint = pmfs_next_blocknode(i, head); 90 | list_del(&i->link); 91 | free_blocknode = i; 92 | sbi->num_blocknode_allocated--; 93 | sbi->num_free_blocks += num_blocks; 94 | goto block_found; 95 | } 96 | if ((new_block_low == i->block_low) && 97 | (new_block_high < i->block_high)) { 98 | /* Aligns left */ 99 | i->block_low = new_block_high + 1; 100 | sbi->num_free_blocks += num_blocks; 101 | if (start_hint) 102 | *start_hint = i; 103 | goto block_found; 104 | } 105 | if ((new_block_low > i->block_low) && 106 | (new_block_high == i->block_high)) { 107 | /* Aligns right */ 108 | i->block_high = new_block_low - 1; 109 | sbi->num_free_blocks += num_blocks; 110 | if (start_hint) 111 | *start_hint = pmfs_next_blocknode(i, head); 112 | goto block_found; 113 | } 114 | if ((new_block_low > i->block_low) && 115 | (new_block_high < i->block_high)) { 116 | /* Aligns somewhere in 
the middle */ 117 | curr_node = pmfs_alloc_blocknode(sb); 118 | PMFS_ASSERT(curr_node); 119 | if (curr_node == NULL) { 120 | /* returning without freeing the block*/ 121 | goto block_found; 122 | } 123 | curr_node->block_low = new_block_high + 1; 124 | curr_node->block_high = i->block_high; 125 | i->block_high = new_block_low - 1; 126 | list_add(&curr_node->link, &i->link); 127 | sbi->num_free_blocks += num_blocks; 128 | if (start_hint) 129 | *start_hint = curr_node; 130 | goto block_found; 131 | } 132 | } 133 | 134 | pmfs_error_mng(sb, "Unable to free block %ld\n", blocknr); 135 | 136 | block_found: 137 | 138 | if (free_blocknode) 139 | __pmfs_free_blocknode(free_blocknode); 140 | } 141 | 142 | void pmfs_free_block(struct super_block *sb, unsigned long blocknr, 143 | unsigned short btype) 144 | { 145 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 146 | mutex_lock(&sbi->s_lock); 147 | __pmfs_free_block(sb, blocknr, btype, NULL); 148 | mutex_unlock(&sbi->s_lock); 149 | } 150 | 151 | int pmfs_new_block(struct super_block *sb, unsigned long *blocknr, 152 | unsigned short btype, int zero) 153 | { 154 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 155 | struct list_head *head = &(sbi->block_inuse_head); 156 | struct pmfs_blocknode *i, *next_i; 157 | struct pmfs_blocknode *free_blocknode= NULL; 158 | void *bp; 159 | unsigned long num_blocks = 0; 160 | struct pmfs_blocknode *curr_node; 161 | int errval = 0; 162 | bool found = 0; 163 | unsigned long next_block_low; 164 | unsigned long new_block_low; 165 | unsigned long new_block_high; 166 | 167 | num_blocks = pmfs_get_numblocks(btype); 168 | 169 | mutex_lock(&sbi->s_lock); 170 | 171 | list_for_each_entry(i, head, link) { 172 | if (i->link.next == head) { 173 | next_i = NULL; 174 | next_block_low = sbi->block_end; 175 | } else { 176 | next_i = list_entry(i->link.next, typeof(*i), link); 177 | next_block_low = next_i->block_low; 178 | } 179 | 180 | new_block_low = (i->block_high + num_blocks) & ~(num_blocks - 1); 181 | new_block_high 
= new_block_low + num_blocks - 1; 182 | 183 | if (new_block_high >= next_block_low) { 184 | /* Does not fit - skip to next blocknode */ 185 | continue; 186 | } 187 | 188 | if ((new_block_low == (i->block_high + 1)) && 189 | (new_block_high == (next_block_low - 1))) 190 | { 191 | /* Fill the gap completely */ 192 | if (next_i) { 193 | i->block_high = next_i->block_high; 194 | list_del(&next_i->link); 195 | free_blocknode = next_i; 196 | sbi->num_blocknode_allocated--; 197 | } else { 198 | i->block_high = new_block_high; 199 | } 200 | found = 1; 201 | break; 202 | } 203 | 204 | if ((new_block_low == (i->block_high + 1)) && 205 | (new_block_high < (next_block_low - 1))) { 206 | /* Aligns to left */ 207 | i->block_high = new_block_high; 208 | found = 1; 209 | break; 210 | } 211 | 212 | if ((new_block_low > (i->block_high + 1)) && 213 | (new_block_high == (next_block_low - 1))) { 214 | /* Aligns to right */ 215 | if (next_i) { 216 | /* right node exist */ 217 | next_i->block_low = new_block_low; 218 | } else { 219 | /* right node does NOT exist */ 220 | curr_node = pmfs_alloc_blocknode(sb); 221 | PMFS_ASSERT(curr_node); 222 | if (curr_node == NULL) { 223 | errval = -ENOSPC; 224 | break; 225 | } 226 | curr_node->block_low = new_block_low; 227 | curr_node->block_high = new_block_high; 228 | list_add(&curr_node->link, &i->link); 229 | } 230 | found = 1; 231 | break; 232 | } 233 | 234 | if ((new_block_low > (i->block_high + 1)) && 235 | (new_block_high < (next_block_low - 1))) { 236 | /* Aligns somewhere in the middle */ 237 | curr_node = pmfs_alloc_blocknode(sb); 238 | PMFS_ASSERT(curr_node); 239 | if (curr_node == NULL) { 240 | errval = -ENOSPC; 241 | break; 242 | } 243 | curr_node->block_low = new_block_low; 244 | curr_node->block_high = new_block_high; 245 | list_add(&curr_node->link, &i->link); 246 | found = 1; 247 | break; 248 | } 249 | } 250 | 251 | if (found == 1) { 252 | sbi->num_free_blocks -= num_blocks; 253 | } 254 | 255 | mutex_unlock(&sbi->s_lock); 256 | 257 | 
if (free_blocknode) 258 | __pmfs_free_blocknode(free_blocknode); 259 | 260 | if (found == 0) { 261 | return -ENOSPC; 262 | } 263 | 264 | if (zero) { 265 | size_t size; 266 | bp = pmfs_get_block(sb, pmfs_get_block_off(sb, new_block_low, btype)); 267 | pmfs_memunlock_block(sb, bp); //TBDTBD: Need to fix this 268 | if (btype == PMFS_BLOCK_TYPE_4K) 269 | size = 0x1 << 12; 270 | else if (btype == PMFS_BLOCK_TYPE_2M) 271 | size = 0x1 << 21; 272 | else 273 | size = 0x1 << 30; 274 | memset_nt(bp, 0, size); 275 | pmfs_memlock_block(sb, bp); 276 | } 277 | *blocknr = new_block_low; 278 | 279 | return errval; 280 | } 281 | 282 | unsigned long pmfs_count_free_blocks(struct super_block *sb) 283 | { 284 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 285 | return sbi->num_free_blocks; 286 | } 287 | -------------------------------------------------------------------------------- /dir.c: -------------------------------------------------------------------------------- 1 | /* 2 | * BRIEF DESCRIPTION 3 | * 4 | * File operations for directories. 5 | * 6 | * Copyright 2012-2013 Intel Corporation 7 | * Copyright 2009-2011 Marco Stornelli 8 | * Copyright 2003 Sony Corporation 9 | * Copyright 2003 Matsushita Electric Industrial Co., Ltd. 10 | * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam 11 | * This file is licensed under the terms of the GNU General Public 12 | * License version 2. This program is licensed "as is" without any 13 | * warranty of any kind, whether express or implied. 14 | */ 15 | 16 | #include 17 | #include 18 | #include "pmfs.h" 19 | 20 | /* 21 | * Parent is locked. 
22 | */ 23 | 24 | #define DT2IF(dt) (((dt) << 12) & S_IFMT) 25 | #define IF2DT(sif) (((sif) & S_IFMT) >> 12) 26 | 27 | static int pmfs_add_dirent_to_buf(pmfs_transaction_t *trans, 28 | struct dentry *dentry, struct inode *inode, 29 | struct pmfs_direntry *de, u8 *blk_base, struct pmfs_inode *pidir) 30 | { 31 | struct inode *dir = dentry->d_parent->d_inode; 32 | const char *name = dentry->d_name.name; 33 | int namelen = dentry->d_name.len; 34 | unsigned short reclen; 35 | int nlen, rlen; 36 | char *top; 37 | 38 | reclen = PMFS_DIR_REC_LEN(namelen); 39 | if (!de) { 40 | de = (struct pmfs_direntry *)blk_base; 41 | top = blk_base + dir->i_sb->s_blocksize - reclen; 42 | while ((char *)de <= top) { 43 | #if 0 44 | if (!pmfs_check_dir_entry("pmfs_add_dirent_to_buf", 45 | dir, de, blk_base, offset)) 46 | return -EIO; 47 | if (pmfs_match(namelen, name, de)) 48 | return -EEXIST; 49 | #endif 50 | rlen = le16_to_cpu(de->de_len); 51 | if (de->ino) { 52 | nlen = PMFS_DIR_REC_LEN(de->name_len); 53 | if ((rlen - nlen) >= reclen) 54 | break; 55 | } else if (rlen >= reclen) 56 | break; 57 | de = (struct pmfs_direntry *)((char *)de + rlen); 58 | } 59 | if ((char *)de > top) 60 | return -ENOSPC; 61 | } 62 | rlen = le16_to_cpu(de->de_len); 63 | 64 | if (de->ino) { 65 | struct pmfs_direntry *de1; 66 | pmfs_add_logentry(dir->i_sb, trans, &de->de_len, 67 | sizeof(de->de_len), LE_DATA); 68 | nlen = PMFS_DIR_REC_LEN(de->name_len); 69 | de1 = (struct pmfs_direntry *)((char *)de + nlen); 70 | pmfs_memunlock_block(dir->i_sb, blk_base); 71 | de1->de_len = cpu_to_le16(rlen - nlen); 72 | de->de_len = cpu_to_le16(nlen); 73 | pmfs_memlock_block(dir->i_sb, blk_base); 74 | de = de1; 75 | } else { 76 | pmfs_add_logentry(dir->i_sb, trans, &de->ino, 77 | sizeof(de->ino), LE_DATA); 78 | } 79 | pmfs_memunlock_block(dir->i_sb, blk_base); 80 | /*de->file_type = 0;*/ 81 | if (inode) { 82 | de->ino = cpu_to_le64(inode->i_ino); 83 | /*de->file_type = IF2DT(inode->i_mode); */ 84 | } else { 85 | de->ino = 0; 86 
| } 87 | de->name_len = namelen; 88 | memcpy(de->name, name, namelen); 89 | pmfs_memlock_block(dir->i_sb, blk_base); 90 | pmfs_flush_buffer(de, reclen, false); 91 | /* 92 | * XXX shouldn't update any times until successful 93 | * completion of syscall, but too many callers depend 94 | * on this. 95 | */ 96 | dir->i_mtime = dir->i_ctime = current_time(dir); 97 | /*dir->i_version++; */ 98 | 99 | pmfs_memunlock_inode(dir->i_sb, pidir); 100 | pidir->i_mtime = cpu_to_le32(dir->i_mtime.tv_sec); 101 | pidir->i_ctime = cpu_to_le32(dir->i_ctime.tv_sec); 102 | pmfs_memlock_inode(dir->i_sb, pidir); 103 | return 0; 104 | } 105 | 106 | /* adds a directory entry pointing to the inode. assumes the inode has 107 | * already been logged for consistency 108 | */ 109 | int pmfs_add_entry(pmfs_transaction_t *trans, struct dentry *dentry, 110 | struct inode *inode) 111 | { 112 | struct inode *dir = dentry->d_parent->d_inode; 113 | struct super_block *sb = dir->i_sb; 114 | int retval = -EINVAL; 115 | unsigned long block, blocks; 116 | struct pmfs_direntry *de; 117 | char *blk_base; 118 | struct pmfs_inode *pidir; 119 | 120 | if (!dentry->d_name.len) 121 | return -EINVAL; 122 | 123 | pidir = pmfs_get_inode(sb, dir->i_ino); 124 | pmfs_add_logentry(sb, trans, pidir, MAX_DATA_PER_LENTRY, LE_DATA); 125 | 126 | blocks = dir->i_size >> sb->s_blocksize_bits; 127 | for (block = 0; block < blocks; block++) { 128 | blk_base = 129 | pmfs_get_block(sb, pmfs_find_data_block(dir, block)); 130 | if (!blk_base) { 131 | retval = -EIO; 132 | goto out; 133 | } 134 | retval = pmfs_add_dirent_to_buf(trans, dentry, inode, 135 | NULL, blk_base, pidir); 136 | if (retval != -ENOSPC) 137 | goto out; 138 | } 139 | retval = pmfs_alloc_blocks(trans, dir, blocks, 1, false); 140 | if (retval) 141 | goto out; 142 | 143 | dir->i_size += dir->i_sb->s_blocksize; 144 | pmfs_update_isize(dir, pidir); 145 | 146 | blk_base = pmfs_get_block(sb, pmfs_find_data_block(dir, blocks)); 147 | if (!blk_base) { 148 | retval = -ENOSPC; 
149 | goto out; 150 | } 151 | /* No need to log the changes to this de because its a new block */ 152 | de = (struct pmfs_direntry *)blk_base; 153 | pmfs_memunlock_block(sb, blk_base); 154 | de->ino = 0; 155 | de->de_len = cpu_to_le16(sb->s_blocksize); 156 | pmfs_memlock_block(sb, blk_base); 157 | /* Since this is a new block, no need to log changes to this block */ 158 | retval = pmfs_add_dirent_to_buf(NULL, dentry, inode, de, blk_base, 159 | pidir); 160 | out: 161 | return retval; 162 | } 163 | 164 | /* removes a directory entry pointing to the inode. assumes the inode has 165 | * already been logged for consistency 166 | */ 167 | int pmfs_remove_entry(pmfs_transaction_t *trans, struct dentry *de, 168 | struct inode *inode) 169 | { 170 | struct super_block *sb = inode->i_sb; 171 | struct inode *dir = de->d_parent->d_inode; 172 | struct pmfs_inode *pidir; 173 | struct qstr *entry = &de->d_name; 174 | struct pmfs_direntry *res_entry, *prev_entry; 175 | int retval = -EINVAL; 176 | unsigned long blocks, block; 177 | char *blk_base = NULL; 178 | 179 | if (!de->d_name.len) 180 | return -EINVAL; 181 | 182 | blocks = dir->i_size >> sb->s_blocksize_bits; 183 | 184 | for (block = 0; block < blocks; block++) { 185 | blk_base = 186 | pmfs_get_block(sb, pmfs_find_data_block(dir, block)); 187 | if (!blk_base) 188 | goto out; 189 | if (pmfs_search_dirblock(blk_base, dir, entry, 190 | block << sb->s_blocksize_bits, 191 | &res_entry, &prev_entry) == 1) 192 | break; 193 | } 194 | 195 | if (block == blocks) 196 | goto out; 197 | if (prev_entry) { 198 | pmfs_add_logentry(sb, trans, &prev_entry->de_len, 199 | sizeof(prev_entry->de_len), LE_DATA); 200 | pmfs_memunlock_block(sb, blk_base); 201 | prev_entry->de_len = 202 | cpu_to_le16(le16_to_cpu(prev_entry->de_len) + 203 | le16_to_cpu(res_entry->de_len)); 204 | pmfs_memlock_block(sb, blk_base); 205 | } else { 206 | pmfs_add_logentry(sb, trans, &res_entry->ino, 207 | sizeof(res_entry->ino), LE_DATA); 208 | pmfs_memunlock_block(sb, 
blk_base); 209 | res_entry->ino = 0; 210 | pmfs_memlock_block(sb, blk_base); 211 | } 212 | /*dir->i_version++; */ 213 | dir->i_ctime = dir->i_mtime = current_time(dir); 214 | 215 | pidir = pmfs_get_inode(sb, dir->i_ino); 216 | pmfs_add_logentry(sb, trans, pidir, MAX_DATA_PER_LENTRY, LE_DATA); 217 | 218 | pmfs_memunlock_inode(sb, pidir); 219 | pidir->i_mtime = cpu_to_le32(dir->i_mtime.tv_sec); 220 | pidir->i_ctime = cpu_to_le32(dir->i_ctime.tv_sec); 221 | pmfs_memlock_inode(sb, pidir); 222 | retval = 0; 223 | out: 224 | return retval; 225 | } 226 | 227 | static int pmfs_readdir(struct file *file, struct dir_context *ctx) 228 | { 229 | struct inode *inode = file_inode(file); 230 | struct super_block *sb = inode->i_sb; 231 | struct pmfs_inode *pi; 232 | char *blk_base; 233 | unsigned long offset; 234 | struct pmfs_direntry *de; 235 | ino_t ino; 236 | timing_t readdir_time; 237 | 238 | PMFS_START_TIMING(readdir_t, readdir_time); 239 | 240 | offset = ctx->pos & (sb->s_blocksize - 1); 241 | while (ctx->pos < inode->i_size) { 242 | unsigned long blk = ctx->pos >> sb->s_blocksize_bits; 243 | 244 | blk_base = 245 | pmfs_get_block(sb, pmfs_find_data_block(inode, blk)); 246 | if (!blk_base) { 247 | pmfs_dbg("directory %lu contains a hole at offset %lld\n", 248 | inode->i_ino, ctx->pos); 249 | ctx->pos += sb->s_blocksize - offset; 250 | continue; 251 | } 252 | #if 0 253 | if (file->f_version != inode->i_version) { 254 | for (i = 0; i < sb->s_blocksize && i < offset; ) { 255 | de = (struct pmfs_direntry *)(blk_base + i); 256 | /* It's too expensive to do a full 257 | * dirent test each time round this 258 | * loop, but we do have to test at 259 | * least that it is non-zero. A 260 | * failure will be detected in the 261 | * dirent test below. 
*/ 262 | if (le16_to_cpu(de->de_len) < 263 | PMFS_DIR_REC_LEN(1)) 264 | break; 265 | i += le16_to_cpu(de->de_len); 266 | } 267 | offset = i; 268 | ctx->pos = 269 | (ctx->pos & ~(sb->s_blocksize - 1)) | offset; 270 | file->f_version = inode->i_version; 271 | } 272 | #endif 273 | while (ctx->pos < inode->i_size 274 | && offset < sb->s_blocksize) { 275 | de = (struct pmfs_direntry *)(blk_base + offset); 276 | if (!pmfs_check_dir_entry("pmfs_readdir", inode, de, 277 | blk_base, offset)) { 278 | /* On error, skip to the next block. */ 279 | ctx->pos = ALIGN(ctx->pos, sb->s_blocksize); 280 | break; 281 | } 282 | offset += le16_to_cpu(de->de_len); 283 | if (de->ino) { 284 | ino = le64_to_cpu(de->ino); 285 | pi = pmfs_get_inode(sb, ino); 286 | if (!dir_emit(ctx, de->name, de->name_len, 287 | ino, IF2DT(le16_to_cpu(pi->i_mode)))) 288 | return 0; 289 | } 290 | ctx->pos += le16_to_cpu(de->de_len); 291 | } 292 | offset = 0; 293 | } 294 | PMFS_END_TIMING(readdir_t, readdir_time); 295 | return 0; 296 | } 297 | 298 | const struct file_operations pmfs_dir_operations = { 299 | .read = generic_read_dir, 300 | .iterate = pmfs_readdir, 301 | .fsync = noop_fsync, 302 | .unlocked_ioctl = pmfs_ioctl, 303 | #ifdef CONFIG_COMPAT 304 | .compat_ioctl = pmfs_compat_ioctl, 305 | #endif 306 | }; 307 | -------------------------------------------------------------------------------- /file.c: -------------------------------------------------------------------------------- 1 | /* 2 | * BRIEF DESCRIPTION 3 | * 4 | * File operations for files. 5 | * 6 | * Copyright 2012-2013 Intel Corporation 7 | * Copyright 2009-2011 Marco Stornelli 8 | * Copyright 2003 Sony Corporation 9 | * Copyright 2003 Matsushita Electric Industrial Co., Ltd. 10 | * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam 11 | * This file is licensed under the terms of the GNU General Public 12 | * License version 2. This program is licensed "as is" without any 13 | * warranty of any kind, whether express or implied. 
14 | */ 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include "pmfs.h" 25 | #include "xip.h" 26 | 27 | static inline int pmfs_can_set_blocksize_hint(struct pmfs_inode *pi, 28 | loff_t new_size) 29 | { 30 | /* Currently, we don't deallocate data blocks till the file is deleted. 31 | * So no changing blocksize hints once allocation is done. */ 32 | if (le64_to_cpu(pi->root)) 33 | return 0; 34 | return 1; 35 | } 36 | 37 | int pmfs_set_blocksize_hint(struct super_block *sb, struct pmfs_inode *pi, 38 | loff_t new_size) 39 | { 40 | unsigned short block_type; 41 | 42 | if (!pmfs_can_set_blocksize_hint(pi, new_size)) 43 | return 0; 44 | 45 | if (new_size >= 0x40000000) { /* 1G */ 46 | block_type = PMFS_BLOCK_TYPE_1G; 47 | goto hint_set; 48 | } 49 | 50 | if (new_size >= 0x200000) { /* 2M */ 51 | block_type = PMFS_BLOCK_TYPE_2M; 52 | goto hint_set; 53 | } 54 | 55 | /* defaulting to 4K */ 56 | block_type = PMFS_BLOCK_TYPE_4K; 57 | 58 | hint_set: 59 | pmfs_dbg_verbose( 60 | "Hint: new_size 0x%llx, i_size 0x%llx, root 0x%llx\n", 61 | new_size, pi->i_size, le64_to_cpu(pi->root)); 62 | pmfs_dbg_verbose("Setting the hint to 0x%x\n", block_type); 63 | pmfs_memunlock_inode(sb, pi); 64 | pi->i_blk_type = block_type; 65 | pmfs_memlock_inode(sb, pi); 66 | return 0; 67 | } 68 | 69 | static long pmfs_fallocate(struct file *file, int mode, loff_t offset, 70 | loff_t len) 71 | { 72 | struct inode *inode = file->f_path.dentry->d_inode; 73 | struct super_block *sb = inode->i_sb; 74 | long ret = 0; 75 | unsigned long blocknr, blockoff; 76 | int num_blocks, blocksize_mask; 77 | struct pmfs_inode *pi; 78 | pmfs_transaction_t *trans; 79 | loff_t new_size; 80 | 81 | /* We only support the FALLOC_FL_KEEP_SIZE mode */ 82 | if (mode & ~FALLOC_FL_KEEP_SIZE) 83 | return -EOPNOTSUPP; 84 | 85 | if (S_ISDIR(inode->i_mode)) 86 | return -ENODEV; 87 | 88 | inode_lock(inode); 89 | 90 | new_size = len + offset; 91 | if (!(mode & 
FALLOC_FL_KEEP_SIZE) && new_size > inode->i_size) { 92 | ret = inode_newsize_ok(inode, new_size); 93 | if (ret) 94 | goto out; 95 | } 96 | 97 | pi = pmfs_get_inode(sb, inode->i_ino); 98 | if (!pi) { 99 | ret = -EACCES; 100 | goto out; 101 | } 102 | trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES + 103 | MAX_METABLOCK_LENTRIES); 104 | if (IS_ERR(trans)) { 105 | ret = PTR_ERR(trans); 106 | goto out; 107 | } 108 | pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA); 109 | 110 | /* Set the block size hint */ 111 | pmfs_set_blocksize_hint(sb, pi, new_size); 112 | 113 | blocksize_mask = sb->s_blocksize - 1; 114 | blocknr = offset >> sb->s_blocksize_bits; 115 | blockoff = offset & blocksize_mask; 116 | num_blocks = (blockoff + len + blocksize_mask) >> sb->s_blocksize_bits; 117 | ret = pmfs_alloc_blocks(trans, inode, blocknr, num_blocks, true); 118 | 119 | inode->i_mtime = inode->i_ctime = current_time(inode); 120 | 121 | pmfs_memunlock_inode(sb, pi); 122 | if (ret || (mode & FALLOC_FL_KEEP_SIZE)) { 123 | pi->i_flags |= cpu_to_le32(PMFS_EOFBLOCKS_FL); 124 | } 125 | 126 | if (!(mode & FALLOC_FL_KEEP_SIZE) && new_size > inode->i_size) { 127 | inode->i_size = new_size; 128 | pi->i_size = cpu_to_le64(inode->i_size); 129 | } 130 | pi->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); 131 | pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); 132 | pmfs_memlock_inode(sb, pi); 133 | 134 | pmfs_commit_transaction(sb, trans); 135 | 136 | out: 137 | inode_unlock(inode); 138 | return ret; 139 | } 140 | 141 | static loff_t pmfs_llseek(struct file *file, loff_t offset, int origin) 142 | { 143 | struct inode *inode = file->f_path.dentry->d_inode; 144 | int retval; 145 | 146 | if (origin != SEEK_DATA && origin != SEEK_HOLE) 147 | return generic_file_llseek(file, offset, origin); 148 | 149 | inode_lock(inode); 150 | switch (origin) { 151 | case SEEK_DATA: 152 | retval = pmfs_find_region(inode, &offset, 0); 153 | if (retval) { 154 | inode_unlock(inode); 155 | return retval; 156 | } 
157 | break; 158 | case SEEK_HOLE: 159 | retval = pmfs_find_region(inode, &offset, 1); 160 | if (retval) { 161 | inode_unlock(inode); 162 | return retval; 163 | } 164 | break; 165 | } 166 | 167 | if ((offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) || 168 | offset > inode->i_sb->s_maxbytes) { 169 | inode_unlock(inode); 170 | return -EINVAL; 171 | } 172 | 173 | if (offset != file->f_pos) { 174 | file->f_pos = offset; 175 | file->f_version = 0; 176 | } 177 | 178 | inode_unlock(inode); 179 | return offset; 180 | } 181 | 182 | /* This function is called by both msync() and fsync(). 183 | * TODO: Check if we can avoid calling pmfs_flush_buffer() for fsync. We use 184 | * movnti to write data to files, so we may want to avoid doing unnecessary 185 | * pmfs_flush_buffer() on fsync() */ 186 | int pmfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) 187 | { 188 | /* Sync from start to end[inclusive] */ 189 | struct address_space *mapping = file->f_mapping; 190 | struct inode *inode = mapping->host; 191 | loff_t isize; 192 | timing_t fsync_time; 193 | 194 | PMFS_START_TIMING(fsync_t, fsync_time); 195 | /* if the file is not mmap'ed, there is no need to do clflushes */ 196 | if (mapping_mapped(mapping) == 0) 197 | goto persist; 198 | 199 | end += 1; /* end is inclusive. We like our indices normal please ! 
*/ 200 | 201 | isize = i_size_read(inode); 202 | 203 | if ((unsigned long)end > (unsigned long)isize) 204 | end = isize; 205 | if (!isize || (start >= end)) 206 | { 207 | pmfs_dbg_verbose("[%s:%d] : (ERR) isize(%llx), start(%llx)," 208 | " end(%llx)\n", __func__, __LINE__, isize, start, end); 209 | PMFS_END_TIMING(fsync_t, fsync_time); 210 | return -ENODATA; 211 | } 212 | 213 | /* Align start and end to cacheline boundaries */ 214 | start = start & CACHELINE_MASK; 215 | end = CACHELINE_ALIGN(end); 216 | do { 217 | sector_t block = 0; 218 | void *xip_mem; 219 | pgoff_t pgoff; 220 | loff_t offset; 221 | unsigned long nr_flush_bytes; 222 | 223 | pgoff = start >> PAGE_SHIFT; 224 | offset = start & ~PAGE_MASK; 225 | 226 | nr_flush_bytes = PAGE_SIZE - offset; 227 | if (nr_flush_bytes > (end - start)) 228 | nr_flush_bytes = end - start; 229 | 230 | block = pmfs_find_data_block(inode, (sector_t)pgoff); 231 | 232 | if (block) { 233 | xip_mem = pmfs_get_block(inode->i_sb, block); 234 | /* flush the range */ 235 | atomic64_inc(&fsync_pages); 236 | pmfs_flush_buffer(xip_mem + offset, nr_flush_bytes, 0); 237 | } else { 238 | /* sparse files could have such holes */ 239 | pmfs_dbg_verbose("[%s:%d] : start(%llx), end(%llx)," 240 | " pgoff(%lx)\n", __func__, __LINE__, start, end, pgoff); 241 | break; 242 | } 243 | 244 | start += nr_flush_bytes; 245 | } while (start < end); 246 | persist: 247 | PERSISTENT_MARK(); 248 | PERSISTENT_BARRIER(); 249 | PMFS_END_TIMING(fsync_t, fsync_time); 250 | return 0; 251 | } 252 | 253 | /* This callback is called when a file is closed */ 254 | static int pmfs_flush(struct file *file, fl_owner_t id) 255 | { 256 | int ret = 0; 257 | /* if the file was opened for writing, make it persistent. 258 | * TODO: Should we be more smart to check if the file was modified? 
*/ 259 | if (file->f_mode & FMODE_WRITE) { 260 | PERSISTENT_MARK(); 261 | PERSISTENT_BARRIER(); 262 | } 263 | 264 | return ret; 265 | } 266 | 267 | #if 0 268 | static unsigned long 269 | pmfs_get_unmapped_area(struct file *file, unsigned long addr, 270 | unsigned long len, unsigned long pgoff, 271 | unsigned long flags) 272 | { 273 | unsigned long align_size; 274 | struct vm_area_struct *vma; 275 | struct mm_struct *mm = current->mm; 276 | struct inode *inode = file->f_mapping->host; 277 | struct pmfs_inode *pi = pmfs_get_inode(inode->i_sb, inode->i_ino); 278 | struct vm_unmapped_area_info info; 279 | 280 | if (len > TASK_SIZE) 281 | return -ENOMEM; 282 | 283 | if (pi->i_blk_type == PMFS_BLOCK_TYPE_1G) 284 | align_size = PUD_SIZE; 285 | else if (pi->i_blk_type == PMFS_BLOCK_TYPE_2M) 286 | align_size = PMD_SIZE; 287 | else 288 | align_size = PAGE_SIZE; 289 | 290 | if (flags & MAP_FIXED) { 291 | /* FIXME: We could use 4K mappings as fallback. */ 292 | if (len & (align_size - 1)) 293 | return -EINVAL; 294 | if (addr & (align_size - 1)) 295 | return -EINVAL; 296 | return addr; 297 | } 298 | 299 | if (addr) { 300 | addr = ALIGN(addr, align_size); 301 | vma = find_vma(mm, addr); 302 | if (TASK_SIZE - len >= addr && 303 | (!vma || addr + len <= vma->vm_start)) 304 | return addr; 305 | } 306 | 307 | /* 308 | * FIXME: Using the following values for low_limit and high_limit 309 | * implicitly disables ASLR. Awaiting a better way to have this fixed. 
310 | */ 311 | info.flags = 0; 312 | info.length = len; 313 | info.low_limit = TASK_UNMAPPED_BASE; 314 | info.high_limit = TASK_SIZE; 315 | info.align_mask = align_size - 1; 316 | info.align_offset = 0; 317 | return vm_unmapped_area(&info); 318 | } 319 | #endif 320 | 321 | const struct file_operations pmfs_xip_file_operations = { 322 | .llseek = pmfs_llseek, 323 | .read = pmfs_xip_file_read, 324 | .write = pmfs_xip_file_write, 325 | // .aio_read = xip_file_aio_read, 326 | // .aio_write = xip_file_aio_write, 327 | // .read_iter = generic_file_read_iter, 328 | // .write_iter = generic_file_write_iter, 329 | .mmap = pmfs_xip_file_mmap, 330 | .open = generic_file_open, 331 | .fsync = pmfs_fsync, 332 | .flush = pmfs_flush, 333 | // .get_unmapped_area = pmfs_get_unmapped_area, 334 | .unlocked_ioctl = pmfs_ioctl, 335 | .fallocate = pmfs_fallocate, 336 | #ifdef CONFIG_COMPAT 337 | .compat_ioctl = pmfs_compat_ioctl, 338 | #endif 339 | }; 340 | 341 | const struct inode_operations pmfs_file_inode_operations = { 342 | .setattr = pmfs_notify_change, 343 | .getattr = pmfs_getattr, 344 | .get_acl = NULL, 345 | }; 346 | -------------------------------------------------------------------------------- /pmfs_def.h: -------------------------------------------------------------------------------- 1 | /* 2 | * FILE NAME include/linux/pmfs_fs.h 3 | * 4 | * BRIEF DESCRIPTION 5 | * 6 | * Definitions for the PMFS filesystem. 7 | * 8 | * Copyright 2012-2013 Intel Corporation 9 | * Copyright 2009-2011 Marco Stornelli 10 | * Copyright 2003 Sony Corporation 11 | * Copyright 2003 Matsushita Electric Industrial Co., Ltd. 12 | * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam 13 | * This file is licensed under the terms of the GNU General Public 14 | * License version 2. This program is licensed "as is" without any 15 | * warranty of any kind, whether express or implied. 
16 | */ 17 | #ifndef _LINUX_PMFS_DEF_H 18 | #define _LINUX_PMFS_DEF_H 19 | 20 | #include 21 | #include 22 | 23 | #define PMFS_SUPER_MAGIC 0xEFFC 24 | 25 | /* 26 | * The PMFS filesystem constants/structures 27 | */ 28 | 29 | /* 30 | * Mount flags 31 | */ 32 | #define PMFS_MOUNT_PROTECT 0x000001 /* wprotect CR0.WP */ 33 | #define PMFS_MOUNT_XATTR_USER 0x000002 /* Extended user attributes */ 34 | #define PMFS_MOUNT_POSIX_ACL 0x000004 /* POSIX Access Control Lists */ 35 | #define PMFS_MOUNT_XIP 0x000008 /* Execute in place */ 36 | #define PMFS_MOUNT_ERRORS_CONT 0x000010 /* Continue on errors */ 37 | #define PMFS_MOUNT_ERRORS_RO 0x000020 /* Remount fs ro on errors */ 38 | #define PMFS_MOUNT_ERRORS_PANIC 0x000040 /* Panic on errors */ 39 | #define PMFS_MOUNT_HUGEMMAP 0x000080 /* Huge mappings with mmap */ 40 | #define PMFS_MOUNT_HUGEIOREMAP 0x000100 /* Huge mappings with ioremap */ 41 | #define PMFS_MOUNT_PROTECT_OLD 0x000200 /* wprotect PAGE RW Bit */ 42 | #define PMFS_MOUNT_FORMAT 0x000400 /* was FS formatted on mount? */ 43 | #define PMFS_MOUNT_MOUNTING 0x000800 /* FS currently being mounted */ 44 | 45 | /* 46 | * Maximal count of links to a file 47 | */ 48 | #define PMFS_LINK_MAX 32000 49 | 50 | #define PMFS_DEF_BLOCK_SIZE_4K 4096 51 | 52 | #define PMFS_INODE_SIZE 128 /* must be power of two */ 53 | #define PMFS_INODE_BITS 7 54 | 55 | #define PMFS_NAME_LEN 255 56 | /* 57 | * Structure of a directory entry in PMFS. 
58 | */ 59 | struct pmfs_direntry { 60 | __le64 ino; /* inode no pointed to by this entry */ 61 | __le16 de_len; /* length of this directory entry */ 62 | u8 name_len; /* length of the directory entry name */ 63 | u8 file_type; /* file type */ 64 | char name[PMFS_NAME_LEN]; /* File name */ 65 | }; 66 | 67 | #define PMFS_DIR_PAD 4 68 | #define PMFS_DIR_ROUND (PMFS_DIR_PAD - 1) 69 | #define PMFS_DIR_REC_LEN(name_len) (((name_len) + 12 + PMFS_DIR_ROUND) & \ 70 | ~PMFS_DIR_ROUND) 71 | 72 | /* PMFS supported data blocks */ 73 | #define PMFS_BLOCK_TYPE_4K 0 74 | #define PMFS_BLOCK_TYPE_2M 1 75 | #define PMFS_BLOCK_TYPE_1G 2 76 | #define PMFS_BLOCK_TYPE_MAX 3 77 | 78 | #define META_BLK_SHIFT 9 79 | 80 | /* 81 | * Play with this knob to change the default block type. 82 | * By changing the PMFS_DEFAULT_BLOCK_TYPE to 2M or 1G, 83 | * we should get pretty good coverage in testing. 84 | */ 85 | #define PMFS_DEFAULT_BLOCK_TYPE PMFS_BLOCK_TYPE_4K 86 | 87 | /* 88 | * Structure of an inode in PMFS. Things to keep in mind when modifying it. 89 | * 1) Keep the inode size to within 96 bytes if possible. This is because 90 | * a 64 byte log-entry can store 48 bytes of data and we would like 91 | * to log an inode using only 2 log-entries 92 | * 2) root must be immediately after the qw containing height because we update 93 | * root and height atomically using cmpxchg16b in pmfs_decrease_btree_height 94 | * 3) i_size, i_ctime, and i_mtime must be in that order and i_size must be at 95 | * 16 byte aligned offset from the start of the inode. We use cmpxchg16b to 96 | * update these three fields atomically. 97 | */ 98 | struct pmfs_inode { 99 | /* first 48 bytes */ 100 | __le16 i_rsvd; /* reserved. used to be checksum */ 101 | u8 height; /* height of data b-tree; max 3 for now */ 102 | u8 i_blk_type; /* data block size this inode uses */ 103 | __le32 i_flags; /* Inode flags */ 104 | __le64 root; /* btree root. 
must be below qw w/ height */ 105 | __le64 i_size; /* Size of data in bytes */ 106 | __le32 i_ctime; /* Inode modification time */ 107 | __le32 i_mtime; /* Inode b-tree Modification time */ 108 | __le32 i_dtime; /* Deletion Time */ 109 | __le16 i_mode; /* File mode */ 110 | __le16 i_links_count; /* Links count */ 111 | __le64 i_blocks; /* Blocks count */ 112 | 113 | /* second 48 bytes */ 114 | __le64 i_xattr; /* Extended attribute block */ 115 | __le32 i_uid; /* Owner Uid */ 116 | __le32 i_gid; /* Group Id */ 117 | __le32 i_generation; /* File version (for NFS) */ 118 | __le32 i_atime; /* Access time */ 119 | 120 | struct { 121 | __le32 rdev; /* major/minor # */ 122 | } dev; /* device inode */ 123 | __le32 padding; /* pad to ensure truncate_item starts 8-byte aligned */ 124 | }; 125 | 126 | /* This is a per-inode structure and follows immediately after the 127 | * struct pmfs_inode. It is used to implement the truncate linked list and is 128 | * by pmfs_truncate_add(), pmfs_truncate_del(), and pmfs_recover_truncate_list() 129 | * functions to manage the truncate list */ 130 | struct pmfs_inode_truncate_item { 131 | __le64 i_truncatesize; /* Size of truncated inode */ 132 | __le64 i_next_truncate; /* inode num of the next truncated inode */ 133 | }; 134 | 135 | /* 136 | * #define PMFS_NAME_LEN (PMFS_INODE_SIZE - offsetof(struct pmfs_inode, 137 | * i_d.d_name) - 1) 138 | */ 139 | 140 | /* #define PMFS_SB_SIZE 128 */ /* must be power of two */ 141 | #define PMFS_SB_SIZE 512 /* must be power of two */ 142 | 143 | typedef struct pmfs_journal { 144 | __le64 base; 145 | __le32 size; 146 | __le32 head; 147 | /* the next three fields must be in the same order and together. 
148 | * tail and gen_id must fall in the same 8-byte quadword */ 149 | __le32 tail; 150 | __le16 gen_id; /* generation id of the log */ 151 | __le16 pad; 152 | __le16 redo_logging; 153 | } pmfs_journal_t; 154 | 155 | 156 | /* 157 | * Structure of the super block in PMFS 158 | * The fields are partitioned into static and dynamic fields. The static fields 159 | * never change after file system creation. This was primarily done because 160 | * pmfs_get_block() returns NULL if the block offset is 0 (helps in catching 161 | * bugs). So if we modify any field using journaling (for consistency), we 162 | * will have to modify s_sum which is at offset 0. So journaling code fails. 163 | * This (static+dynamic fields) is a temporary solution and can be avoided 164 | * once the file system becomes stable and pmfs_get_block() returns correct 165 | * pointers even for offset 0. 166 | */ 167 | struct pmfs_super_block { 168 | /* static fields. they never change after file system creation. 169 | * checksum only validates up to s_start_dynamic field below */ 170 | __le16 s_sum; /* checksum of this sb */ 171 | __le16 s_magic; /* magic signature */ 172 | __le32 s_blocksize; /* blocksize in bytes */ 173 | __le64 s_size; /* total size of fs in bytes */ 174 | char s_volume_name[16]; /* volume name */ 175 | /* points to the location of pmfs_journal_t */ 176 | __le64 s_journal_offset; 177 | /* points to the location of struct pmfs_inode for the inode table */ 178 | __le64 s_inode_table_offset; 179 | 180 | __le64 s_start_dynamic; 181 | 182 | /* all the dynamic fields should go here */ 183 | /* s_mtime and s_wtime should be together and their order should not be 184 | * changed. we use an 8 byte write to update both of them atomically */ 185 | __le32 s_mtime; /* mount time */ 186 | __le32 s_wtime; /* write time */ 187 | /* fields for fast mount support. 
Always keep them together */ 188 | __le64 s_num_blocknode_allocated; 189 | __le64 s_num_free_blocks; 190 | __le32 s_inodes_count; 191 | __le32 s_free_inodes_count; 192 | __le32 s_inodes_used_count; 193 | __le32 s_free_inode_hint; 194 | }; 195 | 196 | #define PMFS_SB_STATIC_SIZE(ps) ((u64)&ps->s_start_dynamic - (u64)ps) 197 | 198 | /* the above fast mount fields take total 32 bytes in the super block */ 199 | #define PMFS_FAST_MOUNT_FIELD_SIZE (36) 200 | 201 | /* The root inode follows immediately after the redundant super block */ 202 | #define PMFS_ROOT_INO (PMFS_INODE_SIZE) 203 | #define PMFS_BLOCKNODE_IN0 (PMFS_ROOT_INO + PMFS_INODE_SIZE) 204 | 205 | /* INODE HINT START at 3 */ 206 | #define PMFS_FREE_INODE_HINT_START (3) 207 | 208 | /* ======================= Write ordering ========================= */ 209 | 210 | #define CACHELINE_SIZE (64) 211 | #define CACHELINE_MASK (~(CACHELINE_SIZE - 1)) 212 | #define CACHELINE_ALIGN(addr) (((addr)+CACHELINE_SIZE-1) & CACHELINE_MASK) 213 | 214 | #define X86_FEATURE_PCOMMIT ( 9*32+22) /* PCOMMIT instruction */ 215 | #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ 216 | #define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ 217 | 218 | static inline bool arch_has_pcommit(void) 219 | { 220 | return static_cpu_has(X86_FEATURE_PCOMMIT); 221 | } 222 | 223 | static inline bool arch_has_clwb(void) 224 | { 225 | return static_cpu_has(X86_FEATURE_CLWB); 226 | } 227 | 228 | extern int support_clwb; 229 | extern int support_pcommit; 230 | 231 | #define _mm_clflush(addr)\ 232 | asm volatile("clflush %0" : "+m" (*(volatile char *)(addr))) 233 | #define _mm_clflushopt(addr)\ 234 | asm volatile(".byte 0x66; clflush %0" : "+m" (*(volatile char *)(addr))) 235 | #define _mm_clwb(addr)\ 236 | asm volatile(".byte 0x66; xsaveopt %0" : "+m" (*(volatile char *)(addr))) 237 | #define _mm_pcommit()\ 238 | asm volatile(".byte 0x66, 0x0f, 0xae, 0xf8") 239 | 240 | /* Provides ordering from all previous clflush too */ 241 | 
static inline void PERSISTENT_MARK(void) 242 | { 243 | /* TODO: Fix me. */ 244 | } 245 | 246 | static inline void PERSISTENT_BARRIER(void) 247 | { 248 | asm volatile ("sfence\n" : : ); 249 | if (support_pcommit) { 250 | /* Do nothing */ 251 | } 252 | } 253 | 254 | static inline void pmfs_flush_buffer(void *buf, uint32_t len, bool fence) 255 | { 256 | uint32_t i; 257 | len = len + ((unsigned long)(buf) & (CACHELINE_SIZE - 1)); 258 | if (support_clwb) { 259 | for (i = 0; i < len; i += CACHELINE_SIZE) 260 | _mm_clwb(buf + i); 261 | } else { 262 | for (i = 0; i < len; i += CACHELINE_SIZE) 263 | _mm_clflush(buf + i); 264 | } 265 | /* Do a fence only if asked. We often don't need to do a fence 266 | * immediately after clflush because even if we get context switched 267 | * between clflush and subsequent fence, the context switch operation 268 | * provides implicit fence. */ 269 | if (fence) 270 | PERSISTENT_BARRIER(); 271 | } 272 | 273 | #endif /* _LINUX_PMFS_DEF_H */ 274 | -------------------------------------------------------------------------------- /bbuild.c: -------------------------------------------------------------------------------- 1 | /* 2 | * PMFS emulated persistence. This file contains code to 3 | * handle data blocks of various sizes efficiently. 4 | * 5 | * Persistent Memory File System 6 | * Copyright (c) 2012-2013, Intel Corporation. 7 | * 8 | * This program is free software; you can redistribute it and/or modify it 9 | * under the terms and conditions of the GNU General Public License, 10 | * version 2, as published by the Free Software Foundation. 11 | * 12 | * This program is distributed in the hope it will be useful, but WITHOUT 13 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 15 | * more details. 
16 | * 17 | * You should have received a copy of the GNU General Public License along with 18 | * this program; if not, write to the Free Software Foundation, Inc., 19 | * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 20 | */ 21 | 22 | #include 23 | #include 24 | #include 25 | #include "pmfs.h" 26 | 27 | struct scan_bitmap { 28 | unsigned long bitmap_4k_size; 29 | unsigned long bitmap_2M_size; 30 | unsigned long bitmap_1G_size; 31 | unsigned long *bitmap_4k; 32 | unsigned long *bitmap_2M; 33 | unsigned long *bitmap_1G; 34 | }; 35 | 36 | static void pmfs_clear_datablock_inode(struct super_block *sb) 37 | { 38 | struct pmfs_inode *pi = pmfs_get_inode(sb, PMFS_BLOCKNODE_IN0); 39 | pmfs_transaction_t *trans; 40 | 41 | /* 2 log entry for inode */ 42 | trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES); 43 | if (IS_ERR(trans)) 44 | return; 45 | pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA); 46 | 47 | pmfs_memunlock_inode(sb, pi); 48 | memset(pi, 0, MAX_DATA_PER_LENTRY); 49 | pmfs_memlock_inode(sb, pi); 50 | 51 | /* commit the transaction */ 52 | pmfs_commit_transaction(sb, trans); 53 | } 54 | 55 | static void pmfs_init_blockmap_from_inode(struct super_block *sb) 56 | { 57 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 58 | struct pmfs_inode *pi = pmfs_get_inode(sb, PMFS_BLOCKNODE_IN0); 59 | struct pmfs_blocknode_lowhigh *p = NULL; 60 | struct pmfs_blocknode *blknode; 61 | unsigned long index; 62 | unsigned long blocknr; 63 | unsigned long i; 64 | unsigned long num_blocknode; 65 | u64 bp; 66 | 67 | num_blocknode = sbi->num_blocknode_allocated; 68 | sbi->num_blocknode_allocated = 0; 69 | for (i=0; i> 8; /* 256 Entries in a block */ 74 | bp = __pmfs_find_data_block(sb, pi, blocknr); 75 | p = pmfs_get_block(sb, bp); 76 | } 77 | PMFS_ASSERT(p); 78 | blknode = pmfs_alloc_blocknode(sb); 79 | if (blknode == NULL) 80 | PMFS_ASSERT(0); 81 | blknode->block_low = le64_to_cpu(p[index].block_low); 82 | blknode->block_high = le64_to_cpu(p[index].block_high); 83 
| list_add_tail(&blknode->link, &sbi->block_inuse_head); 84 | } 85 | } 86 | 87 | static bool pmfs_can_skip_full_scan(struct super_block *sb) 88 | { 89 | struct pmfs_inode *pi = pmfs_get_inode(sb, PMFS_BLOCKNODE_IN0); 90 | struct pmfs_super_block *super = pmfs_get_super(sb); 91 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 92 | __le64 root; 93 | unsigned int height, btype; 94 | unsigned long last_blocknr; 95 | 96 | if (!pi->root) 97 | return false; 98 | 99 | sbi->num_blocknode_allocated = 100 | le64_to_cpu(super->s_num_blocknode_allocated); 101 | sbi->num_free_blocks = le64_to_cpu(super->s_num_free_blocks); 102 | sbi->s_inodes_count = le32_to_cpu(super->s_inodes_count); 103 | sbi->s_free_inodes_count = le32_to_cpu(super->s_free_inodes_count); 104 | sbi->s_inodes_used_count = le32_to_cpu(super->s_inodes_used_count); 105 | sbi->s_free_inode_hint = le32_to_cpu(super->s_free_inode_hint); 106 | 107 | pmfs_init_blockmap_from_inode(sb); 108 | 109 | root = pi->root; 110 | height = pi->height; 111 | btype = pi->i_blk_type; 112 | /* pi->i_size can not be zero */ 113 | last_blocknr = (le64_to_cpu(pi->i_size) - 1) >> 114 | pmfs_inode_blk_shift(pi); 115 | 116 | /* Clearing the datablock inode */ 117 | pmfs_clear_datablock_inode(sb); 118 | 119 | pmfs_free_inode_subtree(sb, root, height, btype, last_blocknr); 120 | 121 | return true; 122 | } 123 | 124 | 125 | static int pmfs_allocate_datablock_block_inode(pmfs_transaction_t *trans, 126 | struct super_block *sb, struct pmfs_inode *pi, unsigned long num_blocks) 127 | { 128 | int errval; 129 | 130 | pmfs_memunlock_inode(sb, pi); 131 | pi->i_mode = 0; 132 | pi->i_links_count = cpu_to_le16(1); 133 | pi->i_blk_type = PMFS_BLOCK_TYPE_4K; 134 | pi->i_flags = 0; 135 | pi->height = 0; 136 | pi->i_dtime = 0; 137 | pi->i_size = cpu_to_le64(num_blocks << sb->s_blocksize_bits); 138 | pmfs_memlock_inode(sb, pi); 139 | 140 | errval = __pmfs_alloc_blocks(trans, sb, pi, 0, num_blocks, false); 141 | 142 | return errval; 143 | } 144 | 145 | void 
pmfs_save_blocknode_mappings(struct super_block *sb) 146 | { 147 | unsigned long num_blocks, blocknr; 148 | struct pmfs_inode *pi = pmfs_get_inode(sb, PMFS_BLOCKNODE_IN0); 149 | struct pmfs_blocknode_lowhigh *p; 150 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 151 | struct list_head *head = &(sbi->block_inuse_head); 152 | struct pmfs_blocknode *i; 153 | struct pmfs_super_block *super; 154 | pmfs_transaction_t *trans; 155 | u64 bp; 156 | int j, k; 157 | int errval; 158 | 159 | num_blocks = ((sbi->num_blocknode_allocated * sizeof(struct 160 | pmfs_blocknode_lowhigh) - 1) >> sb->s_blocksize_bits) + 1; 161 | 162 | /* 2 log entry for inode, 2 lentry for super-block */ 163 | trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES + MAX_SB_LENTRIES); 164 | if (IS_ERR(trans)) 165 | return; 166 | 167 | pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA); 168 | 169 | errval = pmfs_allocate_datablock_block_inode(trans, sb, pi, num_blocks); 170 | 171 | if (errval != 0) { 172 | pmfs_dbg("Error saving the blocknode mappings: %d\n", errval); 173 | pmfs_abort_transaction(sb, trans); 174 | return; 175 | } 176 | 177 | j = 0; 178 | k = 0; 179 | p = NULL; 180 | list_for_each_entry(i, head, link) { 181 | blocknr = k >> 8; 182 | if (j == 0) { 183 | /* Find, get and unlock new data block */ 184 | bp = __pmfs_find_data_block(sb, pi, blocknr); 185 | p = pmfs_get_block(sb, bp); 186 | pmfs_memunlock_block(sb, p); 187 | } 188 | p[j].block_low = cpu_to_le64(i->block_low); 189 | p[j].block_high = cpu_to_le64(i->block_high); 190 | j++; 191 | 192 | if (j == 256) { 193 | j = 0; 194 | /* Lock the data block */ 195 | pmfs_memlock_block(sb, p); 196 | pmfs_flush_buffer(p, 4096, false); 197 | } 198 | 199 | k++; 200 | } 201 | 202 | /* Lock the block */ 203 | if (j) { 204 | pmfs_flush_buffer(p, j << 4, false); 205 | pmfs_memlock_block(sb, p); 206 | } 207 | 208 | /* 209 | * save the total allocated blocknode mappings 210 | * in super block 211 | */ 212 | super = pmfs_get_super(sb); 213 | 
pmfs_add_logentry(sb, trans, &super->s_wtime, 214 | PMFS_FAST_MOUNT_FIELD_SIZE, LE_DATA); 215 | 216 | pmfs_memunlock_range(sb, &super->s_wtime, PMFS_FAST_MOUNT_FIELD_SIZE); 217 | 218 | super->s_wtime = cpu_to_le32(get_seconds()); 219 | super->s_num_blocknode_allocated = 220 | cpu_to_le64(sbi->num_blocknode_allocated); 221 | super->s_num_free_blocks = cpu_to_le64(sbi->num_free_blocks); 222 | super->s_inodes_count = cpu_to_le32(sbi->s_inodes_count); 223 | super->s_free_inodes_count = cpu_to_le32(sbi->s_free_inodes_count); 224 | super->s_inodes_used_count = cpu_to_le32(sbi->s_inodes_used_count); 225 | super->s_free_inode_hint = cpu_to_le32(sbi->s_free_inode_hint); 226 | 227 | pmfs_memlock_range(sb, &super->s_wtime, PMFS_FAST_MOUNT_FIELD_SIZE); 228 | /* commit the transaction */ 229 | pmfs_commit_transaction(sb, trans); 230 | } 231 | 232 | static void pmfs_inode_crawl_recursive(struct super_block *sb, 233 | struct scan_bitmap *bm, unsigned long block, 234 | u32 height, u8 btype) 235 | { 236 | __le64 *node; 237 | unsigned int i; 238 | 239 | if (height == 0) { 240 | /* This is the data block */ 241 | if (btype == PMFS_BLOCK_TYPE_4K) { 242 | set_bit(block >> PAGE_SHIFT, bm->bitmap_4k); 243 | } else if (btype == PMFS_BLOCK_TYPE_2M) { 244 | set_bit(block >> PAGE_SHIFT_2M, bm->bitmap_2M); 245 | } else { 246 | set_bit(block >> PAGE_SHIFT_1G, bm->bitmap_1G); 247 | } 248 | return; 249 | } 250 | 251 | node = pmfs_get_block(sb, block); 252 | set_bit(block >> PAGE_SHIFT, bm->bitmap_4k); 253 | for (i = 0; i < (1 << META_BLK_SHIFT); i++) { 254 | if (node[i] == 0) 255 | continue; 256 | pmfs_inode_crawl_recursive(sb, bm, 257 | le64_to_cpu(node[i]), height - 1, btype); 258 | } 259 | } 260 | 261 | static inline void pmfs_inode_crawl(struct super_block *sb, 262 | struct scan_bitmap *bm, struct pmfs_inode *pi) 263 | { 264 | if (pi->root == 0) 265 | return; 266 | pmfs_inode_crawl_recursive(sb, bm, le64_to_cpu(pi->root), pi->height, 267 | pi->i_blk_type); 268 | } 269 | 270 | static void 
pmfs_inode_table_crawl_recursive(struct super_block *sb, 271 | struct scan_bitmap *bm, unsigned long block, 272 | u32 height, u32 btype) 273 | { 274 | __le64 *node; 275 | unsigned int i; 276 | struct pmfs_inode *pi; 277 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 278 | 279 | node = pmfs_get_block(sb, block); 280 | 281 | if (height == 0) { 282 | unsigned int inodes_per_block = INODES_PER_BLOCK(btype); 283 | if (likely(btype == PMFS_BLOCK_TYPE_2M)) 284 | set_bit(block >> PAGE_SHIFT_2M, bm->bitmap_2M); 285 | else 286 | set_bit(block >> PAGE_SHIFT, bm->bitmap_4k); 287 | 288 | sbi->s_inodes_count += inodes_per_block; 289 | for (i = 0; i < inodes_per_block; i++) { 290 | pi = (struct pmfs_inode *)((void *)node + 291 | PMFS_INODE_SIZE * i); 292 | if (le16_to_cpu(pi->i_links_count) == 0 && 293 | (le16_to_cpu(pi->i_mode) == 0 || 294 | le32_to_cpu(pi->i_dtime))) { 295 | /* Empty inode */ 296 | continue; 297 | } 298 | sbi->s_inodes_used_count++; 299 | pmfs_inode_crawl(sb, bm, pi); 300 | } 301 | return; 302 | } 303 | 304 | set_bit(block >> PAGE_SHIFT, bm->bitmap_4k); 305 | for (i = 0; i < (1 << META_BLK_SHIFT); i++) { 306 | if (node[i] == 0) 307 | continue; 308 | pmfs_inode_table_crawl_recursive(sb, bm, 309 | le64_to_cpu(node[i]), height - 1, btype); 310 | } 311 | } 312 | 313 | static int pmfs_alloc_insert_blocknode_map(struct super_block *sb, 314 | unsigned long low, unsigned long high) 315 | { 316 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 317 | struct list_head *head = &(sbi->block_inuse_head); 318 | struct pmfs_blocknode *i, *next_i; 319 | struct pmfs_blocknode *free_blocknode= NULL; 320 | unsigned long num_blocks = 0; 321 | struct pmfs_blocknode *curr_node; 322 | int errval = 0; 323 | bool found = 0; 324 | unsigned long next_block_low; 325 | unsigned long new_block_low; 326 | unsigned long new_block_high; 327 | 328 | //num_blocks = pmfs_get_numblocks(btype); 329 | 330 | new_block_low = low; 331 | new_block_high = high; 332 | num_blocks = high - low + 1; 333 | 334 | 
list_for_each_entry(i, head, link) { 335 | if (i->link.next == head) { 336 | next_i = NULL; 337 | next_block_low = sbi->block_end; 338 | } else { 339 | next_i = list_entry(i->link.next, typeof(*i), link); 340 | next_block_low = next_i->block_low; 341 | } 342 | 343 | 344 | if (new_block_high >= next_block_low) { 345 | /* Does not fit - skip to next blocknode */ 346 | continue; 347 | } 348 | 349 | if ((new_block_low == (i->block_high + 1)) && 350 | (new_block_high == (next_block_low - 1))) 351 | { 352 | /* Fill the gap completely */ 353 | if (next_i) { 354 | i->block_high = next_i->block_high; 355 | list_del(&next_i->link); 356 | free_blocknode = next_i; 357 | } else { 358 | i->block_high = new_block_high; 359 | } 360 | found = 1; 361 | break; 362 | } 363 | 364 | if ((new_block_low == (i->block_high + 1)) && 365 | (new_block_high < (next_block_low - 1))) { 366 | /* Aligns to left */ 367 | i->block_high = new_block_high; 368 | found = 1; 369 | break; 370 | } 371 | 372 | if ((new_block_low > (i->block_high + 1)) && 373 | (new_block_high == (next_block_low - 1))) { 374 | /* Aligns to right */ 375 | if (next_i) { 376 | /* right node exist */ 377 | next_i->block_low = new_block_low; 378 | } else { 379 | /* right node does NOT exist */ 380 | curr_node = pmfs_alloc_blocknode(sb); 381 | PMFS_ASSERT(curr_node); 382 | if (curr_node == NULL) { 383 | errval = -ENOSPC; 384 | break; 385 | } 386 | curr_node->block_low = new_block_low; 387 | curr_node->block_high = new_block_high; 388 | list_add(&curr_node->link, &i->link); 389 | } 390 | found = 1; 391 | break; 392 | } 393 | 394 | if ((new_block_low > (i->block_high + 1)) && 395 | (new_block_high < (next_block_low - 1))) { 396 | /* Aligns somewhere in the middle */ 397 | curr_node = pmfs_alloc_blocknode(sb); 398 | PMFS_ASSERT(curr_node); 399 | if (curr_node == NULL) { 400 | errval = -ENOSPC; 401 | break; 402 | } 403 | curr_node->block_low = new_block_low; 404 | curr_node->block_high = new_block_high; 405 | list_add(&curr_node->link, 
&i->link); 406 | found = 1; 407 | break; 408 | } 409 | } 410 | 411 | if (found == 1) { 412 | sbi->num_free_blocks -= num_blocks; 413 | } 414 | 415 | if (free_blocknode) 416 | pmfs_free_blocknode(sb, free_blocknode); 417 | 418 | if (found == 0) { 419 | return -ENOSPC; 420 | } 421 | 422 | 423 | return errval; 424 | } 425 | 426 | static int __pmfs_build_blocknode_map(struct super_block *sb, 427 | unsigned long *bitmap, unsigned long bsize, unsigned long scale) 428 | { 429 | unsigned long next = 1; 430 | unsigned long low = 0; 431 | 432 | while (1) { 433 | next = find_next_bit(bitmap, bsize, next); 434 | if (next == bsize) 435 | break; 436 | low = next; 437 | next = find_next_zero_bit(bitmap, bsize, next); 438 | if (pmfs_alloc_insert_blocknode_map(sb, low << scale , 439 | (next << scale) - 1)) { 440 | printk("PMFS: Error could not insert 0x%lx-0x%lx\n", 441 | low << scale, ((next << scale) - 1)); 442 | } 443 | if (next == bsize) 444 | break; 445 | } 446 | return 0; 447 | } 448 | 449 | static void pmfs_build_blocknode_map(struct super_block *sb, 450 | struct scan_bitmap *bm) 451 | { 452 | __pmfs_build_blocknode_map(sb, bm->bitmap_4k, bm->bitmap_4k_size * 8, 453 | PAGE_SHIFT - 12); 454 | __pmfs_build_blocknode_map(sb, bm->bitmap_2M, bm->bitmap_2M_size * 8, 455 | PAGE_SHIFT_2M - 12); 456 | __pmfs_build_blocknode_map(sb, bm->bitmap_1G, bm->bitmap_1G_size * 8, 457 | PAGE_SHIFT_1G - 12); 458 | } 459 | 460 | int pmfs_setup_blocknode_map(struct super_block *sb) 461 | { 462 | struct pmfs_super_block *super = pmfs_get_super(sb); 463 | struct pmfs_inode *pi = pmfs_get_inode_table(sb); 464 | pmfs_journal_t *journal = pmfs_get_journal(sb); 465 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 466 | struct scan_bitmap bm; 467 | unsigned long initsize = le64_to_cpu(super->s_size); 468 | bool value = false; 469 | timing_t start, end; 470 | 471 | /* Always check recovery time */ 472 | if (measure_timing == 0) 473 | getrawmonotonic(&start); 474 | 475 | PMFS_START_TIMING(recovery_t, start); 476 
| 477 | mutex_init(&sbi->inode_table_mutex); 478 | sbi->block_start = (unsigned long)0; 479 | sbi->block_end = ((unsigned long)(initsize) >> PAGE_SHIFT); 480 | 481 | value = pmfs_can_skip_full_scan(sb); 482 | if (value) { 483 | pmfs_dbg_verbose("PMFS: Skipping full scan of inodes...\n"); 484 | goto end; 485 | } 486 | 487 | pmfs_dbg("PMFS: Performing failure recovery\n"); 488 | bm.bitmap_4k_size = (initsize >> (PAGE_SHIFT + 0x3)) + 1; 489 | bm.bitmap_2M_size = (initsize >> (PAGE_SHIFT_2M + 0x3)) + 1; 490 | bm.bitmap_1G_size = (initsize >> (PAGE_SHIFT_1G + 0x3)) + 1; 491 | 492 | /* Alloc memory to hold the block alloc bitmap */ 493 | bm.bitmap_4k = kzalloc(bm.bitmap_4k_size, GFP_KERNEL); 494 | bm.bitmap_2M = kzalloc(bm.bitmap_2M_size, GFP_KERNEL); 495 | bm.bitmap_1G = kzalloc(bm.bitmap_1G_size, GFP_KERNEL); 496 | 497 | if (!bm.bitmap_4k || !bm.bitmap_2M || !bm.bitmap_1G) 498 | goto skip; 499 | 500 | /* Clearing the datablock inode */ 501 | pmfs_clear_datablock_inode(sb); 502 | 503 | pmfs_inode_table_crawl_recursive(sb, &bm, le64_to_cpu(pi->root), 504 | pi->height, pi->i_blk_type); 505 | 506 | /* Reserving tow inodes - Inode 0 and Inode for datablock */ 507 | sbi->s_free_inodes_count = sbi->s_inodes_count - 508 | (sbi->s_inodes_used_count + 2); 509 | 510 | /* set the block 0 as this is used */ 511 | sbi->s_free_inode_hint = PMFS_FREE_INODE_HINT_START; 512 | 513 | /* initialize the num_free_blocks to */ 514 | sbi->num_free_blocks = ((unsigned long)(initsize) >> PAGE_SHIFT); 515 | pmfs_init_blockmap(sb, le64_to_cpu(journal->base) + sbi->jsize); 516 | 517 | pmfs_build_blocknode_map(sb, &bm); 518 | 519 | skip: 520 | 521 | kfree(bm.bitmap_4k); 522 | kfree(bm.bitmap_2M); 523 | kfree(bm.bitmap_1G); 524 | 525 | end: 526 | PMFS_END_TIMING(recovery_t, start); 527 | if (measure_timing == 0) { 528 | getrawmonotonic(&end); 529 | Timingstats[recovery_t] += 530 | (end.tv_sec - start.tv_sec) * 1000000000 + 531 | (end.tv_nsec - start.tv_nsec); 532 | } 533 | 534 | return 0; 535 | } 536 
| -------------------------------------------------------------------------------- /xip.c: -------------------------------------------------------------------------------- 1 | /* 2 | * BRIEF DESCRIPTION 3 | * 4 | * XIP operations. 5 | * 6 | * Copyright 2012-2013 Intel Corporation 7 | * Copyright 2009-2011 Marco Stornelli 8 | * This file is licensed under the terms of the GNU General Public 9 | * License version 2. This program is licensed "as is" without any 10 | * warranty of any kind, whether express or implied. 11 | */ 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include "pmfs.h" 20 | #include "xip.h" 21 | 22 | static ssize_t 23 | do_xip_mapping_read(struct address_space *mapping, 24 | struct file_ra_state *_ra, 25 | struct file *filp, 26 | char __user *buf, 27 | size_t len, 28 | loff_t *ppos) 29 | { 30 | struct inode *inode = mapping->host; 31 | pgoff_t index, end_index; 32 | unsigned long offset; 33 | loff_t isize, pos; 34 | size_t copied = 0, error = 0; 35 | timing_t memcpy_time; 36 | 37 | pos = *ppos; 38 | index = pos >> PAGE_SHIFT; 39 | offset = pos & ~PAGE_MASK; 40 | 41 | isize = i_size_read(inode); 42 | if (!isize) 43 | goto out; 44 | 45 | end_index = (isize - 1) >> PAGE_SHIFT; 46 | do { 47 | unsigned long nr, left; 48 | void *xip_mem; 49 | unsigned long xip_pfn; 50 | int zero = 0; 51 | 52 | /* nr is the maximum number of bytes to copy from this page */ 53 | nr = PAGE_SIZE; 54 | if (index >= end_index) { 55 | if (index > end_index) 56 | goto out; 57 | nr = ((isize - 1) & ~PAGE_MASK) + 1; 58 | if (nr <= offset) { 59 | goto out; 60 | } 61 | } 62 | nr = nr - offset; 63 | if (nr > len - copied) 64 | nr = len - copied; 65 | 66 | error = pmfs_get_xip_mem(mapping, index, 0, 67 | &xip_mem, &xip_pfn); 68 | if (unlikely(error)) { 69 | if (error == -ENODATA) { 70 | /* sparse */ 71 | zero = 1; 72 | } else 73 | goto out; 74 | } 75 | 76 | /* If users can be writing to this page using arbitrary 77 | * virtual addresses, 
take care about potential aliasing 78 | * before reading the page on the kernel side. 79 | */ 80 | if (mapping_writably_mapped(mapping)) 81 | /* address based flush */ ; 82 | 83 | /* 84 | * Ok, we have the mem, so now we can copy it to user space... 85 | * 86 | * The actor routine returns how many bytes were actually used.. 87 | * NOTE! This may not be the same as how much of a user buffer 88 | * we filled up (we may be padding etc), so we can only update 89 | * "pos" here (the actor routine has to update the user buffer 90 | * pointers and the remaining count). 91 | */ 92 | PMFS_START_TIMING(memcpy_r_t, memcpy_time); 93 | if (!zero) 94 | left = __copy_to_user(buf+copied, xip_mem+offset, nr); 95 | else 96 | left = __clear_user(buf + copied, nr); 97 | PMFS_END_TIMING(memcpy_r_t, memcpy_time); 98 | 99 | if (left) { 100 | error = -EFAULT; 101 | goto out; 102 | } 103 | 104 | copied += (nr - left); 105 | offset += (nr - left); 106 | index += offset >> PAGE_SHIFT; 107 | offset &= ~PAGE_MASK; 108 | } while (copied < len); 109 | 110 | out: 111 | *ppos = pos + copied; 112 | if (filp) 113 | file_accessed(filp); 114 | 115 | return (copied ? copied : error); 116 | } 117 | 118 | ssize_t 119 | xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) 120 | { 121 | if (!access_ok(VERIFY_WRITE, buf, len)) 122 | return -EFAULT; 123 | 124 | return do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp, 125 | buf, len, ppos); 126 | } 127 | 128 | /* 129 | * Wrappers. We need to use the rcu read lock to avoid 130 | * concurrent truncate operation. No problem for write because we held 131 | * i_mutex. 
132 | */ 133 | ssize_t pmfs_xip_file_read(struct file *filp, char __user *buf, 134 | size_t len, loff_t *ppos) 135 | { 136 | ssize_t res; 137 | timing_t xip_read_time; 138 | 139 | PMFS_START_TIMING(xip_read_t, xip_read_time); 140 | // rcu_read_lock(); 141 | res = xip_file_read(filp, buf, len, ppos); 142 | // rcu_read_unlock(); 143 | PMFS_END_TIMING(xip_read_t, xip_read_time); 144 | return res; 145 | } 146 | 147 | static inline void pmfs_flush_edge_cachelines(loff_t pos, ssize_t len, 148 | void *start_addr) 149 | { 150 | if (unlikely(pos & 0x7)) 151 | pmfs_flush_buffer(start_addr, 1, false); 152 | if (unlikely(((pos + len) & 0x7) && ((pos & (CACHELINE_SIZE - 1)) != 153 | ((pos + len) & (CACHELINE_SIZE - 1))))) 154 | pmfs_flush_buffer(start_addr + len, 1, false); 155 | } 156 | 157 | static inline size_t memcpy_to_nvmm(char *kmem, loff_t offset, 158 | const char __user *buf, size_t bytes) 159 | { 160 | size_t copied; 161 | 162 | if (support_clwb) { 163 | copied = bytes - __copy_from_user(kmem + offset, buf, bytes); 164 | pmfs_flush_buffer(kmem + offset, copied, 0); 165 | } else { 166 | copied = bytes - __copy_from_user_inatomic_nocache(kmem + 167 | offset, buf, bytes); 168 | } 169 | 170 | return copied; 171 | } 172 | 173 | static ssize_t 174 | __pmfs_xip_file_write(struct address_space *mapping, const char __user *buf, 175 | size_t count, loff_t pos, loff_t *ppos) 176 | { 177 | struct inode *inode = mapping->host; 178 | struct super_block *sb = inode->i_sb; 179 | long status = 0; 180 | size_t bytes; 181 | ssize_t written = 0; 182 | struct pmfs_inode *pi; 183 | timing_t memcpy_time, write_time; 184 | 185 | PMFS_START_TIMING(internal_write_t, write_time); 186 | pi = pmfs_get_inode(sb, inode->i_ino); 187 | do { 188 | unsigned long index; 189 | unsigned long offset; 190 | size_t copied; 191 | void *xmem; 192 | unsigned long xpfn; 193 | 194 | offset = (pos & (sb->s_blocksize - 1)); /* Within page */ 195 | index = pos >> sb->s_blocksize_bits; 196 | bytes = sb->s_blocksize - 
offset; 197 | if (bytes > count) 198 | bytes = count; 199 | 200 | status = pmfs_get_xip_mem(mapping, index, 1, &xmem, &xpfn); 201 | if (status) 202 | break; 203 | 204 | PMFS_START_TIMING(memcpy_w_t, memcpy_time); 205 | pmfs_xip_mem_protect(sb, xmem + offset, bytes, 1); 206 | copied = memcpy_to_nvmm((char *)xmem, offset, buf, bytes); 207 | pmfs_xip_mem_protect(sb, xmem + offset, bytes, 0); 208 | PMFS_END_TIMING(memcpy_w_t, memcpy_time); 209 | 210 | /* if start or end dest address is not 8 byte aligned, 211 | * __copy_from_user_inatomic_nocache uses cacheable instructions 212 | * (instead of movnti) to write. So flush those cachelines. */ 213 | pmfs_flush_edge_cachelines(pos, copied, xmem + offset); 214 | 215 | if (likely(copied > 0)) { 216 | status = copied; 217 | 218 | if (status >= 0) { 219 | written += status; 220 | count -= status; 221 | pos += status; 222 | buf += status; 223 | } 224 | } 225 | if (unlikely(copied != bytes)) 226 | if (status >= 0) 227 | status = -EFAULT; 228 | if (status < 0) 229 | break; 230 | } while (count); 231 | *ppos = pos; 232 | /* 233 | * No need to use i_size_read() here, the i_size 234 | * cannot change under us because we hold i_mutex. 235 | */ 236 | if (pos > inode->i_size) { 237 | i_size_write(inode, pos); 238 | pmfs_update_isize(inode, pi); 239 | } 240 | 241 | PMFS_END_TIMING(internal_write_t, write_time); 242 | return written ? written : status; 243 | } 244 | 245 | /* optimized path for file write that doesn't require a transaction. In this 246 | * path we don't need to allocate any new data blocks. 
So the only meta-data 247 | * modified in path is inode's i_size, i_ctime, and i_mtime fields */ 248 | static ssize_t pmfs_file_write_fast(struct super_block *sb, struct inode *inode, 249 | struct pmfs_inode *pi, const char __user *buf, size_t count, loff_t pos, 250 | loff_t *ppos, u64 block) 251 | { 252 | void *xmem = pmfs_get_block(sb, block); 253 | size_t copied, ret = 0, offset; 254 | timing_t memcpy_time; 255 | 256 | offset = pos & (sb->s_blocksize - 1); 257 | 258 | PMFS_START_TIMING(memcpy_w_t, memcpy_time); 259 | pmfs_xip_mem_protect(sb, xmem + offset, count, 1); 260 | copied = memcpy_to_nvmm((char *)xmem, offset, buf, count); 261 | pmfs_xip_mem_protect(sb, xmem + offset, count, 0); 262 | PMFS_END_TIMING(memcpy_w_t, memcpy_time); 263 | 264 | pmfs_flush_edge_cachelines(pos, copied, xmem + offset); 265 | 266 | if (likely(copied > 0)) { 267 | pos += copied; 268 | ret = copied; 269 | } 270 | if (unlikely(copied != count && copied == 0)) 271 | ret = -EFAULT; 272 | *ppos = pos; 273 | inode->i_ctime = inode->i_mtime = current_time(inode); 274 | if (pos > inode->i_size) { 275 | /* make sure written data is persistent before updating 276 | * time and size */ 277 | PERSISTENT_MARK(); 278 | i_size_write(inode, pos); 279 | PERSISTENT_BARRIER(); 280 | pmfs_memunlock_inode(sb, pi); 281 | pmfs_update_time_and_size(inode, pi); 282 | pmfs_memlock_inode(sb, pi); 283 | } else { 284 | u64 c_m_time; 285 | /* update c_time and m_time atomically. We don't need to make the data 286 | * persistent because the expectation is that the close() or an explicit 287 | * fsync will do that. 
*/ 288 | c_m_time = (inode->i_ctime.tv_sec & 0xFFFFFFFF); 289 | c_m_time = c_m_time | (c_m_time << 32); 290 | pmfs_memunlock_inode(sb, pi); 291 | pmfs_memcpy_atomic(&pi->i_ctime, &c_m_time, 8); 292 | pmfs_memlock_inode(sb, pi); 293 | } 294 | pmfs_flush_buffer(pi, 1, false); 295 | return ret; 296 | } 297 | 298 | /* 299 | * blk_off is used in different ways depending on whether the edge block is 300 | * at the beginning or end of the write. If it is at the beginning, we zero from 301 | * start-of-block to 'blk_off'. If it is the end block, we zero from 'blk_off' to 302 | * end-of-block 303 | */ 304 | static inline void pmfs_clear_edge_blk (struct super_block *sb, struct 305 | pmfs_inode *pi, bool new_blk, unsigned long block, size_t blk_off, 306 | bool is_end_blk) 307 | { 308 | void *ptr; 309 | size_t count; 310 | unsigned long blknr; 311 | 312 | if (new_blk) { 313 | blknr = block >> (pmfs_inode_blk_shift(pi) - 314 | sb->s_blocksize_bits); 315 | ptr = pmfs_get_block(sb, __pmfs_find_data_block(sb, pi, blknr)); 316 | if (ptr != NULL) { 317 | if (is_end_blk) { 318 | ptr = ptr + blk_off - (blk_off % 8); 319 | count = pmfs_inode_blk_size(pi) - 320 | blk_off + (blk_off % 8); 321 | } else 322 | count = blk_off + (8 - (blk_off % 8)); 323 | pmfs_memunlock_range(sb, ptr, pmfs_inode_blk_size(pi)); 324 | memset_nt(ptr, 0, count); 325 | pmfs_memlock_range(sb, ptr, pmfs_inode_blk_size(pi)); 326 | } 327 | } 328 | } 329 | 330 | ssize_t pmfs_xip_file_write(struct file *filp, const char __user *buf, 331 | size_t len, loff_t *ppos) 332 | { 333 | struct address_space *mapping = filp->f_mapping; 334 | struct inode *inode = mapping->host; 335 | struct super_block *sb = inode->i_sb; 336 | pmfs_transaction_t *trans; 337 | struct pmfs_inode *pi; 338 | ssize_t written = 0; 339 | loff_t pos; 340 | u64 block; 341 | bool new_sblk = false, new_eblk = false; 342 | size_t count, offset, eblk_offset, ret; 343 | unsigned long start_blk, end_blk, num_blocks, max_logentries; 344 | bool same_block; 345 
| timing_t xip_write_time, xip_write_fast_time; 346 | 347 | PMFS_START_TIMING(xip_write_t, xip_write_time); 348 | 349 | sb_start_write(inode->i_sb); 350 | inode_lock(inode); 351 | 352 | if (!access_ok(VERIFY_READ, buf, len)) { 353 | ret = -EFAULT; 354 | goto out; 355 | } 356 | pos = *ppos; 357 | count = len; 358 | if (count == 0) { 359 | ret = 0; 360 | goto out; 361 | } 362 | 363 | pi = pmfs_get_inode(sb, inode->i_ino); 364 | 365 | offset = pos & (sb->s_blocksize - 1); 366 | num_blocks = ((count + offset - 1) >> sb->s_blocksize_bits) + 1; 367 | /* offset in the actual block size block */ 368 | offset = pos & (pmfs_inode_blk_size(pi) - 1); 369 | start_blk = pos >> sb->s_blocksize_bits; 370 | end_blk = start_blk + num_blocks - 1; 371 | 372 | block = pmfs_find_data_block(inode, start_blk); 373 | 374 | /* Referring to the inode's block size, not 4K */ 375 | same_block = (((count + offset - 1) >> 376 | pmfs_inode_blk_shift(pi)) == 0) ? 1 : 0; 377 | if (block && same_block) { 378 | PMFS_START_TIMING(xip_write_fast_t, xip_write_fast_time); 379 | ret = pmfs_file_write_fast(sb, inode, pi, buf, count, pos, 380 | ppos, block); 381 | PMFS_END_TIMING(xip_write_fast_t, xip_write_fast_time); 382 | goto out; 383 | } 384 | max_logentries = num_blocks / MAX_PTRS_PER_LENTRY + 2; 385 | if (max_logentries > MAX_METABLOCK_LENTRIES) 386 | max_logentries = MAX_METABLOCK_LENTRIES; 387 | 388 | trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES + max_logentries); 389 | if (IS_ERR(trans)) { 390 | ret = PTR_ERR(trans); 391 | goto out; 392 | } 393 | pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA); 394 | 395 | ret = file_remove_privs(filp); 396 | if (ret) { 397 | pmfs_abort_transaction(sb, trans); 398 | goto out; 399 | } 400 | inode->i_ctime = inode->i_mtime = current_time(inode); 401 | pmfs_update_time(inode, pi); 402 | 403 | /* We avoid zeroing the alloc'd range, which is going to be overwritten 404 | * by this system call anyway */ 405 | if (offset != 0) { 406 | if 
(pmfs_find_data_block(inode, start_blk) == 0) 407 | new_sblk = true; 408 | } 409 | 410 | eblk_offset = (pos + count) & (pmfs_inode_blk_size(pi) - 1); 411 | if ((eblk_offset != 0) && 412 | (pmfs_find_data_block(inode, end_blk) == 0)) 413 | new_eblk = true; 414 | 415 | /* don't zero-out the allocated blocks */ 416 | pmfs_alloc_blocks(trans, inode, start_blk, num_blocks, false); 417 | 418 | /* now zero out the edge blocks which will be partially written */ 419 | pmfs_clear_edge_blk(sb, pi, new_sblk, start_blk, offset, false); 420 | pmfs_clear_edge_blk(sb, pi, new_eblk, end_blk, eblk_offset, true); 421 | 422 | written = __pmfs_xip_file_write(mapping, buf, count, pos, ppos); 423 | if (written < 0 || written != count) 424 | pmfs_dbg_verbose("write incomplete/failed: written %ld len %ld" 425 | " pos %llx start_blk %lx num_blocks %lx\n", 426 | written, count, pos, start_blk, num_blocks); 427 | 428 | pmfs_commit_transaction(sb, trans); 429 | ret = written; 430 | out: 431 | inode_unlock(inode); 432 | sb_end_write(inode->i_sb); 433 | PMFS_END_TIMING(xip_write_t, xip_write_time); 434 | return ret; 435 | } 436 | 437 | /* OOM err return with xip file fault handlers doesn't mean anything. 438 | * It would just cause the OS to go an unnecessary killing spree ! 439 | */ 440 | static int __pmfs_xip_file_fault(struct vm_area_struct *vma, 441 | struct vm_fault *vmf) 442 | { 443 | struct address_space *mapping = vma->vm_file->f_mapping; 444 | struct inode *inode = mapping->host; 445 | pgoff_t size; 446 | void *xip_mem; 447 | unsigned long xip_pfn; 448 | int err; 449 | 450 | size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; 451 | if (vmf->pgoff >= size) { 452 | pmfs_dbg("[%s:%d] pgoff >= size(SIGBUS). 
vm_start(0x%lx)," 453 | " vm_end(0x%lx), pgoff(0x%lx), VA(%lx), size 0x%lx\n", 454 | __func__, __LINE__, vma->vm_start, vma->vm_end, 455 | vmf->pgoff, (unsigned long)vmf->address, size); 456 | return VM_FAULT_SIGBUS; 457 | } 458 | 459 | err = pmfs_get_xip_mem(mapping, vmf->pgoff, 1, &xip_mem, &xip_pfn); 460 | if (unlikely(err)) { 461 | pmfs_dbg("[%s:%d] get_xip_mem failed(OOM). vm_start(0x%lx)," 462 | " vm_end(0x%lx), pgoff(0x%lx), VA(%lx)\n", 463 | __func__, __LINE__, vma->vm_start, vma->vm_end, 464 | vmf->pgoff, (unsigned long)vmf->address); 465 | return VM_FAULT_SIGBUS; 466 | } 467 | 468 | pmfs_dbg_mmapv("[%s:%d] vm_start(0x%lx), vm_end(0x%lx), pgoff(0x%lx), " 469 | "BlockSz(0x%lx), VA(0x%lx)->PA(0x%lx)\n", __func__, 470 | __LINE__, vma->vm_start, vma->vm_end, vmf->pgoff, 471 | PAGE_SIZE, (unsigned long)vmf->address, 472 | (unsigned long)xip_pfn << PAGE_SHIFT); 473 | 474 | err = vm_insert_mixed(vma, (unsigned long)vmf->address, 475 | pfn_to_pfn_t(xip_pfn)); 476 | 477 | if (err == -ENOMEM) 478 | return VM_FAULT_SIGBUS; 479 | /* 480 | * err == -EBUSY is fine, we've raced against another thread 481 | * that faulted-in the same page 482 | */ 483 | if (err != -EBUSY) 484 | BUG_ON(err); 485 | return VM_FAULT_NOPAGE; 486 | } 487 | 488 | static int pmfs_xip_file_fault(struct vm_fault *vmf) 489 | { 490 | int ret = 0; 491 | timing_t fault_time; 492 | 493 | PMFS_START_TIMING(mmap_fault_t, fault_time); 494 | rcu_read_lock(); 495 | ret = __pmfs_xip_file_fault(vmf->vma, vmf); 496 | rcu_read_unlock(); 497 | PMFS_END_TIMING(mmap_fault_t, fault_time); 498 | return ret; 499 | } 500 | 501 | static int pmfs_find_and_alloc_blocks(struct inode *inode, sector_t iblock, 502 | sector_t *data_block, int create) 503 | { 504 | int err = -EIO; 505 | u64 block; 506 | pmfs_transaction_t *trans; 507 | struct pmfs_inode *pi; 508 | 509 | block = pmfs_find_data_block(inode, iblock); 510 | 511 | if (!block) { 512 | struct super_block *sb = inode->i_sb; 513 | if (!create) { 514 | err = -ENODATA; 
515 | goto err; 516 | } 517 | 518 | pi = pmfs_get_inode(sb, inode->i_ino); 519 | trans = pmfs_current_transaction(); 520 | if (trans) { 521 | err = pmfs_alloc_blocks(trans, inode, iblock, 1, true); 522 | if (err) { 523 | pmfs_dbg_verbose("[%s:%d] Alloc failed!\n", 524 | __func__, __LINE__); 525 | goto err; 526 | } 527 | } else { 528 | /* 1 lentry for inode, 1 lentry for inode's b-tree */ 529 | trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES); 530 | if (IS_ERR(trans)) { 531 | err = PTR_ERR(trans); 532 | goto err; 533 | } 534 | 535 | rcu_read_unlock(); 536 | inode_lock(inode); 537 | 538 | pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, 539 | LE_DATA); 540 | err = pmfs_alloc_blocks(trans, inode, iblock, 1, true); 541 | 542 | pmfs_commit_transaction(sb, trans); 543 | 544 | inode_unlock(inode); 545 | rcu_read_lock(); 546 | if (err) { 547 | pmfs_dbg_verbose("[%s:%d] Alloc failed!\n", 548 | __func__, __LINE__); 549 | goto err; 550 | } 551 | } 552 | block = pmfs_find_data_block(inode, iblock); 553 | if (!block) { 554 | pmfs_dbg("[%s:%d] But alloc didn't fail!\n", 555 | __func__, __LINE__); 556 | err = -ENODATA; 557 | goto err; 558 | } 559 | } 560 | pmfs_dbg_mmapvv("iblock 0x%lx allocated_block 0x%llx\n", iblock, 561 | block); 562 | 563 | *data_block = block; 564 | err = 0; 565 | 566 | err: 567 | return err; 568 | } 569 | 570 | static inline int __pmfs_get_block(struct inode *inode, pgoff_t pgoff, 571 | int create, sector_t *result) 572 | { 573 | int rc = 0; 574 | 575 | rc = pmfs_find_and_alloc_blocks(inode, (sector_t)pgoff, result, 576 | create); 577 | return rc; 578 | } 579 | 580 | int pmfs_get_xip_mem(struct address_space *mapping, pgoff_t pgoff, int create, 581 | void **kmem, unsigned long *pfn) 582 | { 583 | int rc; 584 | sector_t block = 0; 585 | struct inode *inode = mapping->host; 586 | 587 | rc = __pmfs_get_block(inode, pgoff, create, &block); 588 | if (rc) { 589 | pmfs_dbg1("[%s:%d] rc(%d), sb->physaddr(0x%llx), block(0x%llx)," 590 | " pgoff(0x%lx), 
flag(0x%x), PFN(0x%lx)\n", __func__, 591 | __LINE__, rc, PMFS_SB(inode->i_sb)->phys_addr, 592 | block, pgoff, create, *pfn); 593 | return rc; 594 | } 595 | 596 | *kmem = pmfs_get_block(inode->i_sb, block); 597 | *pfn = pmfs_get_pfn(inode->i_sb, block); 598 | 599 | pmfs_dbg_mmapvv("[%s:%d] sb->physaddr(0x%llx), block(0x%lx)," 600 | " pgoff(0x%lx), flag(0x%x), PFN(0x%lx)\n", __func__, __LINE__, 601 | PMFS_SB(inode->i_sb)->phys_addr, block, pgoff, create, *pfn); 602 | return 0; 603 | } 604 | 605 | static const struct vm_operations_struct pmfs_xip_vm_ops = { 606 | .fault = pmfs_xip_file_fault, 607 | }; 608 | 609 | int pmfs_xip_file_mmap(struct file *file, struct vm_area_struct *vma) 610 | { 611 | // BUG_ON(!file->f_mapping->a_ops->get_xip_mem); 612 | 613 | file_accessed(file); 614 | 615 | vma->vm_flags |= VM_MIXEDMAP; 616 | 617 | vma->vm_ops = &pmfs_xip_vm_ops; 618 | pmfs_dbg_mmap4k("[%s:%d] MMAP 4KPAGE vm_start(0x%lx)," 619 | " vm_end(0x%lx), vm_flags(0x%lx), " 620 | "vm_page_prot(0x%lx)\n", __func__, 621 | __LINE__, vma->vm_start, vma->vm_end, 622 | vma->vm_flags, pgprot_val(vma->vm_page_prot)); 623 | 624 | return 0; 625 | } 626 | -------------------------------------------------------------------------------- /pmfs.h: -------------------------------------------------------------------------------- 1 | /* 2 | * BRIEF DESCRIPTION 3 | * 4 | * Definitions for the PMFS filesystem. 5 | * 6 | * Copyright 2012-2013 Intel Corporation 7 | * Copyright 2009-2011 Marco Stornelli 8 | * Copyright 2003 Sony Corporation 9 | * Copyright 2003 Matsushita Electric Industrial Co., Ltd. 10 | * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam 11 | * This file is licensed under the terms of the GNU General Public 12 | * License version 2. This program is licensed "as is" without any 13 | * warranty of any kind, whether express or implied. 
14 | */ 15 | #ifndef __PMFS_H 16 | #define __PMFS_H 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | #include "pmfs_def.h" 28 | #include "journal.h" 29 | 30 | #define PAGE_SHIFT_2M 21 31 | #define PAGE_SHIFT_1G 30 32 | 33 | #define PMFS_ASSERT(x) \ 34 | if (!(x)) { \ 35 | printk(KERN_WARNING "assertion failed %s:%d: %s\n", \ 36 | __FILE__, __LINE__, #x); \ 37 | } 38 | 39 | /* 40 | * Debug code 41 | */ 42 | #ifdef pr_fmt 43 | #undef pr_fmt 44 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 45 | #endif 46 | 47 | /* #define pmfs_dbg(s, args...) pr_debug(s, ## args) */ 48 | #define pmfs_dbg(s, args ...) pr_info(s, ## args) 49 | #define pmfs_dbg1(s, args ...) 50 | #define pmfs_err(sb, s, args ...) pmfs_error_mng(sb, s, ## args) 51 | #define pmfs_warn(s, args ...) pr_warning(s, ## args) 52 | #define pmfs_info(s, args ...) pr_info(s, ## args) 53 | 54 | extern unsigned int pmfs_dbgmask; 55 | #define PMFS_DBGMASK_MMAPHUGE (0x00000001) 56 | #define PMFS_DBGMASK_MMAP4K (0x00000002) 57 | #define PMFS_DBGMASK_MMAPVERBOSE (0x00000004) 58 | #define PMFS_DBGMASK_MMAPVVERBOSE (0x00000008) 59 | #define PMFS_DBGMASK_VERBOSE (0x00000010) 60 | #define PMFS_DBGMASK_TRANSACTION (0x00000020) 61 | 62 | #define pmfs_dbg_mmaphuge(s, args ...) \ 63 | ((pmfs_dbgmask & PMFS_DBGMASK_MMAPHUGE) ? pmfs_dbg(s, args) : 0) 64 | #define pmfs_dbg_mmap4k(s, args ...) \ 65 | ((pmfs_dbgmask & PMFS_DBGMASK_MMAP4K) ? pmfs_dbg(s, args) : 0) 66 | #define pmfs_dbg_mmapv(s, args ...) \ 67 | ((pmfs_dbgmask & PMFS_DBGMASK_MMAPVERBOSE) ? pmfs_dbg(s, args) : 0) 68 | #define pmfs_dbg_mmapvv(s, args ...) \ 69 | ((pmfs_dbgmask & PMFS_DBGMASK_MMAPVVERBOSE) ? pmfs_dbg(s, args) : 0) 70 | 71 | #define pmfs_dbg_verbose(s, args ...) \ 72 | ((pmfs_dbgmask & PMFS_DBGMASK_VERBOSE) ? pmfs_dbg(s, ##args) : 0) 73 | #define pmfs_dbg_trans(s, args ...) \ 74 | ((pmfs_dbgmask & PMFS_DBGMASK_TRANSACTION) ? 
pmfs_dbg(s, ##args) : 0) 75 | 76 | #define pmfs_set_bit __test_and_set_bit_le 77 | #define pmfs_clear_bit __test_and_clear_bit_le 78 | #define pmfs_find_next_zero_bit find_next_zero_bit_le 79 | 80 | #define clear_opt(o, opt) (o &= ~PMFS_MOUNT_ ## opt) 81 | #define set_opt(o, opt) (o |= PMFS_MOUNT_ ## opt) 82 | #define test_opt(sb, opt) (PMFS_SB(sb)->s_mount_opt & PMFS_MOUNT_ ## opt) 83 | 84 | #define PMFS_LARGE_INODE_TABLE_SIZE (0x200000) 85 | /* PMFS size threshold for using 2M blocks for inode table */ 86 | #define PMFS_LARGE_INODE_TABLE_THREASHOLD (0x20000000) 87 | /* 88 | * pmfs inode flags 89 | * 90 | * PMFS_EOFBLOCKS_FL There are blocks allocated beyond eof 91 | */ 92 | #define PMFS_EOFBLOCKS_FL 0x20000000 93 | /* Flags that should be inherited by new inodes from their parent. */ 94 | #define PMFS_FL_INHERITED (FS_SECRM_FL | FS_UNRM_FL | FS_COMPR_FL | \ 95 | FS_SYNC_FL | FS_NODUMP_FL | FS_NOATIME_FL | \ 96 | FS_COMPRBLK_FL | FS_NOCOMP_FL | FS_JOURNAL_DATA_FL | \ 97 | FS_NOTAIL_FL | FS_DIRSYNC_FL) 98 | /* Flags that are appropriate for regular files (all but dir-specific ones). */ 99 | #define PMFS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) 100 | /* Flags that are appropriate for non-directories/regular files. 
*/ 101 | #define PMFS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) 102 | #define PMFS_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | PMFS_EOFBLOCKS_FL) 103 | 104 | #define INODES_PER_BLOCK(bt) (1 << (blk_type_to_shift[bt] - PMFS_INODE_BITS)) 105 | 106 | extern unsigned int blk_type_to_shift[PMFS_BLOCK_TYPE_MAX]; 107 | extern unsigned int blk_type_to_size[PMFS_BLOCK_TYPE_MAX]; 108 | 109 | /* ======================= Timing ========================= */ 110 | enum timing_category { 111 | create_t, 112 | unlink_t, 113 | readdir_t, 114 | xip_read_t, 115 | xip_write_t, 116 | xip_write_fast_t, 117 | internal_write_t, 118 | memcpy_r_t, 119 | memcpy_w_t, 120 | alloc_blocks_t, 121 | new_trans_t, 122 | add_log_t, 123 | commit_trans_t, 124 | mmap_fault_t, 125 | fsync_t, 126 | free_tree_t, 127 | evict_inode_t, 128 | recovery_t, 129 | TIMING_NUM, 130 | }; 131 | 132 | extern const char *Timingstring[TIMING_NUM]; 133 | extern unsigned long long Timingstats[TIMING_NUM]; 134 | extern u64 Countstats[TIMING_NUM]; 135 | 136 | extern int measure_timing; 137 | extern int support_clwb; 138 | 139 | extern atomic64_t fsync_pages; 140 | 141 | typedef struct timespec timing_t; 142 | 143 | #define PMFS_START_TIMING(name, start) \ 144 | {if (measure_timing) getrawmonotonic(&start);} 145 | 146 | #define PMFS_END_TIMING(name, start) \ 147 | {if (measure_timing) { \ 148 | timing_t end; \ 149 | getrawmonotonic(&end); \ 150 | Timingstats[name] += \ 151 | (end.tv_sec - start.tv_sec) * 1000000000 + \ 152 | (end.tv_nsec - start.tv_nsec); \ 153 | } \ 154 | Countstats[name]++; \ 155 | } 156 | 157 | /* Function Prototypes */ 158 | extern void pmfs_error_mng(struct super_block *sb, const char *fmt, ...); 159 | 160 | /* file.c */ 161 | extern int pmfs_mmap(struct file *file, struct vm_area_struct *vma); 162 | 163 | /* balloc.c */ 164 | int pmfs_setup_blocknode_map(struct super_block *sb); 165 | extern struct pmfs_blocknode *pmfs_alloc_blocknode(struct super_block *sb); 166 | extern void pmfs_free_blocknode(struct 
super_block *sb, struct pmfs_blocknode *bnode); 167 | extern void pmfs_init_blockmap(struct super_block *sb, 168 | unsigned long init_used_size); 169 | extern void pmfs_free_block(struct super_block *sb, unsigned long blocknr, 170 | unsigned short btype); 171 | extern void __pmfs_free_block(struct super_block *sb, unsigned long blocknr, 172 | unsigned short btype, struct pmfs_blocknode **start_hint); 173 | extern int pmfs_new_block(struct super_block *sb, unsigned long *blocknr, 174 | unsigned short btype, int zero); 175 | extern unsigned long pmfs_count_free_blocks(struct super_block *sb); 176 | 177 | /* dir.c */ 178 | extern int pmfs_add_entry(pmfs_transaction_t *trans, 179 | struct dentry *dentry, struct inode *inode); 180 | extern int pmfs_remove_entry(pmfs_transaction_t *trans, 181 | struct dentry *dentry, struct inode *inode); 182 | 183 | /* namei.c */ 184 | extern struct dentry *pmfs_get_parent(struct dentry *child); 185 | 186 | /* inode.c */ 187 | extern unsigned int pmfs_free_inode_subtree(struct super_block *sb, 188 | __le64 root, u32 height, u32 btype, unsigned long last_blocknr); 189 | extern int __pmfs_alloc_blocks(pmfs_transaction_t *trans, 190 | struct super_block *sb, struct pmfs_inode *pi, 191 | unsigned long file_blocknr, unsigned int num, bool zero); 192 | extern int pmfs_init_inode_table(struct super_block *sb); 193 | extern int pmfs_alloc_blocks(pmfs_transaction_t *trans, struct inode *inode, 194 | unsigned long file_blocknr, unsigned int num, bool zero); 195 | extern u64 pmfs_find_data_block(struct inode *inode, 196 | unsigned long file_blocknr); 197 | int pmfs_set_blocksize_hint(struct super_block *sb, struct pmfs_inode *pi, 198 | loff_t new_size); 199 | void pmfs_setsize(struct inode *inode, loff_t newsize); 200 | 201 | extern struct inode *pmfs_iget(struct super_block *sb, unsigned long ino); 202 | extern void pmfs_put_inode(struct inode *inode); 203 | extern void pmfs_evict_inode(struct inode *inode); 204 | extern struct inode 
*pmfs_new_inode(pmfs_transaction_t *trans, 205 | struct inode *dir, umode_t mode, const struct qstr *qstr); 206 | extern void pmfs_update_isize(struct inode *inode, struct pmfs_inode *pi); 207 | extern void pmfs_update_nlink(struct inode *inode, struct pmfs_inode *pi); 208 | extern void pmfs_update_time(struct inode *inode, struct pmfs_inode *pi); 209 | extern int pmfs_write_inode(struct inode *inode, struct writeback_control *wbc); 210 | extern void pmfs_dirty_inode(struct inode *inode, int flags); 211 | extern int pmfs_notify_change(struct dentry *dentry, struct iattr *attr); 212 | int pmfs_getattr(const struct path *path, struct kstat *stat, 213 | u32 request_mask, unsigned int flags); 214 | extern void pmfs_set_inode_flags(struct inode *inode, struct pmfs_inode *pi); 215 | extern void pmfs_get_inode_flags(struct inode *inode, struct pmfs_inode *pi); 216 | extern unsigned long pmfs_find_region(struct inode *inode, loff_t *offset, 217 | int hole); 218 | extern void pmfs_truncate_del(struct inode *inode); 219 | extern void pmfs_truncate_add(struct inode *inode, u64 truncate_size); 220 | 221 | /* ioctl.c */ 222 | extern long pmfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); 223 | #ifdef CONFIG_COMPAT 224 | extern long pmfs_compat_ioctl(struct file *file, unsigned int cmd, 225 | unsigned long arg); 226 | #endif 227 | 228 | /* super.c */ 229 | #ifdef CONFIG_PMFS_TEST 230 | extern struct pmfs_super_block *get_pmfs_super(void); 231 | #endif 232 | extern void __pmfs_free_blocknode(struct pmfs_blocknode *bnode); 233 | extern struct super_block *pmfs_read_super(struct super_block *sb, void *data, 234 | int silent); 235 | extern int pmfs_statfs(struct dentry *d, struct kstatfs *buf); 236 | extern int pmfs_remount(struct super_block *sb, int *flags, char *data); 237 | 238 | /* symlink.c */ 239 | extern int pmfs_block_symlink(struct inode *inode, const char *symname, 240 | int len); 241 | 242 | /* Inline functions start here */ 243 | 244 | /* Mask out flags 
that are inappropriate for the given type of inode. */ 245 | static inline __le32 pmfs_mask_flags(umode_t mode, __le32 flags) 246 | { 247 | flags &= cpu_to_le32(PMFS_FL_INHERITED); 248 | if (S_ISDIR(mode)) 249 | return flags; 250 | else if (S_ISREG(mode)) 251 | return flags & cpu_to_le32(PMFS_REG_FLMASK); 252 | else 253 | return flags & cpu_to_le32(PMFS_OTHER_FLMASK); 254 | } 255 | 256 | static inline int pmfs_calc_checksum(u8 *data, int n) 257 | { 258 | u16 crc = 0; 259 | 260 | crc = crc16(~0, (__u8 *)data + sizeof(__le16), n - sizeof(__le16)); 261 | if (*((__le16 *)data) == cpu_to_le16(crc)) 262 | return 0; 263 | else 264 | return 1; 265 | } 266 | 267 | struct pmfs_blocknode_lowhigh { 268 | __le64 block_low; 269 | __le64 block_high; 270 | }; 271 | 272 | struct pmfs_blocknode { 273 | struct list_head link; 274 | unsigned long block_low; 275 | unsigned long block_high; 276 | }; 277 | 278 | struct pmfs_inode_info { 279 | __u32 i_dir_start_lookup; 280 | struct list_head i_truncated; 281 | struct inode vfs_inode; 282 | }; 283 | 284 | /* 285 | * PMFS super-block data in memory 286 | */ 287 | struct pmfs_sb_info { 288 | /* 289 | * base physical and virtual address of PMFS (which is also 290 | * the pointer to the super block) 291 | */ 292 | struct block_device *s_bdev; 293 | phys_addr_t phys_addr; 294 | void *virt_addr; 295 | struct list_head block_inuse_head; 296 | unsigned long block_start; 297 | unsigned long block_end; 298 | unsigned long num_free_blocks; 299 | struct mutex s_lock; /* protects the SB's buffer-head */ 300 | 301 | /* 302 | * Backing store option: 303 | * 1 = no load, 2 = no store, 304 | * else do both 305 | */ 306 | unsigned int pmfs_backing_option; 307 | 308 | /* Mount options */ 309 | unsigned long bpi; 310 | unsigned long num_inodes; 311 | unsigned long blocksize; 312 | unsigned long initsize; 313 | unsigned long s_mount_opt; 314 | kuid_t uid; /* Mount uid for root directory */ 315 | kgid_t gid; /* Mount gid for root directory */ 316 | umode_t 
mode; /* Mount mode for root directory */ 317 | atomic_t next_generation; 318 | /* inode tracking */ 319 | struct mutex inode_table_mutex; 320 | unsigned int s_inodes_count; /* total inodes count (used or free) */ 321 | unsigned int s_free_inodes_count; /* free inodes count */ 322 | unsigned int s_inodes_used_count; 323 | unsigned int s_free_inode_hint; 324 | 325 | unsigned long num_blocknode_allocated; 326 | 327 | /* Journaling related structures */ 328 | uint32_t next_transaction_id; 329 | uint32_t jsize; 330 | void *journal_base_addr; 331 | struct mutex journal_mutex; 332 | struct task_struct *log_cleaner_thread; 333 | wait_queue_head_t log_cleaner_wait; 334 | bool redo_log; 335 | 336 | /* truncate list related structures */ 337 | struct list_head s_truncate; 338 | struct mutex s_truncate_lock; 339 | }; 340 | 341 | static inline struct pmfs_sb_info *PMFS_SB(struct super_block *sb) 342 | { 343 | return sb->s_fs_info; 344 | } 345 | 346 | static inline struct pmfs_inode_info *PMFS_I(struct inode *inode) 347 | { 348 | return container_of(inode, struct pmfs_inode_info, vfs_inode); 349 | } 350 | 351 | /* If this is part of a read-modify-write of the super block, 352 | * pmfs_memunlock_super() before calling! 
*/ 353 | static inline struct pmfs_super_block *pmfs_get_super(struct super_block *sb) 354 | { 355 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 356 | 357 | return (struct pmfs_super_block *)sbi->virt_addr; 358 | } 359 | 360 | static inline pmfs_journal_t *pmfs_get_journal(struct super_block *sb) 361 | { 362 | struct pmfs_super_block *ps = pmfs_get_super(sb); 363 | 364 | return (pmfs_journal_t *)((char *)ps + 365 | le64_to_cpu(ps->s_journal_offset)); 366 | } 367 | 368 | static inline struct pmfs_inode *pmfs_get_inode_table(struct super_block *sb) 369 | { 370 | struct pmfs_super_block *ps = pmfs_get_super(sb); 371 | 372 | return (struct pmfs_inode *)((char *)ps + 373 | le64_to_cpu(ps->s_inode_table_offset)); 374 | } 375 | 376 | static inline struct pmfs_super_block *pmfs_get_redund_super(struct super_block *sb) 377 | { 378 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 379 | 380 | return (struct pmfs_super_block *)(sbi->virt_addr + PMFS_SB_SIZE); 381 | } 382 | 383 | /* If this is part of a read-modify-write of the block, 384 | * pmfs_memunlock_block() before calling! */ 385 | static inline void *pmfs_get_block(struct super_block *sb, u64 block) 386 | { 387 | struct pmfs_super_block *ps = pmfs_get_super(sb); 388 | 389 | return block ? 
((void *)ps + block) : NULL; 390 | } 391 | 392 | /* uses CPU instructions to atomically write up to 8 bytes */ 393 | static inline void pmfs_memcpy_atomic (void *dst, const void *src, u8 size) 394 | { 395 | switch (size) { 396 | case 1: { 397 | volatile u8 *daddr = dst; 398 | const u8 *saddr = src; 399 | *daddr = *saddr; 400 | break; 401 | } 402 | case 2: { 403 | volatile __le16 *daddr = dst; 404 | const u16 *saddr = src; 405 | *daddr = cpu_to_le16(*saddr); 406 | break; 407 | } 408 | case 4: { 409 | volatile __le32 *daddr = dst; 410 | const u32 *saddr = src; 411 | *daddr = cpu_to_le32(*saddr); 412 | break; 413 | } 414 | case 8: { 415 | volatile __le64 *daddr = dst; 416 | const u64 *saddr = src; 417 | *daddr = cpu_to_le64(*saddr); 418 | break; 419 | } 420 | default: 421 | pmfs_dbg("error: memcpy_atomic called with %d bytes\n", size); 422 | //BUG(); 423 | } 424 | } 425 | 426 | static inline void pmfs_update_time_and_size(struct inode *inode, 427 | struct pmfs_inode *pi) 428 | { 429 | __le32 words[2]; 430 | __le64 new_pi_size = cpu_to_le64(i_size_read(inode)); 431 | 432 | /* pi->i_size, pi->i_ctime, and pi->i_mtime need to be atomically updated. 433 | * So use cmpxchg16b here. */ 434 | words[0] = cpu_to_le32(inode->i_ctime.tv_sec); 435 | words[1] = cpu_to_le32(inode->i_mtime.tv_sec); 436 | /* TODO: the following function assumes cmpxchg16b instruction writes 437 | * 16 bytes atomically. Confirm if it is really true. 
*/ 438 | cmpxchg_double_local(&pi->i_size, (u64 *)&pi->i_ctime, pi->i_size, 439 | *(u64 *)&pi->i_ctime, new_pi_size, *(u64 *)words); 440 | } 441 | 442 | /* assumes the length to be 4-byte aligned */ 443 | static inline void memset_nt(void *dest, uint32_t dword, size_t length) 444 | { 445 | uint64_t dummy1, dummy2; 446 | uint64_t qword = ((uint64_t)dword << 32) | dword; 447 | 448 | asm volatile ("movl %%edx,%%ecx\n" 449 | "andl $63,%%edx\n" 450 | "shrl $6,%%ecx\n" 451 | "jz 9f\n" 452 | "1: movnti %%rax,(%%rdi)\n" 453 | "2: movnti %%rax,1*8(%%rdi)\n" 454 | "3: movnti %%rax,2*8(%%rdi)\n" 455 | "4: movnti %%rax,3*8(%%rdi)\n" 456 | "5: movnti %%rax,4*8(%%rdi)\n" 457 | "8: movnti %%rax,5*8(%%rdi)\n" 458 | "7: movnti %%rax,6*8(%%rdi)\n" 459 | "8: movnti %%rax,7*8(%%rdi)\n" 460 | "leaq 64(%%rdi),%%rdi\n" 461 | "decl %%ecx\n" 462 | "jnz 1b\n" 463 | "9: movl %%edx,%%ecx\n" 464 | "andl $7,%%edx\n" 465 | "shrl $3,%%ecx\n" 466 | "jz 11f\n" 467 | "10: movnti %%rax,(%%rdi)\n" 468 | "leaq 8(%%rdi),%%rdi\n" 469 | "decl %%ecx\n" 470 | "jnz 10b\n" 471 | "11: movl %%edx,%%ecx\n" 472 | "shrl $2,%%ecx\n" 473 | "jz 12f\n" 474 | "movnti %%eax,(%%rdi)\n" 475 | "12:\n" 476 | : "=D"(dummy1), "=d" (dummy2) : "D" (dest), "a" (qword), "d" (length) : "memory", "rcx"); 477 | } 478 | 479 | static inline u64 __pmfs_find_data_block(struct super_block *sb, 480 | struct pmfs_inode *pi, unsigned long blocknr) 481 | { 482 | __le64 *level_ptr; 483 | u64 bp = 0; 484 | u32 height, bit_shift; 485 | unsigned int idx; 486 | 487 | height = pi->height; 488 | bp = le64_to_cpu(pi->root); 489 | 490 | while (height > 0) { 491 | level_ptr = pmfs_get_block(sb, bp); 492 | bit_shift = (height - 1) * META_BLK_SHIFT; 493 | idx = blocknr >> bit_shift; 494 | bp = le64_to_cpu(level_ptr[idx]); 495 | if (bp == 0) 496 | return 0; 497 | blocknr = blocknr & ((1 << bit_shift) - 1); 498 | height--; 499 | } 500 | return bp; 501 | } 502 | 503 | static inline unsigned int pmfs_inode_blk_shift (struct pmfs_inode *pi) 504 | { 505 | 
return blk_type_to_shift[pi->i_blk_type]; 506 | } 507 | 508 | static inline uint32_t pmfs_inode_blk_size (struct pmfs_inode *pi) 509 | { 510 | return blk_type_to_size[pi->i_blk_type]; 511 | } 512 | 513 | /* If this is part of a read-modify-write of the inode metadata, 514 | * pmfs_memunlock_inode() before calling! */ 515 | static inline struct pmfs_inode *pmfs_get_inode(struct super_block *sb, 516 | u64 ino) 517 | { 518 | struct pmfs_super_block *ps = pmfs_get_super(sb); 519 | struct pmfs_inode *inode_table = pmfs_get_inode_table(sb); 520 | u64 bp, block, ino_offset; 521 | 522 | if (ino == 0) 523 | return NULL; 524 | 525 | block = ino >> pmfs_inode_blk_shift(inode_table); 526 | bp = __pmfs_find_data_block(sb, inode_table, block); 527 | 528 | if (bp == 0) 529 | return NULL; 530 | ino_offset = (ino & (pmfs_inode_blk_size(inode_table) - 1)); 531 | return (struct pmfs_inode *)((void *)ps + bp + ino_offset); 532 | } 533 | 534 | static inline u64 535 | pmfs_get_addr_off(struct pmfs_sb_info *sbi, void *addr) 536 | { 537 | PMFS_ASSERT((addr >= sbi->virt_addr) && 538 | (addr < (sbi->virt_addr + sbi->initsize))); 539 | return (u64)(addr - sbi->virt_addr); 540 | } 541 | 542 | static inline u64 543 | pmfs_get_block_off(struct super_block *sb, unsigned long blocknr, 544 | unsigned short btype) 545 | { 546 | return (u64)blocknr << PAGE_SHIFT; 547 | } 548 | 549 | static inline unsigned long 550 | pmfs_get_numblocks(unsigned short btype) 551 | { 552 | unsigned long num_blocks; 553 | 554 | if (btype == PMFS_BLOCK_TYPE_4K) { 555 | num_blocks = 1; 556 | } else if (btype == PMFS_BLOCK_TYPE_2M) { 557 | num_blocks = 512; 558 | } else { 559 | //btype == PMFS_BLOCK_TYPE_1G 560 | num_blocks = 0x40000; 561 | } 562 | return num_blocks; 563 | } 564 | 565 | static inline unsigned long 566 | pmfs_get_blocknr(struct super_block *sb, u64 block, unsigned short btype) 567 | { 568 | return block >> PAGE_SHIFT; 569 | } 570 | 571 | static inline unsigned long pmfs_get_pfn(struct super_block *sb, u64 
block) 572 | { 573 | return (PMFS_SB(sb)->phys_addr + block) >> PAGE_SHIFT; 574 | } 575 | 576 | static inline int pmfs_is_mounting(struct super_block *sb) 577 | { 578 | struct pmfs_sb_info *sbi = (struct pmfs_sb_info *)sb->s_fs_info; 579 | return sbi->s_mount_opt & PMFS_MOUNT_MOUNTING; 580 | } 581 | 582 | static inline struct pmfs_inode_truncate_item * pmfs_get_truncate_item (struct 583 | super_block *sb, u64 ino) 584 | { 585 | struct pmfs_inode *pi = pmfs_get_inode(sb, ino); 586 | return (struct pmfs_inode_truncate_item *)(pi + 1); 587 | } 588 | 589 | static inline struct pmfs_inode_truncate_item * pmfs_get_truncate_list_head ( 590 | struct super_block *sb) 591 | { 592 | struct pmfs_inode *pi = pmfs_get_inode_table(sb); 593 | return (struct pmfs_inode_truncate_item *)(pi + 1); 594 | } 595 | 596 | static inline void check_eof_blocks(struct super_block *sb, 597 | struct pmfs_inode *pi, loff_t size) 598 | { 599 | if ((pi->i_flags & cpu_to_le32(PMFS_EOFBLOCKS_FL)) && 600 | (size + sb->s_blocksize) > (le64_to_cpu(pi->i_blocks) 601 | << sb->s_blocksize_bits)) 602 | pi->i_flags &= cpu_to_le32(~PMFS_EOFBLOCKS_FL); 603 | } 604 | 605 | #include "wprotect.h" 606 | 607 | /* 608 | * Inodes and files operations 609 | */ 610 | 611 | /* dir.c */ 612 | extern const struct file_operations pmfs_dir_operations; 613 | 614 | /* file.c */ 615 | extern const struct inode_operations pmfs_file_inode_operations; 616 | extern const struct file_operations pmfs_xip_file_operations; 617 | int pmfs_fsync(struct file *file, loff_t start, loff_t end, int datasync); 618 | 619 | /* inode.c */ 620 | extern const struct address_space_operations pmfs_aops_xip; 621 | 622 | /* bbuild.c */ 623 | void pmfs_save_blocknode_mappings(struct super_block *sb); 624 | 625 | /* namei.c */ 626 | extern const struct inode_operations pmfs_dir_inode_operations; 627 | extern const struct inode_operations pmfs_special_inode_operations; 628 | 629 | /* symlink.c */ 630 | extern const struct inode_operations 
pmfs_symlink_inode_operations; 631 | 632 | int pmfs_check_integrity(struct super_block *sb, 633 | struct pmfs_super_block *super); 634 | void *pmfs_ioremap(struct super_block *sb, phys_addr_t phys_addr, 635 | ssize_t size); 636 | 637 | int pmfs_check_dir_entry(const char *function, struct inode *dir, 638 | struct pmfs_direntry *de, u8 *base, 639 | unsigned long offset); 640 | 641 | static inline int pmfs_match(int len, const char *const name, 642 | struct pmfs_direntry *de) 643 | { 644 | if (len == de->name_len && de->ino && !memcmp(de->name, name, len)) 645 | return 1; 646 | return 0; 647 | } 648 | 649 | int pmfs_search_dirblock(u8 *blk_base, struct inode *dir, struct qstr *child, 650 | unsigned long offset, 651 | struct pmfs_direntry **res_dir, 652 | struct pmfs_direntry **prev_dir); 653 | 654 | /* pmfs_stats.c */ 655 | #define PMFS_PRINT_TIMING 0xBCD00010 656 | #define PMFS_CLEAR_STATS 0xBCD00011 657 | void pmfs_print_timing_stats(void); 658 | void pmfs_clear_stats(void); 659 | 660 | #endif /* __PMFS_H */ 661 | -------------------------------------------------------------------------------- /namei.c: -------------------------------------------------------------------------------- 1 | /* 2 | * BRIEF DESCRIPTION 3 | * 4 | * Inode operations for directories. 5 | * 6 | * Copyright 2012-2013 Intel Corporation 7 | * Copyright 2009-2011 Marco Stornelli 8 | * Copyright 2003 Sony Corporation 9 | * Copyright 2003 Matsushita Electric Industrial Co., Ltd. 10 | * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam 11 | * This file is licensed under the terms of the GNU General Public 12 | * License version 2. This program is licensed "as is" without any 13 | * warranty of any kind, whether express or implied. 14 | */ 15 | #include 16 | #include 17 | #include "pmfs.h" 18 | #include "xip.h" 19 | 20 | /* 21 | * Couple of helper functions - make the code slightly cleaner. 
22 | */ 23 | static inline void pmfs_inc_count(struct inode *inode, struct pmfs_inode *pi) 24 | { 25 | inc_nlink(inode); 26 | pmfs_update_nlink(inode, pi); 27 | } 28 | 29 | static inline void pmfs_dec_count(struct inode *inode, struct pmfs_inode *pi) 30 | { 31 | if (inode->i_nlink) { 32 | drop_nlink(inode); 33 | pmfs_update_nlink(inode, pi); 34 | } 35 | } 36 | 37 | static inline int pmfs_add_nondir(pmfs_transaction_t *trans, 38 | struct inode *dir, struct dentry *dentry, struct inode *inode) 39 | { 40 | struct pmfs_inode *pi; 41 | int err = pmfs_add_entry(trans, dentry, inode); 42 | 43 | if (!err) { 44 | d_instantiate(dentry, inode); 45 | unlock_new_inode(inode); 46 | return 0; 47 | } 48 | pi = pmfs_get_inode(inode->i_sb, inode->i_ino); 49 | pmfs_dec_count(inode, pi); 50 | unlock_new_inode(inode); 51 | iput(inode); 52 | return err; 53 | } 54 | 55 | static inline struct pmfs_direntry *pmfs_next_entry(struct pmfs_direntry *p) 56 | { 57 | return (struct pmfs_direntry *)((char *)p + le16_to_cpu(p->de_len)); 58 | } 59 | 60 | /* 61 | * Methods themselves. 
62 | */ 63 | int pmfs_check_dir_entry(const char *function, struct inode *dir, 64 | struct pmfs_direntry *de, u8 *base, 65 | unsigned long offset) 66 | { 67 | const char *error_msg = NULL; 68 | const int rlen = le16_to_cpu(de->de_len); 69 | 70 | if (unlikely(rlen < PMFS_DIR_REC_LEN(1))) 71 | error_msg = "de_len is smaller than minimal"; 72 | else if (unlikely(rlen % 4 != 0)) 73 | error_msg = "de_len % 4 != 0"; 74 | else if (unlikely(rlen < PMFS_DIR_REC_LEN(de->name_len))) 75 | error_msg = "de_len is too small for name_len"; 76 | else if (unlikely((((u8 *)de - base) + rlen > dir->i_sb->s_blocksize))) 77 | error_msg = "directory entry across blocks"; 78 | 79 | if (unlikely(error_msg != NULL)) { 80 | pmfs_dbg("bad entry in directory #%lu: %s - " 81 | "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", 82 | dir->i_ino, error_msg, offset, 83 | (unsigned long)le64_to_cpu(de->ino), rlen, 84 | de->name_len); 85 | } 86 | 87 | return error_msg == NULL ? 1 : 0; 88 | } 89 | 90 | /* 91 | * Returns 0 if not found, -1 on failure, and 1 on success 92 | */ 93 | int pmfs_search_dirblock(u8 *blk_base, struct inode *dir, struct qstr *child, 94 | unsigned long offset, 95 | struct pmfs_direntry **res_dir, 96 | struct pmfs_direntry **prev_dir) 97 | { 98 | struct pmfs_direntry *de; 99 | struct pmfs_direntry *pde = NULL; 100 | char *dlimit; 101 | int de_len; 102 | const char *name = child->name; 103 | int namelen = child->len; 104 | 105 | de = (struct pmfs_direntry *)blk_base; 106 | dlimit = blk_base + dir->i_sb->s_blocksize; 107 | while ((char *)de < dlimit) { 108 | /* this code is executed quadratically often */ 109 | /* do minimal checking `by hand' */ 110 | 111 | if ((char *)de + namelen <= dlimit && 112 | pmfs_match(namelen, name, de)) { 113 | /* found a match - just to be sure, do a full check */ 114 | if (!pmfs_check_dir_entry("pmfs_inode_by_name", 115 | dir, de, blk_base, offset)) 116 | return -1; 117 | *res_dir = de; 118 | if (prev_dir) 119 | *prev_dir = pde; 120 | return 1; 121 | } 
122 | /* prevent looping on a bad block */ 123 | de_len = le16_to_cpu(de->de_len); 124 | if (de_len <= 0) 125 | return -1; 126 | offset += de_len; 127 | pde = de; 128 | de = (struct pmfs_direntry *)((char *)de + de_len); 129 | } 130 | return 0; 131 | } 132 | 133 | static ino_t pmfs_inode_by_name(struct inode *dir, struct qstr *entry, 134 | struct pmfs_direntry **res_entry) 135 | { 136 | struct pmfs_inode *pi; 137 | ino_t i_no = 0; 138 | int namelen, nblocks, i; 139 | u8 *blk_base; 140 | const u8 *name = entry->name; 141 | struct super_block *sb = dir->i_sb; 142 | unsigned long block, start; 143 | struct pmfs_inode_info *si = PMFS_I(dir); 144 | 145 | pi = pmfs_get_inode(sb, dir->i_ino); 146 | 147 | namelen = entry->len; 148 | if (namelen > PMFS_NAME_LEN) 149 | return 0; 150 | if ((namelen <= 2) && (name[0] == '.') && 151 | (name[1] == '.' || name[1] == 0)) { 152 | /* 153 | * "." or ".." will only be in the first block 154 | */ 155 | block = start = 0; 156 | nblocks = 1; 157 | goto restart; 158 | } 159 | nblocks = dir->i_size >> dir->i_sb->s_blocksize_bits; 160 | start = si->i_dir_start_lookup; 161 | if (start >= nblocks) 162 | start = 0; 163 | block = start; 164 | restart: 165 | do { 166 | blk_base = 167 | pmfs_get_block(sb, pmfs_find_data_block(dir, block)); 168 | if (!blk_base) 169 | goto done; 170 | i = pmfs_search_dirblock(blk_base, dir, entry, 171 | block << sb->s_blocksize_bits, 172 | res_entry, NULL); 173 | if (i == 1) { 174 | si->i_dir_start_lookup = block; 175 | i_no = le64_to_cpu((*res_entry)->ino); 176 | goto done; 177 | } else { 178 | if (i < 0) 179 | goto done; 180 | } 181 | if (++block >= nblocks) 182 | block = 0; 183 | } while (block != start); 184 | /* 185 | * If the directory has grown while we were searching, then 186 | * search the last part of the directory before giving up. 
187 | */ 188 | block = nblocks; 189 | nblocks = dir->i_size >> sb->s_blocksize_bits; 190 | if (block < nblocks) { 191 | start = 0; 192 | goto restart; 193 | } 194 | done: 195 | return i_no; 196 | } 197 | 198 | static struct dentry *pmfs_lookup(struct inode *dir, struct dentry *dentry, 199 | unsigned int flags) 200 | { 201 | struct inode *inode = NULL; 202 | struct pmfs_direntry *de; 203 | ino_t ino; 204 | 205 | if (dentry->d_name.len > PMFS_NAME_LEN) 206 | return ERR_PTR(-ENAMETOOLONG); 207 | 208 | ino = pmfs_inode_by_name(dir, &dentry->d_name, &de); 209 | if (ino) { 210 | inode = pmfs_iget(dir->i_sb, ino); 211 | if (inode == ERR_PTR(-ESTALE)) { 212 | pmfs_err(dir->i_sb, __func__, 213 | "deleted inode referenced: %lu", 214 | (unsigned long)ino); 215 | return ERR_PTR(-EIO); 216 | } 217 | } 218 | 219 | return d_splice_alias(inode, dentry); 220 | } 221 | 222 | /* 223 | * By the time this is called, we already have created 224 | * the directory cache entry for the new file, but it 225 | * is so far negative - it has no inode. 226 | * 227 | * If the create succeeds, we fill in the inode information 228 | * with d_instantiate(). 
229 | */ 230 | static int pmfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, 231 | bool excl) 232 | { 233 | struct inode *inode = NULL; 234 | int err = PTR_ERR(inode); 235 | struct super_block *sb = dir->i_sb; 236 | pmfs_transaction_t *trans; 237 | timing_t create_time; 238 | 239 | PMFS_START_TIMING(create_t, create_time); 240 | /* two log entries for new inode, 1 lentry for dir inode, 1 for dir 241 | * inode's b-tree, 2 lentries for logging dir entry 242 | */ 243 | trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES * 2 + 244 | MAX_DIRENTRY_LENTRIES); 245 | if (IS_ERR(trans)) { 246 | err = PTR_ERR(trans); 247 | goto out; 248 | } 249 | 250 | inode = pmfs_new_inode(trans, dir, mode, &dentry->d_name); 251 | if (IS_ERR(inode)) 252 | goto out_err; 253 | pmfs_dbg_verbose("%s: %s, ino %lu\n", __func__, 254 | dentry->d_name.name, inode->i_ino); 255 | inode->i_op = &pmfs_file_inode_operations; 256 | inode->i_mapping->a_ops = &pmfs_aops_xip; 257 | inode->i_fop = &pmfs_xip_file_operations; 258 | err = pmfs_add_nondir(trans, dir, dentry, inode); 259 | if (err) 260 | goto out_err; 261 | pmfs_commit_transaction(sb, trans); 262 | out: 263 | PMFS_END_TIMING(create_t, create_time); 264 | return err; 265 | out_err: 266 | pmfs_abort_transaction(sb, trans); 267 | return err; 268 | } 269 | 270 | static int pmfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, 271 | dev_t rdev) 272 | { 273 | struct inode *inode = NULL; 274 | int err = PTR_ERR(inode); 275 | pmfs_transaction_t *trans; 276 | struct super_block *sb = dir->i_sb; 277 | struct pmfs_inode *pi; 278 | 279 | /* 2 log entries for new inode, 1 lentry for dir inode, 1 for dir 280 | * inode's b-tree, 2 lentries for logging dir entry 281 | */ 282 | trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES * 2 + 283 | MAX_DIRENTRY_LENTRIES); 284 | if (IS_ERR(trans)) { 285 | err = PTR_ERR(trans); 286 | goto out; 287 | } 288 | 289 | inode = pmfs_new_inode(trans, dir, mode, &dentry->d_name); 290 | if 
(IS_ERR(inode)) 291 | goto out_err; 292 | init_special_inode(inode, mode, rdev); 293 | inode->i_op = &pmfs_special_inode_operations; 294 | 295 | pi = pmfs_get_inode(sb, inode->i_ino); 296 | if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) 297 | pi->dev.rdev = cpu_to_le32(inode->i_rdev); 298 | err = pmfs_add_nondir(trans, dir, dentry, inode); 299 | if (err) 300 | goto out_err; 301 | pmfs_commit_transaction(sb, trans); 302 | out: 303 | return err; 304 | out_err: 305 | pmfs_abort_transaction(sb, trans); 306 | return err; 307 | } 308 | 309 | static int pmfs_symlink(struct inode *dir, struct dentry *dentry, 310 | const char *symname) 311 | { 312 | struct super_block *sb = dir->i_sb; 313 | int err = -ENAMETOOLONG; 314 | unsigned len = strlen(symname); 315 | struct inode *inode; 316 | pmfs_transaction_t *trans; 317 | struct pmfs_inode *pi; 318 | 319 | if (len + 1 > sb->s_blocksize) 320 | goto out; 321 | 322 | /* 2 log entries for new inode, 1 lentry for dir inode, 1 for dir 323 | * inode's b-tree, 2 lentries for logging dir entry 324 | */ 325 | trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES * 2 + 326 | MAX_DIRENTRY_LENTRIES); 327 | if (IS_ERR(trans)) { 328 | err = PTR_ERR(trans); 329 | goto out; 330 | } 331 | 332 | inode = pmfs_new_inode(trans, dir, S_IFLNK|S_IRWXUGO, &dentry->d_name); 333 | err = PTR_ERR(inode); 334 | if (IS_ERR(inode)) { 335 | pmfs_abort_transaction(sb, trans); 336 | goto out; 337 | } 338 | 339 | inode->i_op = &pmfs_symlink_inode_operations; 340 | inode->i_mapping->a_ops = &pmfs_aops_xip; 341 | 342 | pi = pmfs_get_inode(sb, inode->i_ino); 343 | err = pmfs_block_symlink(inode, symname, len); 344 | if (err) 345 | goto out_fail; 346 | 347 | inode->i_size = len; 348 | pmfs_update_isize(inode, pi); 349 | 350 | err = pmfs_add_nondir(trans, dir, dentry, inode); 351 | if (err) { 352 | pmfs_abort_transaction(sb, trans); 353 | goto out; 354 | } 355 | 356 | pmfs_commit_transaction(sb, trans); 357 | out: 358 | return err; 359 | 360 | out_fail: 361 | 
pmfs_dec_count(inode, pi); 362 | unlock_new_inode(inode); 363 | iput(inode); 364 | pmfs_abort_transaction(sb, trans); 365 | goto out; 366 | } 367 | 368 | static int pmfs_link(struct dentry *dest_dentry, struct inode *dir, 369 | struct dentry *dentry) 370 | { 371 | struct inode *inode = dest_dentry->d_inode; 372 | int err = -ENOMEM; 373 | pmfs_transaction_t *trans; 374 | struct super_block *sb = inode->i_sb; 375 | struct pmfs_inode *pi = pmfs_get_inode(sb, inode->i_ino); 376 | 377 | if (inode->i_nlink >= PMFS_LINK_MAX) 378 | return -EMLINK; 379 | 380 | trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES * 2 + 381 | MAX_DIRENTRY_LENTRIES); 382 | if (IS_ERR(trans)) { 383 | err = PTR_ERR(trans); 384 | goto out; 385 | } 386 | /* only need to log the first 48 bytes since we only modify ctime and 387 | * i_links_count in this system call */ 388 | pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA); 389 | 390 | ihold(inode); 391 | 392 | err = pmfs_add_entry(trans, dentry, inode); 393 | if (!err) { 394 | inode->i_ctime = current_time(inode); 395 | inc_nlink(inode); 396 | 397 | pmfs_memunlock_inode(sb, pi); 398 | pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); 399 | pi->i_links_count = cpu_to_le16(inode->i_nlink); 400 | pmfs_memlock_inode(sb, pi); 401 | 402 | d_instantiate(dentry, inode); 403 | pmfs_commit_transaction(sb, trans); 404 | } else { 405 | iput(inode); 406 | pmfs_abort_transaction(sb, trans); 407 | } 408 | out: 409 | return err; 410 | } 411 | 412 | static int pmfs_unlink(struct inode *dir, struct dentry *dentry) 413 | { 414 | struct inode *inode = dentry->d_inode; 415 | int retval = -ENOMEM; 416 | pmfs_transaction_t *trans; 417 | struct super_block *sb = inode->i_sb; 418 | struct pmfs_inode *pi = pmfs_get_inode(sb, inode->i_ino); 419 | timing_t unlink_time; 420 | 421 | PMFS_START_TIMING(unlink_t, unlink_time); 422 | 423 | trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES * 2 + 424 | MAX_DIRENTRY_LENTRIES); 425 | if (IS_ERR(trans)) { 426 | retval = 
PTR_ERR(trans); 427 | goto out; 428 | } 429 | pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA); 430 | 431 | pmfs_dbg_verbose("%s: %s, ino %lu\n", __func__, 432 | dentry->d_name.name, inode->i_ino); 433 | retval = pmfs_remove_entry(trans, dentry, inode); 434 | if (retval) 435 | goto end_unlink; 436 | 437 | if (inode->i_nlink == 1) 438 | pmfs_truncate_add(inode, inode->i_size); 439 | inode->i_ctime = dir->i_ctime; 440 | 441 | pmfs_memunlock_inode(sb, pi); 442 | if (inode->i_nlink) { 443 | drop_nlink(inode); 444 | pi->i_links_count = cpu_to_le16(inode->i_nlink); 445 | } 446 | pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); 447 | pmfs_memlock_inode(sb, pi); 448 | 449 | pmfs_commit_transaction(sb, trans); 450 | PMFS_END_TIMING(unlink_t, unlink_time); 451 | return 0; 452 | end_unlink: 453 | pmfs_abort_transaction(sb, trans); 454 | out: 455 | return retval; 456 | } 457 | 458 | static int pmfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 459 | { 460 | struct inode *inode; 461 | struct pmfs_inode *pi, *pidir; 462 | struct pmfs_direntry *de = NULL; 463 | struct super_block *sb = dir->i_sb; 464 | pmfs_transaction_t *trans; 465 | int err = -EMLINK; 466 | char *blk_base; 467 | 468 | if (dir->i_nlink >= PMFS_LINK_MAX) 469 | goto out; 470 | 471 | trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES * 2 + 472 | MAX_DIRENTRY_LENTRIES); 473 | if (IS_ERR(trans)) { 474 | err = PTR_ERR(trans); 475 | goto out; 476 | } 477 | 478 | inode = pmfs_new_inode(trans, dir, S_IFDIR | mode, &dentry->d_name); 479 | err = PTR_ERR(inode); 480 | if (IS_ERR(inode)) { 481 | pmfs_abort_transaction(sb, trans); 482 | goto out; 483 | } 484 | 485 | pmfs_dbg_verbose("%s: %s, ino %lu\n", __func__, 486 | dentry->d_name.name, inode->i_ino); 487 | inode->i_op = &pmfs_dir_inode_operations; 488 | inode->i_fop = &pmfs_dir_operations; 489 | inode->i_mapping->a_ops = &pmfs_aops_xip; 490 | 491 | /* since this is a new inode so we don't need to include this 492 | * pmfs_alloc_blocks in 
the transaction 493 | */ 494 | err = pmfs_alloc_blocks(NULL, inode, 0, 1, false); 495 | if (err) 496 | goto out_clear_inode; 497 | inode->i_size = sb->s_blocksize; 498 | 499 | blk_base = pmfs_get_block(sb, pmfs_find_data_block(inode, 0)); 500 | de = (struct pmfs_direntry *)blk_base; 501 | pmfs_memunlock_range(sb, blk_base, sb->s_blocksize); 502 | de->ino = cpu_to_le64(inode->i_ino); 503 | de->name_len = 1; 504 | de->de_len = cpu_to_le16(PMFS_DIR_REC_LEN(de->name_len)); 505 | strcpy(de->name, "."); 506 | /*de->file_type = S_IFDIR; */ 507 | de = pmfs_next_entry(de); 508 | de->ino = cpu_to_le64(dir->i_ino); 509 | de->de_len = cpu_to_le16(sb->s_blocksize - PMFS_DIR_REC_LEN(1)); 510 | de->name_len = 2; 511 | strcpy(de->name, ".."); 512 | /*de->file_type = S_IFDIR; */ 513 | pmfs_memlock_range(sb, blk_base, sb->s_blocksize); 514 | 515 | /* No need to journal the dir entries but we need to persist them */ 516 | pmfs_flush_buffer(blk_base, PMFS_DIR_REC_LEN(1) + 517 | PMFS_DIR_REC_LEN(2), true); 518 | 519 | set_nlink(inode, 2); 520 | 521 | err = pmfs_add_entry(trans, dentry, inode); 522 | if (err) { 523 | pmfs_dbg_verbose("failed to add dir entry\n"); 524 | goto out_clear_inode; 525 | } 526 | pi = pmfs_get_inode(sb, inode->i_ino); 527 | pmfs_memunlock_inode(sb, pi); 528 | pi->i_links_count = cpu_to_le16(inode->i_nlink); 529 | pi->i_size = cpu_to_le64(inode->i_size); 530 | pmfs_memlock_inode(sb, pi); 531 | 532 | pidir = pmfs_get_inode(sb, dir->i_ino); 533 | pmfs_inc_count(dir, pidir); 534 | d_instantiate(dentry, inode); 535 | unlock_new_inode(inode); 536 | 537 | pmfs_commit_transaction(sb, trans); 538 | 539 | out: 540 | return err; 541 | 542 | out_clear_inode: 543 | clear_nlink(inode); 544 | unlock_new_inode(inode); 545 | iput(inode); 546 | pmfs_abort_transaction(sb, trans); 547 | goto out; 548 | } 549 | 550 | /* 551 | * routine to check that the specified directory is empty (for rmdir) 552 | */ 553 | static int pmfs_empty_dir(struct inode *inode) 554 | { 555 | unsigned long 
offset; 556 | struct pmfs_direntry *de, *de1; 557 | struct super_block *sb; 558 | char *blk_base; 559 | int err = 0; 560 | 561 | sb = inode->i_sb; 562 | if (inode->i_size < PMFS_DIR_REC_LEN(1) + PMFS_DIR_REC_LEN(2)) { 563 | pmfs_dbg("bad directory (dir #%lu)-no data block", 564 | inode->i_ino); 565 | return 1; 566 | } 567 | 568 | blk_base = pmfs_get_block(sb, pmfs_find_data_block(inode, 0)); 569 | if (!blk_base) { 570 | pmfs_dbg("bad directory (dir #%lu)-no data block", 571 | inode->i_ino); 572 | return 1; 573 | } 574 | 575 | de = (struct pmfs_direntry *)blk_base; 576 | de1 = pmfs_next_entry(de); 577 | 578 | if (le64_to_cpu(de->ino) != inode->i_ino || !le64_to_cpu(de1->ino) || 579 | strcmp(".", de->name) || strcmp("..", de1->name)) { 580 | pmfs_dbg("bad directory (dir #%lu) - no `.' or `..'", 581 | inode->i_ino); 582 | return 1; 583 | } 584 | offset = le16_to_cpu(de->de_len) + le16_to_cpu(de1->de_len); 585 | de = pmfs_next_entry(de1); 586 | while (offset < inode->i_size) { 587 | if (!blk_base || (void *)de >= (void *)(blk_base + 588 | sb->s_blocksize)) { 589 | err = 0; 590 | blk_base = pmfs_get_block(sb, pmfs_find_data_block( 591 | inode, offset >> sb->s_blocksize_bits)); 592 | if (!blk_base) { 593 | pmfs_dbg("Error: reading dir #%lu offset %lu\n", 594 | inode->i_ino, offset); 595 | offset += sb->s_blocksize; 596 | continue; 597 | } 598 | de = (struct pmfs_direntry *)blk_base; 599 | } 600 | if (!pmfs_check_dir_entry("empty_dir", inode, de, blk_base, 601 | offset)) { 602 | de = (struct pmfs_direntry *)(blk_base + 603 | sb->s_blocksize); 604 | offset = (offset | (sb->s_blocksize - 1)) + 1; 605 | continue; 606 | } 607 | if (le64_to_cpu(de->ino)) 608 | return 0; 609 | offset += le16_to_cpu(de->de_len); 610 | de = pmfs_next_entry(de); 611 | } 612 | return 1; 613 | } 614 | 615 | static int pmfs_rmdir(struct inode *dir, struct dentry *dentry) 616 | { 617 | struct inode *inode = dentry->d_inode; 618 | struct pmfs_direntry *de; 619 | pmfs_transaction_t *trans; 620 | struct 
super_block *sb = inode->i_sb; 621 | struct pmfs_inode *pi = pmfs_get_inode(sb, inode->i_ino), *pidir; 622 | int err = -ENOTEMPTY; 623 | 624 | if (!inode) 625 | return -ENOENT; 626 | 627 | pmfs_dbg_verbose("%s: %s, ino %lu\n", __func__, 628 | dentry->d_name.name, inode->i_ino); 629 | if (pmfs_inode_by_name(dir, &dentry->d_name, &de) == 0) 630 | return -ENOENT; 631 | 632 | if (!pmfs_empty_dir(inode)) 633 | return err; 634 | 635 | if (inode->i_nlink != 2) 636 | pmfs_dbg("empty directory has nlink!=2 (%d)", inode->i_nlink); 637 | 638 | trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES * 2 + 639 | MAX_DIRENTRY_LENTRIES); 640 | if (IS_ERR(trans)) { 641 | err = PTR_ERR(trans); 642 | return err; 643 | } 644 | pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA); 645 | 646 | err = pmfs_remove_entry(trans, dentry, inode); 647 | if (err) 648 | goto end_rmdir; 649 | 650 | /*inode->i_version++; */ 651 | clear_nlink(inode); 652 | inode->i_ctime = dir->i_ctime; 653 | 654 | pmfs_memunlock_inode(sb, pi); 655 | pi->i_links_count = cpu_to_le16(inode->i_nlink); 656 | pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); 657 | pmfs_memlock_inode(sb, pi); 658 | 659 | /* add the inode to truncate list in case a crash happens before the 660 | * subsequent evict_inode is called. It will be deleted from the 661 | * truncate list during evict_inode. 
662 | */ 663 | pmfs_truncate_add(inode, inode->i_size); 664 | 665 | pidir = pmfs_get_inode(sb, dir->i_ino); 666 | pmfs_dec_count(dir, pidir); 667 | 668 | pmfs_commit_transaction(sb, trans); 669 | return err; 670 | end_rmdir: 671 | pmfs_abort_transaction(sb, trans); 672 | return err; 673 | } 674 | 675 | static int pmfs_rename(struct inode *old_dir, 676 | struct dentry *old_dentry, 677 | struct inode *new_dir, struct dentry *new_dentry, 678 | unsigned int flags) 679 | { 680 | struct inode *old_inode = old_dentry->d_inode; 681 | struct inode *new_inode = new_dentry->d_inode; 682 | struct pmfs_direntry *new_de = NULL, *old_de = NULL; 683 | pmfs_transaction_t *trans; 684 | struct super_block *sb = old_inode->i_sb; 685 | struct pmfs_inode *pi, *new_pidir, *old_pidir; 686 | int err = -ENOENT; 687 | 688 | pmfs_inode_by_name(new_dir, &new_dentry->d_name, &new_de); 689 | pmfs_inode_by_name(old_dir, &old_dentry->d_name, &old_de); 690 | 691 | pmfs_dbg_verbose("%s: rename %s to %s\n", __func__, 692 | old_dentry->d_name.name, new_dentry->d_name.name); 693 | trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES * 4 + 694 | MAX_DIRENTRY_LENTRIES * 2); 695 | if (IS_ERR(trans)) { 696 | return PTR_ERR(trans); 697 | } 698 | 699 | if (new_inode) { 700 | err = -ENOTEMPTY; 701 | if (S_ISDIR(old_inode->i_mode) && !pmfs_empty_dir(new_inode)) 702 | goto out; 703 | } else { 704 | if (S_ISDIR(old_inode->i_mode)) { 705 | err = -EMLINK; 706 | if (new_dir->i_nlink >= PMFS_LINK_MAX) 707 | goto out; 708 | } 709 | } 710 | 711 | new_pidir = pmfs_get_inode(sb, new_dir->i_ino); 712 | 713 | pi = pmfs_get_inode(sb, old_inode->i_ino); 714 | pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA); 715 | 716 | if (!new_de) { 717 | /* link it into the new directory. 
*/ 718 | err = pmfs_add_entry(trans, new_dentry, old_inode); 719 | if (err) 720 | goto out; 721 | } else { 722 | pmfs_add_logentry(sb, trans, &new_de->ino, sizeof(new_de->ino), 723 | LE_DATA); 724 | 725 | pmfs_memunlock_range(sb, new_de, sb->s_blocksize); 726 | new_de->ino = cpu_to_le64(old_inode->i_ino); 727 | /*new_de->file_type = old_de->file_type; */ 728 | pmfs_memlock_range(sb, new_de, sb->s_blocksize); 729 | 730 | pmfs_add_logentry(sb, trans, new_pidir, MAX_DATA_PER_LENTRY, 731 | LE_DATA); 732 | /*new_dir->i_version++; */ 733 | new_dir->i_ctime = new_dir->i_mtime = current_time(new_dir); 734 | pmfs_update_time(new_dir, new_pidir); 735 | } 736 | 737 | /* and unlink the inode from the old directory ... */ 738 | err = pmfs_remove_entry(trans, old_dentry, old_inode); 739 | if (err) 740 | goto out; 741 | 742 | if (new_inode) { 743 | pi = pmfs_get_inode(sb, new_inode->i_ino); 744 | pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA); 745 | new_inode->i_ctime = current_time(new_inode); 746 | 747 | pmfs_memunlock_inode(sb, pi); 748 | if (S_ISDIR(old_inode->i_mode)) { 749 | if (new_inode->i_nlink) 750 | drop_nlink(new_inode); 751 | } 752 | pi->i_ctime = cpu_to_le32(new_inode->i_ctime.tv_sec); 753 | if (new_inode->i_nlink) 754 | drop_nlink(new_inode); 755 | pi->i_links_count = cpu_to_le16(new_inode->i_nlink); 756 | pmfs_memlock_inode(sb, pi); 757 | 758 | if (!new_inode->i_nlink) 759 | pmfs_truncate_add(new_inode, new_inode->i_size); 760 | } else { 761 | if (S_ISDIR(old_inode->i_mode)) { 762 | pmfs_inc_count(new_dir, new_pidir); 763 | old_pidir = pmfs_get_inode(sb, old_dir->i_ino); 764 | pmfs_dec_count(old_dir, old_pidir); 765 | } 766 | } 767 | 768 | pmfs_commit_transaction(sb, trans); 769 | return 0; 770 | out: 771 | pmfs_abort_transaction(sb, trans); 772 | return err; 773 | } 774 | 775 | struct dentry *pmfs_get_parent(struct dentry *child) 776 | { 777 | struct inode *inode; 778 | struct qstr dotdot = QSTR_INIT("..", 2); 779 | struct pmfs_direntry *de = 
NULL; 780 | ino_t ino; 781 | 782 | pmfs_inode_by_name(child->d_inode, &dotdot, &de); 783 | if (!de) 784 | return ERR_PTR(-ENOENT); 785 | ino = le64_to_cpu(de->ino); 786 | 787 | if (ino) 788 | inode = pmfs_iget(child->d_inode->i_sb, ino); 789 | else 790 | return ERR_PTR(-ENOENT); 791 | 792 | return d_obtain_alias(inode); 793 | } 794 | 795 | const struct inode_operations pmfs_dir_inode_operations = { 796 | .create = pmfs_create, 797 | .lookup = pmfs_lookup, 798 | .link = pmfs_link, 799 | .unlink = pmfs_unlink, 800 | .symlink = pmfs_symlink, 801 | .mkdir = pmfs_mkdir, 802 | .rmdir = pmfs_rmdir, 803 | .mknod = pmfs_mknod, 804 | .rename = pmfs_rename, 805 | .setattr = pmfs_notify_change, 806 | .get_acl = NULL, 807 | }; 808 | 809 | const struct inode_operations pmfs_special_inode_operations = { 810 | .setattr = pmfs_notify_change, 811 | .get_acl = NULL, 812 | }; 813 | -------------------------------------------------------------------------------- /journal.c: -------------------------------------------------------------------------------- 1 | /* 2 | * PMFS journaling facility. This file contains code to log changes to pmfs 3 | * meta-data to facilitate consistent meta-data updates against arbitrary 4 | * power and system failures. 5 | * 6 | * Persistent Memory File System 7 | * Copyright (c) 2012-2013, Intel Corporation. 8 | * 9 | * This program is free software; you can redistribute it and/or modify it 10 | * under the terms and conditions of the GNU General Public License, 11 | * version 2, as published by the Free Software Foundation. 12 | * 13 | * This program is distributed in the hope it will be useful, but WITHOUT 14 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 15 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 16 | * more details. 
17 | * 18 | * You should have received a copy of the GNU General Public License along with 19 | * this program; if not, write to the Free Software Foundation, Inc., 20 | * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 21 | */ 22 | 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include "pmfs.h" 33 | #include "journal.h" 34 | 35 | static void dump_transaction(struct pmfs_sb_info *sbi, 36 | pmfs_transaction_t *trans) 37 | { 38 | int i; 39 | pmfs_logentry_t *le = trans->start_addr; 40 | 41 | for (i = 0; i < trans->num_entries; i++) { 42 | pmfs_dbg_trans("ao %llx tid %x gid %x type %x sz %x\n", 43 | le->addr_offset, le->transaction_id, le->gen_id, 44 | le->type, le->size); 45 | le++; 46 | } 47 | } 48 | 49 | static inline uint32_t next_log_entry(uint32_t jsize, uint32_t le_off) 50 | { 51 | le_off = le_off + LOGENTRY_SIZE; 52 | if (le_off >= jsize) 53 | le_off = 0; 54 | return le_off; 55 | } 56 | 57 | static inline uint32_t prev_log_entry(uint32_t jsize, uint32_t le_off) 58 | { 59 | if (le_off == 0) 60 | le_off = jsize; 61 | le_off = le_off - LOGENTRY_SIZE; 62 | return le_off; 63 | } 64 | 65 | static inline uint16_t next_gen_id(uint16_t gen_id) 66 | { 67 | gen_id++; 68 | /* check for wraparound */ 69 | if (gen_id == 0) 70 | gen_id++; 71 | return gen_id; 72 | } 73 | 74 | static inline uint16_t prev_gen_id(uint16_t gen_id) 75 | { 76 | gen_id--; 77 | /* check for wraparound */ 78 | if (gen_id == 0) 79 | gen_id--; 80 | return gen_id; 81 | } 82 | 83 | /* Undo a valid log entry */ 84 | static inline void pmfs_undo_logentry(struct super_block *sb, 85 | pmfs_logentry_t *le) 86 | { 87 | char *data; 88 | 89 | if (le->size > 0) { 90 | data = pmfs_get_block(sb, le64_to_cpu(le->addr_offset)); 91 | /* Undo changes by flushing the log entry to pmfs */ 92 | pmfs_memunlock_range(sb, data, le->size); 93 | memcpy(data, le->data, le->size); 94 | pmfs_memlock_range(sb, data, le->size); 95 | 
pmfs_flush_buffer(data, le->size, false); 96 | } 97 | } 98 | 99 | /* can be called during journal recovery or transaction abort */ 100 | /* We need to Undo in the reverse order */ 101 | static void pmfs_undo_transaction(struct super_block *sb, 102 | pmfs_transaction_t *trans) 103 | { 104 | pmfs_logentry_t *le; 105 | int i; 106 | uint16_t gen_id = trans->gen_id; 107 | 108 | le = trans->start_addr + trans->num_used; 109 | le--; 110 | for (i = trans->num_used - 1; i >= 0; i--, le--) { 111 | if (gen_id == le16_to_cpu(le->gen_id)) 112 | pmfs_undo_logentry(sb, le); 113 | } 114 | } 115 | 116 | /* can be called by either during log cleaning or during journal recovery */ 117 | static void pmfs_flush_transaction(struct super_block *sb, 118 | pmfs_transaction_t *trans) 119 | { 120 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 121 | pmfs_logentry_t *le = trans->start_addr; 122 | int i; 123 | char *data; 124 | 125 | for (i = 0; i < trans->num_used; i++, le++) { 126 | if (le->size) { 127 | data = pmfs_get_block(sb,le64_to_cpu(le->addr_offset)); 128 | if (sbi->redo_log) { 129 | pmfs_memunlock_range(sb, data, le->size); 130 | memcpy(data, le->data, le->size); 131 | pmfs_memlock_range(sb, data, le->size); 132 | } else 133 | pmfs_flush_buffer(data, le->size, false); 134 | } 135 | } 136 | } 137 | 138 | static inline void invalidate_gen_id(pmfs_logentry_t *le) 139 | { 140 | le->gen_id = 0; 141 | pmfs_flush_buffer(le, LOGENTRY_SIZE, false); 142 | } 143 | 144 | /* can be called by either during log cleaning or during journal recovery */ 145 | static void pmfs_invalidate_logentries(struct super_block *sb, 146 | pmfs_transaction_t *trans) 147 | { 148 | pmfs_logentry_t *le = trans->start_addr; 149 | int i; 150 | 151 | pmfs_memunlock_range(sb, trans->start_addr, 152 | trans->num_entries * LOGENTRY_SIZE); 153 | for (i = 0; i < trans->num_entries; i++) { 154 | invalidate_gen_id(le); 155 | if (le->type == LE_START) { 156 | PERSISTENT_MARK(); 157 | PERSISTENT_BARRIER(); 158 | } 159 | le++; 160 | } 
161 | pmfs_memlock_range(sb, trans->start_addr, 162 | trans->num_entries * LOGENTRY_SIZE); 163 | } 164 | 165 | /* can be called by either during log cleaning or during journal recovery */ 166 | static void pmfs_redo_transaction(struct super_block *sb, 167 | pmfs_transaction_t *trans, bool recover) 168 | { 169 | pmfs_logentry_t *le = trans->start_addr; 170 | int i; 171 | uint16_t gen_id = trans->gen_id; 172 | char *data; 173 | 174 | for (i = 0; i < trans->num_entries; i++) { 175 | if (gen_id == le16_to_cpu(le->gen_id) && le->size > 0) { 176 | data = pmfs_get_block(sb,le64_to_cpu(le->addr_offset)); 177 | /* flush data if we are called during recovery */ 178 | if (recover) { 179 | pmfs_memunlock_range(sb, data, le->size); 180 | memcpy(data, le->data, le->size); 181 | pmfs_memlock_range(sb, data, le->size); 182 | } 183 | pmfs_flush_buffer(data, le->size, false); 184 | } 185 | le++; 186 | } 187 | } 188 | 189 | /* recover the transaction ending at a valid log entry *le */ 190 | /* called for Undo log and traverses the journal backward */ 191 | static uint32_t pmfs_recover_transaction(struct super_block *sb, uint32_t head, 192 | uint32_t tail, pmfs_logentry_t *le) 193 | { 194 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 195 | pmfs_transaction_t trans; 196 | bool cmt_or_abrt_found = false, start_found = false; 197 | uint16_t gen_id = le16_to_cpu(le->gen_id); 198 | 199 | memset(&trans, 0, sizeof(trans)); 200 | trans.transaction_id = le32_to_cpu(le->transaction_id); 201 | trans.gen_id = gen_id; 202 | 203 | do { 204 | trans.num_entries++; 205 | trans.num_used++; 206 | 207 | if (gen_id == le16_to_cpu(le->gen_id)) { 208 | /* Handle committed/aborted transactions */ 209 | if (le->type & LE_COMMIT || le->type & LE_ABORT) 210 | cmt_or_abrt_found = true; 211 | if (le->type & LE_START) { 212 | trans.start_addr = le; 213 | start_found = true; 214 | break; 215 | } 216 | } 217 | if (tail == 0 || tail == head) 218 | break; 219 | /* prev log entry */ 220 | le--; 221 | /* Handle uncommitted 
transactions */ 222 | if ((gen_id == le16_to_cpu(le->gen_id)) 223 | && (le->type & LE_COMMIT || le->type & LE_ABORT)) { 224 | BUG_ON(trans.transaction_id == 225 | le32_to_cpu(le->transaction_id)); 226 | le++; 227 | break; 228 | } 229 | tail = prev_log_entry(sbi->jsize, tail); 230 | } while (1); 231 | 232 | if (start_found && !cmt_or_abrt_found) 233 | pmfs_undo_transaction(sb, &trans); 234 | 235 | if (gen_id == MAX_GEN_ID) { 236 | if (!start_found) 237 | trans.start_addr = le; 238 | /* make sure the changes made by pmfs_undo_transaction() are 239 | * persistent before invalidating the log entries */ 240 | if (start_found && !cmt_or_abrt_found) { 241 | PERSISTENT_MARK(); 242 | PERSISTENT_BARRIER(); 243 | } 244 | pmfs_invalidate_logentries(sb, &trans); 245 | } 246 | return tail; 247 | } 248 | 249 | /* process the transaction starting at a valid log entry *le */ 250 | /* called by the log cleaner and journal recovery */ 251 | static uint32_t pmfs_process_transaction(struct super_block *sb, uint32_t head, 252 | uint32_t tail, pmfs_logentry_t *le, bool recover, int *processed) 253 | { 254 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 255 | pmfs_transaction_t trans; 256 | uint16_t gen_id; 257 | uint32_t new_head = head; 258 | int handled = 0; 259 | 260 | *processed = 0; 261 | gen_id = le16_to_cpu(le->gen_id); 262 | if (!(le->type & LE_START)) { 263 | pmfs_dbg("start of trans %x but LE_START not set. 
gen_id %d\n", 264 | le32_to_cpu(le->transaction_id), gen_id); 265 | return next_log_entry(sbi->jsize, new_head); 266 | } 267 | memset(&trans, 0, sizeof(trans)); 268 | trans.transaction_id = le32_to_cpu(le->transaction_id); 269 | trans.start_addr = le; 270 | trans.gen_id = gen_id; 271 | do { 272 | trans.num_entries++; 273 | trans.num_used++; 274 | new_head = next_log_entry(sbi->jsize, new_head); 275 | handled++; 276 | 277 | /* Handle committed/aborted transactions */ 278 | if ((gen_id == le16_to_cpu(le->gen_id)) && (le->type & LE_COMMIT 279 | || le->type & LE_ABORT)) { 280 | head = new_head; 281 | if ((le->type & LE_COMMIT) && sbi->redo_log) 282 | pmfs_redo_transaction(sb, &trans, recover); 283 | 284 | if (gen_id == MAX_GEN_ID) { 285 | if ((le->type & LE_COMMIT) && sbi->redo_log) { 286 | PERSISTENT_MARK(); 287 | PERSISTENT_BARRIER(); 288 | } 289 | pmfs_invalidate_logentries(sb, &trans); 290 | } 291 | break; 292 | } 293 | /* next log entry */ 294 | le++; 295 | /* Handle uncommitted transactions */ 296 | if ((new_head == tail) || ((gen_id == le16_to_cpu(le->gen_id)) 297 | && (le->type & LE_START))) { 298 | /* found a new valid transaction w/o finding a commit */ 299 | if (recover) { 300 | /* if this function is called by recovery, move 301 | * ahead even if we didn't find a commit record 302 | * for this transaction */ 303 | head = new_head; 304 | if (gen_id == MAX_GEN_ID) 305 | pmfs_invalidate_logentries(sb, &trans); 306 | } 307 | pmfs_dbg_trans("no cmt tid %d sa %p nle %d tail %x" 308 | " gen %d\n", 309 | trans.transaction_id,trans.start_addr,trans.num_entries, 310 | trans.num_used, trans.gen_id); 311 | /* dump_transaction(sbi, &trans); */ 312 | break; 313 | } 314 | } while (new_head != tail); 315 | 316 | *processed = handled; 317 | return head; 318 | } 319 | 320 | static int pmfs_clean_journal(struct super_block *sb, bool unmount, 321 | int take_lock) 322 | { 323 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 324 | pmfs_journal_t *journal = pmfs_get_journal(sb); 325 | 
uint32_t head; 326 | uint32_t new_head, tail; 327 | uint16_t gen_id; 328 | volatile __le64 *ptr_tail_genid; 329 | int processed = 0; 330 | int total = 0; 331 | u64 tail_genid; 332 | pmfs_logentry_t *le; 333 | 334 | if (take_lock) 335 | mutex_lock(&sbi->journal_mutex); 336 | head = le32_to_cpu(journal->head); 337 | ptr_tail_genid = (volatile __le64 *)&journal->tail; 338 | 339 | /* atomically read both tail and gen_id of journal. Normally use of 340 | * volatile is prohibited in kernel code but since we use volatile 341 | * to write to journal's tail and gen_id atomically, we thought we 342 | * should use volatile to read them simultaneously and avoid locking 343 | * them. */ 344 | tail_genid = le64_to_cpu(*ptr_tail_genid); 345 | tail = tail_genid & 0xFFFFFFFF; 346 | gen_id = (tail_genid >> 32) & 0xFFFF; 347 | 348 | /* journal wraparound happened. so head points to prev generation id */ 349 | if (tail < head) 350 | gen_id = prev_gen_id(gen_id); 351 | pmfs_dbg_trans("starting journal cleaning %x %x\n", head, tail); 352 | while (head != tail) { 353 | le = (pmfs_logentry_t *)(sbi->journal_base_addr + head); 354 | if (gen_id == le16_to_cpu(le->gen_id)) { 355 | /* found a valid log entry, process the transaction */ 356 | new_head = pmfs_process_transaction(sb, head, tail, 357 | le, false, &processed); 358 | total += processed; 359 | /* no progress was made. 
return */
		if (new_head == head)
			break;
		head = new_head;
	} else {
		/* entry from an older generation: invalidate it when the
		 * generation counter is about to wrap so it can never be
		 * mistaken for valid again */
		if (gen_id == MAX_GEN_ID) {
			pmfs_memunlock_range(sb, le, sizeof(*le));
			invalidate_gen_id(le);
			pmfs_memlock_range(sb, le, sizeof(*le));
		}
		head = next_log_entry(sbi->jsize, head);
	}
	/* handle journal wraparound */
	if (head == 0)
		gen_id = next_gen_id(gen_id);
	}
	PERSISTENT_MARK();
	PERSISTENT_BARRIER();
	/* persist the advanced head: everything before it is reclaimed */
	pmfs_memunlock_range(sb, journal, sizeof(*journal));
	journal->head = cpu_to_le32(head);
	pmfs_memlock_range(sb, journal, sizeof(*journal));
	pmfs_flush_buffer(&journal->head, sizeof(journal->head), true);
	if (unmount) {
		PERSISTENT_MARK();
		if (journal->head != journal->tail)
			pmfs_dbg("PMFS: umount but journal not empty %x:%x\n",
			le32_to_cpu(journal->head), le32_to_cpu(journal->tail));
		PERSISTENT_BARRIER();
	}
	pmfs_dbg_trans("leaving journal cleaning %x %x\n", head, tail);
	if (take_lock)
		mutex_unlock(&sbi->journal_mutex);
	return total;
}

/* Put the log cleaner thread to sleep until wakeup_log_cleaner() (or a
 * kthread stop request) wakes it. */
static void log_cleaner_try_sleeping(struct pmfs_sb_info *sbi)
{
	DEFINE_WAIT(wait);
	prepare_to_wait(&sbi->log_cleaner_wait, &wait, TASK_INTERRUPTIBLE);
	schedule();
	finish_wait(&sbi->log_cleaner_wait, &wait);
}

/*
 * Log cleaner kthread body: sleep until woken, then run a cleaning pass.
 * On stop, do one final pass with unmount semantics (warns if the
 * journal is still non-empty). arg is the mounted super_block.
 */
static int pmfs_log_cleaner(void *arg)
{
	struct super_block *sb = (struct super_block *)arg;
	struct pmfs_sb_info *sbi = PMFS_SB(sb);

	pmfs_dbg_trans("Running log cleaner thread\n");
	for ( ; ; ) {
		log_cleaner_try_sleeping(sbi);

		if (kthread_should_stop())
			break;

		pmfs_clean_journal(sb, false, 1);
	}
	pmfs_clean_journal(sb, true, 1);
	pmfs_dbg_trans("Exiting log cleaner thread\n");
	return 0;
}

/* Start the per-mount log cleaner kthread. Returns 0 on success, -1 on
 * failure (treated as fatal at mount time by the callers). */
static int pmfs_journal_cleaner_run(struct super_block *sb)
{
	int ret = 0;
	struct pmfs_sb_info *sbi = PMFS_SB(sb);

	init_waitqueue_head(&sbi->log_cleaner_wait);

	sbi->log_cleaner_thread = kthread_run(pmfs_log_cleaner, sb,
		"pmfs_log_cleaner_0x%llx", sbi->phys_addr);
	if (IS_ERR(sbi->log_cleaner_thread)) {
		/* failure at boot is fatal */
		pmfs_err(sb, "Failed to start pmfs log cleaner thread\n");
		ret = -1;
	}
	return ret;
}

/*
 * Rebuild the in-DRAM journal state (base address, size, redo/undo mode)
 * from the persistent journal descriptor and start the cleaner thread.
 * Called on every mount; hard init also funnels through here.
 */
int pmfs_journal_soft_init(struct super_block *sb)
{
	struct pmfs_sb_info *sbi = PMFS_SB(sb);
	pmfs_journal_t *journal = pmfs_get_journal(sb);

	sbi->next_transaction_id = 0;
	sbi->journal_base_addr = pmfs_get_block(sb,le64_to_cpu(journal->base));
	sbi->jsize = le32_to_cpu(journal->size);
	mutex_init(&sbi->journal_mutex);
	sbi->redo_log = !!le16_to_cpu(journal->redo_logging);

	return pmfs_journal_soft_init == NULL ? 0 : pmfs_journal_cleaner_run(sb);
}

/*
 * Format the persistent journal (mkfs/init path): write the descriptor
 * (base, size, gen_id=1, empty head/tail, undo mode), zero the log area
 * with non-temporal stores, then perform the soft init.
 */
int pmfs_journal_hard_init(struct super_block *sb, uint64_t base,
	uint32_t size)
{
	struct pmfs_sb_info *sbi = PMFS_SB(sb);
	pmfs_journal_t *journal = pmfs_get_journal(sb);

	pmfs_memunlock_range(sb, journal, sizeof(*journal));
	journal->base = cpu_to_le64(base);
	journal->size = cpu_to_le32(size);
	journal->gen_id = cpu_to_le16(1);
	journal->head = journal->tail = 0;
	/* lets do Undo logging for now */
	journal->redo_logging = 0;
	pmfs_memlock_range(sb, journal, sizeof(*journal));

	sbi->journal_base_addr = pmfs_get_block(sb, base);
	pmfs_memunlock_range(sb, sbi->journal_base_addr, size);
	memset_nt(sbi->journal_base_addr, 0, size);
	pmfs_memlock_range(sb, sbi->journal_base_addr, size);

	return pmfs_journal_soft_init(sb);
}

/* Wake the log cleaner thread if it is currently waiting. */
static void wakeup_log_cleaner(struct pmfs_sb_info *sbi)
{
	if (!waitqueue_active(&sbi->log_cleaner_wait))
		return;
	pmfs_dbg_trans("waking up the cleaner thread\n");
	wake_up_interruptible(&sbi->log_cleaner_wait);
}
482 | 483 | int pmfs_journal_uninit(struct super_block *sb) 484 | { 485 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 486 | 487 | if (sbi->log_cleaner_thread) 488 | kthread_stop(sbi->log_cleaner_thread); 489 | return 0; 490 | } 491 | 492 | inline pmfs_transaction_t *pmfs_current_transaction(void) 493 | { 494 | return (pmfs_transaction_t *)current->journal_info; 495 | } 496 | 497 | static int pmfs_free_logentries(struct super_block *sb, int max_log_entries) 498 | { 499 | int freed_entries = 0; 500 | 501 | freed_entries = pmfs_clean_journal(sb, false, 0); 502 | return LOGENTRY_SIZE * freed_entries; 503 | } 504 | 505 | pmfs_transaction_t *pmfs_new_transaction(struct super_block *sb, 506 | int max_log_entries) 507 | { 508 | pmfs_journal_t *journal = pmfs_get_journal(sb); 509 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 510 | pmfs_transaction_t *trans; 511 | uint32_t head, tail, req_size, avail_size, freed_size; 512 | uint64_t base; 513 | int retry = 0; 514 | timing_t log_time; 515 | #if 0 516 | trans = pmfs_current_transaction(); 517 | 518 | if (trans) { 519 | BUG_ON(trans->t_journal != journal); 520 | return trans; 521 | } 522 | #endif 523 | /* If it is an undo log, need one more log-entry for commit record */ 524 | PMFS_START_TIMING(new_trans_t, log_time); 525 | 526 | if (!sbi->redo_log) 527 | max_log_entries++; 528 | 529 | trans = pmfs_alloc_transaction(); 530 | if (!trans) 531 | return ERR_PTR(-ENOMEM); 532 | memset(trans, 0, sizeof(*trans)); 533 | 534 | trans->num_used = 0; 535 | trans->num_entries = max_log_entries; 536 | trans->t_journal = journal; 537 | req_size = max_log_entries << LESIZE_SHIFT; 538 | 539 | mutex_lock(&sbi->journal_mutex); 540 | 541 | tail = le32_to_cpu(journal->tail); 542 | head = le32_to_cpu(journal->head); 543 | trans->transaction_id = sbi->next_transaction_id++; 544 | again: 545 | trans->gen_id = le16_to_cpu(journal->gen_id); 546 | avail_size = (tail >= head) ? 
547 | (sbi->jsize - (tail - head)) : (head - tail); 548 | avail_size = avail_size - LOGENTRY_SIZE; 549 | 550 | if (avail_size < req_size) { 551 | /* run the log cleaner function to free some log entries */ 552 | freed_size = 0; 553 | for (retry = 0; retry < 3; retry++) { 554 | freed_size += pmfs_free_logentries(sb, 555 | max_log_entries); 556 | if ((avail_size + freed_size) >= req_size) 557 | break; 558 | } 559 | 560 | if ((avail_size + freed_size) < req_size) 561 | goto journal_full; 562 | } 563 | base = le64_to_cpu(journal->base) + tail; 564 | tail = tail + req_size; 565 | /* journal wraparound because of this transaction allocation. 566 | * start the transaction from the beginning of the journal so 567 | * that we don't have any wraparound within a transaction */ 568 | pmfs_memunlock_range(sb, journal, sizeof(*journal)); 569 | if (tail >= sbi->jsize) { 570 | u64 *ptr; 571 | tail = 0; 572 | ptr = (u64 *)&journal->tail; 573 | /* writing 8-bytes atomically setting tail to 0 */ 574 | set_64bit(ptr, (__force u64)cpu_to_le64((u64)next_gen_id( 575 | le16_to_cpu(journal->gen_id)) << 32)); 576 | pmfs_memlock_range(sb, journal, sizeof(*journal)); 577 | pmfs_dbg_trans("journal wrapped. 
tail %x gid %d cur tid %d\n", 578 | le32_to_cpu(journal->tail),le16_to_cpu(journal->gen_id), 579 | sbi->next_transaction_id - 1); 580 | goto again; 581 | } else { 582 | journal->tail = cpu_to_le32(tail); 583 | pmfs_memlock_range(sb, journal, sizeof(*journal)); 584 | } 585 | pmfs_flush_buffer(&journal->tail, sizeof(u64), false); 586 | mutex_unlock(&sbi->journal_mutex); 587 | 588 | avail_size = avail_size - req_size; 589 | /* wake up the log cleaner if required */ 590 | if ((sbi->jsize - avail_size) > (sbi->jsize >> 3)) 591 | wakeup_log_cleaner(sbi); 592 | 593 | pmfs_dbg_trans("new transaction tid %d nle %d avl sz %x sa %llx\n", 594 | trans->transaction_id, max_log_entries, avail_size, base); 595 | trans->start_addr = pmfs_get_block(sb, base); 596 | 597 | trans->parent = (pmfs_transaction_t *)current->journal_info; 598 | current->journal_info = trans; 599 | PMFS_END_TIMING(new_trans_t, log_time); 600 | return trans; 601 | journal_full: 602 | mutex_unlock(&sbi->journal_mutex); 603 | pmfs_err(sb, "Journal full. 
base %llx sz %x head:tail %x:%x ncl %x\n", 604 | le64_to_cpu(journal->base), le32_to_cpu(journal->size), 605 | le32_to_cpu(journal->head), le32_to_cpu(journal->tail), 606 | max_log_entries); 607 | pmfs_err(sb, "avail size %u, freed size %u, request size %u\n", 608 | avail_size, freed_size, req_size); 609 | pmfs_free_transaction(trans); 610 | PMFS_END_TIMING(new_trans_t, log_time); 611 | return ERR_PTR(-EAGAIN); 612 | } 613 | 614 | static inline void pmfs_commit_logentry(struct super_block *sb, 615 | pmfs_transaction_t *trans, pmfs_logentry_t *le) 616 | { 617 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 618 | if (sbi->redo_log) { 619 | /* Redo Log */ 620 | PERSISTENT_MARK(); 621 | PERSISTENT_BARRIER(); 622 | /* Atomically write the commit type */ 623 | le->type |= LE_COMMIT; 624 | barrier(); 625 | /* Atomically make the log entry valid */ 626 | le->gen_id = cpu_to_le16(trans->gen_id); 627 | pmfs_flush_buffer(le, LOGENTRY_SIZE, false); 628 | PERSISTENT_MARK(); 629 | PERSISTENT_BARRIER(); 630 | /* Update the FS in place */ 631 | pmfs_flush_transaction(sb, trans); 632 | } else { 633 | /* Undo Log */ 634 | /* Update the FS in place: currently already done. so 635 | * only need to clflush */ 636 | pmfs_flush_transaction(sb, trans); 637 | PERSISTENT_MARK(); 638 | PERSISTENT_BARRIER(); 639 | /* Atomically write the commit type */ 640 | le->type |= LE_COMMIT; 641 | barrier(); 642 | /* Atomically make the log entry valid */ 643 | le->gen_id = cpu_to_le16(trans->gen_id); 644 | pmfs_flush_buffer(le, LOGENTRY_SIZE, true); 645 | } 646 | } 647 | 648 | int pmfs_add_logentry(struct super_block *sb, 649 | pmfs_transaction_t *trans, void *addr, uint16_t size, u8 type) 650 | { 651 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 652 | pmfs_logentry_t *le; 653 | int num_les = 0, i; 654 | uint64_t le_start = size ? 
pmfs_get_addr_off(sbi, addr) : 0; 655 | uint8_t le_size; 656 | timing_t add_log_time; 657 | 658 | if (trans == NULL) 659 | return -EINVAL; 660 | 661 | PMFS_START_TIMING(add_log_t, add_log_time); 662 | le = trans->start_addr + trans->num_used; 663 | 664 | if (size == 0) { 665 | /* At least one log entry required for commit/abort log entry */ 666 | if ((type & LE_COMMIT) || (type & LE_ABORT)) 667 | num_les = 1; 668 | } else 669 | num_les = (size + sizeof(le->data) - 1)/sizeof(le->data); 670 | 671 | pmfs_dbg_trans("add le id %d size %x, num_les %d tail %x le %p\n", 672 | trans->transaction_id, size, trans->num_entries, 673 | trans->num_used, le); 674 | 675 | if ((trans->num_used + num_les) > trans->num_entries) { 676 | pmfs_err(sb, "Log Entry full. tid %x ne %x tail %x size %x\n", 677 | trans->transaction_id, trans->num_entries, 678 | trans->num_used, size); 679 | dump_transaction(sbi, trans); 680 | dump_stack(); 681 | return -ENOMEM; 682 | } 683 | 684 | pmfs_memunlock_range(sb, le, sizeof(*le) * num_les); 685 | for (i = 0; i < num_les; i++) { 686 | le->addr_offset = cpu_to_le64(le_start); 687 | le->transaction_id = cpu_to_le32(trans->transaction_id); 688 | le_size = (i == (num_les - 1)) ? 
size : sizeof(le->data); 689 | le->size = le_size; 690 | size -= le_size; 691 | if (le_size) 692 | memcpy(le->data, addr, le_size); 693 | le->type = type; 694 | 695 | if (i == 0 && trans->num_used == 0) 696 | le->type |= LE_START; 697 | trans->num_used++; 698 | 699 | /* handle special log entry */ 700 | if (i == (num_les - 1) && (type & LE_COMMIT)) { 701 | pmfs_commit_logentry(sb, trans, le); 702 | pmfs_memlock_range(sb, le, sizeof(*le) * num_les); 703 | PMFS_END_TIMING(add_log_t, add_log_time); 704 | return 0; 705 | } 706 | /* put a compile time barrier so that compiler doesn't reorder 707 | * the writes to the log entry */ 708 | barrier(); 709 | 710 | /* Atomically make the log entry valid */ 711 | le->gen_id = cpu_to_le16(trans->gen_id); 712 | pmfs_flush_buffer(le, LOGENTRY_SIZE, false); 713 | 714 | addr += le_size; 715 | le_start += le_size; 716 | le++; 717 | } 718 | pmfs_memlock_range(sb, le, sizeof(*le) * num_les); 719 | if (!sbi->redo_log) { 720 | PERSISTENT_MARK(); 721 | PERSISTENT_BARRIER(); 722 | } 723 | PMFS_END_TIMING(add_log_t, add_log_time); 724 | return 0; 725 | } 726 | 727 | int pmfs_commit_transaction(struct super_block *sb, 728 | pmfs_transaction_t *trans) 729 | { 730 | timing_t commit_time; 731 | 732 | if (trans == NULL) 733 | return 0; 734 | /* Add the commit log-entry */ 735 | pmfs_add_logentry(sb, trans, NULL, 0, LE_COMMIT); 736 | 737 | PMFS_START_TIMING(commit_trans_t, commit_time); 738 | pmfs_dbg_trans("completing transaction for id %d\n", 739 | trans->transaction_id); 740 | 741 | current->journal_info = trans->parent; 742 | pmfs_free_transaction(trans); 743 | PMFS_END_TIMING(commit_trans_t, commit_time); 744 | return 0; 745 | } 746 | 747 | int pmfs_abort_transaction(struct super_block *sb, pmfs_transaction_t *trans) 748 | { 749 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 750 | 751 | if (trans == NULL) 752 | return 0; 753 | pmfs_dbg_trans("abort trans for tid %x sa %p numle %d tail %x gen %d\n", 754 | trans->transaction_id, trans->start_addr, 
trans->num_entries, 755 | trans->num_used, trans->gen_id); 756 | dump_transaction(sbi, trans); 757 | /*dump_stack();*/ 758 | 759 | if (!sbi->redo_log) { 760 | /* Undo Log */ 761 | pmfs_undo_transaction(sb, trans); 762 | PERSISTENT_MARK(); 763 | PERSISTENT_BARRIER(); 764 | } 765 | /* add a abort log entry */ 766 | pmfs_add_logentry(sb, trans, NULL, 0, LE_ABORT); 767 | current->journal_info = trans->parent; 768 | pmfs_free_transaction(trans); 769 | return 0; 770 | } 771 | 772 | static void invalidate_remaining_journal(struct super_block *sb, 773 | void *journal_vaddr, uint32_t jtail, uint32_t jsize) 774 | { 775 | pmfs_logentry_t *le = (pmfs_logentry_t *)(journal_vaddr + jtail); 776 | void *start = le; 777 | 778 | pmfs_memunlock_range(sb, start, jsize - jtail); 779 | while (jtail < jsize) { 780 | invalidate_gen_id(le); 781 | le++; 782 | jtail += LOGENTRY_SIZE; 783 | } 784 | pmfs_memlock_range(sb, start, jsize - jtail); 785 | } 786 | 787 | /* we need to increase the gen_id to invalidate all the journal log 788 | * entries. This is because after the recovery, we may still have some 789 | * valid log entries beyond the tail (before power failure, they became 790 | * persistent before the journal tail could become persistent. 791 | * should gen_id and head be updated atomically? not necessarily? 
we 792 | * can update gen_id before journal head because gen_id and head are in 793 | * the same cacheline */ 794 | static void pmfs_forward_journal(struct super_block *sb, struct pmfs_sb_info 795 | *sbi, pmfs_journal_t *journal) 796 | { 797 | uint16_t gen_id = le16_to_cpu(journal->gen_id); 798 | /* handle gen_id wrap around */ 799 | if (gen_id == MAX_GEN_ID) { 800 | invalidate_remaining_journal(sb, sbi->journal_base_addr, 801 | le32_to_cpu(journal->tail), sbi->jsize); 802 | } 803 | PERSISTENT_MARK(); 804 | gen_id = next_gen_id(gen_id); 805 | /* make all changes persistent before advancing gen_id and head */ 806 | PERSISTENT_BARRIER(); 807 | pmfs_memunlock_range(sb, journal, sizeof(*journal)); 808 | journal->gen_id = cpu_to_le16(gen_id); 809 | barrier(); 810 | journal->head = journal->tail; 811 | pmfs_memlock_range(sb, journal, sizeof(*journal)); 812 | pmfs_flush_buffer(journal, sizeof(*journal), false); 813 | } 814 | 815 | static int pmfs_recover_undo_journal(struct super_block *sb) 816 | { 817 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 818 | pmfs_journal_t *journal = pmfs_get_journal(sb); 819 | uint32_t tail = le32_to_cpu(journal->tail); 820 | uint32_t head = le32_to_cpu(journal->head); 821 | uint16_t gen_id = le16_to_cpu(journal->gen_id); 822 | pmfs_logentry_t *le; 823 | 824 | while (head != tail) { 825 | /* handle journal wraparound */ 826 | if (tail == 0) 827 | gen_id = prev_gen_id(gen_id); 828 | tail = prev_log_entry(sbi->jsize, tail); 829 | 830 | le = (pmfs_logentry_t *)(sbi->journal_base_addr + tail); 831 | if (gen_id == le16_to_cpu(le->gen_id)) { 832 | tail = pmfs_recover_transaction(sb, head, tail, le); 833 | } else { 834 | if (gen_id == MAX_GEN_ID) { 835 | pmfs_memunlock_range(sb, le, sizeof(*le)); 836 | invalidate_gen_id(le); 837 | pmfs_memlock_range(sb, le, sizeof(*le)); 838 | } 839 | } 840 | } 841 | pmfs_forward_journal(sb, sbi, journal); 842 | PERSISTENT_MARK(); 843 | PERSISTENT_BARRIER(); 844 | return 0; 845 | } 846 | 847 | static int 
pmfs_recover_redo_journal(struct super_block *sb) 848 | { 849 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 850 | pmfs_journal_t *journal = pmfs_get_journal(sb); 851 | uint32_t tail = le32_to_cpu(journal->tail); 852 | uint32_t head = le32_to_cpu(journal->head); 853 | uint16_t gen_id = le16_to_cpu(journal->gen_id); 854 | int processed = 0; 855 | pmfs_logentry_t *le; 856 | 857 | /* journal wrapped around. so head points to previous generation id */ 858 | if (tail < head) 859 | gen_id = prev_gen_id(gen_id); 860 | 861 | while (head != tail) { 862 | le = (pmfs_logentry_t *)(sbi->journal_base_addr + head); 863 | if (gen_id == le16_to_cpu(le->gen_id)) { 864 | head = pmfs_process_transaction(sb, head, tail, 865 | le, true, &processed); 866 | } else { 867 | if (gen_id == MAX_GEN_ID) { 868 | pmfs_memunlock_range(sb, le, sizeof(*le)); 869 | invalidate_gen_id(le); 870 | pmfs_memlock_range(sb, le, sizeof(*le)); 871 | } 872 | head = next_log_entry(sbi->jsize, head); 873 | } 874 | /* handle journal wraparound */ 875 | if (head == 0) 876 | gen_id = next_gen_id(gen_id); 877 | } 878 | pmfs_forward_journal(sb, sbi, journal); 879 | PERSISTENT_MARK(); 880 | PERSISTENT_BARRIER(); 881 | return 0; 882 | } 883 | 884 | int pmfs_recover_journal(struct super_block *sb) 885 | { 886 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 887 | pmfs_journal_t *journal = pmfs_get_journal(sb); 888 | uint32_t tail = le32_to_cpu(journal->tail); 889 | uint32_t head = le32_to_cpu(journal->head); 890 | uint16_t gen_id = le16_to_cpu(journal->gen_id); 891 | 892 | /* is the journal empty? true if unmounted properly. */ 893 | if (head == tail) 894 | return 0; 895 | pmfs_dbg("PMFS: journal recovery. 
head:tail %x:%x gen_id %d\n", 896 | head, tail, gen_id); 897 | if (sbi->redo_log) 898 | pmfs_recover_redo_journal(sb); 899 | else 900 | pmfs_recover_undo_journal(sb); 901 | return 0; 902 | } 903 | 904 | -------------------------------------------------------------------------------- /super.c: -------------------------------------------------------------------------------- 1 | /* 2 | * BRIEF DESCRIPTION 3 | * 4 | * Super block operations. 5 | * 6 | * Copyright 2012-2013 Intel Corporation 7 | * Copyright 2009-2011 Marco Stornelli 8 | * Copyright 2003 Sony Corporation 9 | * Copyright 2003 Matsushita Electric Industrial Co., Ltd. 10 | * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam 11 | * This file is licensed under the terms of the GNU General Public 12 | * License version 2. This program is licensed "as is" without any 13 | * warranty of any kind, whether express or implied. 14 | */ 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include "pmfs.h" 37 | 38 | int measure_timing = 0; 39 | int support_clwb = 0; 40 | int support_pcommit = 0; 41 | 42 | module_param(measure_timing, int, S_IRUGO); 43 | MODULE_PARM_DESC(measure_timing, "Timing measurement"); 44 | 45 | static struct super_operations pmfs_sops; 46 | static const struct export_operations pmfs_export_ops; 47 | static struct kmem_cache *pmfs_inode_cachep; 48 | static struct kmem_cache *pmfs_blocknode_cachep; 49 | static struct kmem_cache *pmfs_transaction_cachep; 50 | /* FIXME: should the following variable be one per PMFS instance? 
*/
unsigned int pmfs_dbgmask = 0;

#ifdef CONFIG_PMFS_TEST
static void *first_pmfs_super;

/* Test hook: expose the superblock address of the first PMFS mount. */
struct pmfs_super_block *get_pmfs_super(void)
{
	return (struct pmfs_super_block *)first_pmfs_super;
}
EXPORT_SYMBOL(get_pmfs_super);
#endif

/* Print an error message and enforce the errors= mount policy:
 * panic or remount read-only depending on the mount options. */
void pmfs_error_mng(struct super_block *sb, const char *fmt, ...)
{
	va_list args;

	printk("pmfs error: ");
	va_start(args, fmt);
	vprintk(fmt, args);
	va_end(args);

	if (test_opt(sb, ERRORS_PANIC))
		panic("pmfs: panic from previous error\n");
	if (test_opt(sb, ERRORS_RO)) {
		printk(KERN_CRIT "pmfs err: remounting filesystem read-only");
		sb->s_flags |= MS_RDONLY;
	}
}

/* Record the (already validated, power-of-two) block size in the VFS
 * super_block. */
static void pmfs_set_blocksize(struct super_block *sb, unsigned long size)
{
	int bits;

	/*
	 * We've already validated the user input and the value here must be
	 * between PMFS_MAX_BLOCK_SIZE and PMFS_MIN_BLOCK_SIZE
	 * and it must be a power of 2.
	 */
	bits = fls(size) - 1;
	sb->s_blocksize_bits = bits;
	sb->s_blocksize = (1 << bits);
}

/* Non-zero when the hugeioremap mount option is in effect. */
static inline int pmfs_has_huge_ioremap(struct super_block *sb)
{
	struct pmfs_sb_info *sbi = (struct pmfs_sb_info *)sb->s_fs_info;

	return sbi->s_mount_opt & PMFS_MOUNT_HUGEIOREMAP;
}

/*
 * Map the backing DAX device and record its virtual address, physical
 * address and size in sbi. Returns 0 on success or a negative errno.
 * NOTE(review): the dax_device reference from fs_dax_get_by_host() does
 * not appear to be released here or on the error paths — confirm against
 * the teardown code elsewhere in this file.
 */
static int pmfs_get_block_info(struct super_block *sb,
	struct pmfs_sb_info *sbi)
{
	struct dax_device *dax_dev;
	void *virt_addr = NULL;
	pfn_t __pfn_t;
	long size;
	int ret;

	ret = bdev_dax_supported(sb, PAGE_SIZE);
	if (ret) {
		pmfs_err(sb, "device does not support DAX: %d\n", ret);
		return ret;
	}

	sbi->s_bdev = sb->s_bdev;
	dax_dev = fs_dax_get_by_host(sb->s_bdev->bd_disk->disk_name);
	if (!dax_dev) {
		pmfs_err(sb, "Couldn't retrieve DAX device\n");
		return -EINVAL;
	}

	/* ask for the whole device in one mapping */
	size = dax_direct_access(dax_dev, 0, LONG_MAX / PAGE_SIZE,
			&virt_addr, &__pfn_t) * PAGE_SIZE;
	if (size <= 0) {
		pmfs_err(sb, "direct_access failed\n");
		return -EINVAL;
	}

	sbi->virt_addr = virt_addr;
	sbi->phys_addr = pfn_t_to_pfn(__pfn_t) << PAGE_SHIFT;
	sbi->initsize = size;

	return 0;
}

/* Largest file size supported by the 3-level (9 bits per level) block
 * tree with `bits`-sized blocks, clamped to MAX_LFS_FILESIZE. */
static loff_t pmfs_max_size(int bits)
{
	loff_t res;

	res = (1ULL << (3 * 9 + bits)) - 1;

	if (res > MAX_LFS_FILESIZE)
		res = MAX_LFS_FILESIZE;

	pmfs_dbg_verbose("max file size %llu bytes\n", res);
	return res;
}

/* Mount option tokens; matched by the tokens table below. */
enum {
	Opt_bpi, Opt_init, Opt_jsize,
	Opt_num_inodes, Opt_mode, Opt_uid,
	Opt_gid, Opt_blocksize, Opt_wprotect, Opt_wprotectold,
	Opt_err_cont, Opt_err_panic, Opt_err_ro,
	Opt_hugemmap, Opt_nohugeioremap, Opt_dbgmask, Opt_bs, Opt_err
};

static const match_table_t tokens = {
	{ Opt_bpi,		"bpi=%u" },
	{ Opt_init,		"init" },
	{ Opt_jsize,		"jsize=%s" },
	{ Opt_num_inodes,	"num_inodes=%u" },
	{ Opt_mode,		"mode=%o" },
	{ Opt_uid,		"uid=%u" },
	{ Opt_gid,		"gid=%u" },
	{ Opt_wprotect,		"wprotect" },
	{ Opt_wprotectold,	"wprotectold" },
	{ Opt_err_cont,		"errors=continue" },
	{ Opt_err_panic,	"errors=panic" },
	{ Opt_err_ro,		"errors=remount-ro" },
	{ Opt_hugemmap,		"hugemmap" },
	{ Opt_nohugeioremap,	"nohugeioremap" },
	{ Opt_dbgmask,		"dbgmask=%u" },
	{ Opt_bs,		"backing_dev=%s" },
	{ Opt_err,		NULL },
};

/*
 * Parse the comma-separated mount option string into sbi. Options that
 * are not valid on remount are rejected when remount is true. Returns 0
 * on success, -EINVAL on a bad option or value.
 */
static int pmfs_parse_options(char *options, struct pmfs_sb_info *sbi,
	bool remount)
{
	char *p, *rest;
	substring_t args[MAX_OPT_ARGS];
	int option;

	if (!options)
		return 0;

	while ((p = strsep(&options, ",")) != NULL) {
		int token;
		if (!*p)
			continue;

		token = match_token(p, tokens, args);
		switch (token) {
		case Opt_bpi:
			if (remount)
				goto bad_opt;
			if (match_int(&args[0], &option))
				goto bad_val;
			sbi->bpi = option;
			break;
		case Opt_uid:
			if (remount)
				goto bad_opt;
			if (match_int(&args[0], &option))
				goto bad_val;
			sbi->uid = make_kuid(current_user_ns(), option);
			break;
		case Opt_gid:
			if (match_int(&args[0], &option))
				goto bad_val;
			sbi->gid = make_kgid(current_user_ns(), option);
			break;
		case Opt_mode:
			if (match_octal(&args[0], &option))
				goto bad_val;
			sbi->mode = option & 01777U;
			break;
		case Opt_init:
			if (remount)
				goto bad_opt;
			set_opt(sbi->s_mount_opt, FORMAT);
			break;
		case Opt_jsize:
			if (remount)
				goto bad_opt;
			/* memparse() will accept a K/M/G without a digit */
			if (!isdigit(*args[0].from))
				goto bad_val;
			sbi->jsize = memparse(args[0].from, &rest);
			/* make sure journal size is integer power of 2 */
			if (sbi->jsize & (sbi->jsize - 1) ||
				sbi->jsize < PMFS_MINIMUM_JOURNAL_SIZE) {
				pmfs_dbg("Invalid jsize: "
					"must be whole power of 2 & >= 64KB\n");
				goto bad_val;
			}
			break;
		case Opt_num_inodes:
			if (remount)
				goto bad_opt;
			if (match_int(&args[0], &option))
				goto bad_val;
			sbi->num_inodes = option;
			break;
		case Opt_err_panic:
			clear_opt(sbi->s_mount_opt, ERRORS_CONT);
			clear_opt(sbi->s_mount_opt, ERRORS_RO);
			set_opt(sbi->s_mount_opt, ERRORS_PANIC);
			break;
		case Opt_err_ro:
			clear_opt(sbi->s_mount_opt, ERRORS_CONT);
			clear_opt(sbi->s_mount_opt, ERRORS_PANIC);
			set_opt(sbi->s_mount_opt, ERRORS_RO);
			break;
		case Opt_err_cont:
			clear_opt(sbi->s_mount_opt, ERRORS_RO);
			clear_opt(sbi->s_mount_opt, ERRORS_PANIC);
			set_opt(sbi->s_mount_opt, ERRORS_CONT);
			break;
		case Opt_wprotect:
			if (remount)
				goto bad_opt;
			set_opt(sbi->s_mount_opt, PROTECT);
			pmfs_info
				("PMFS: Enabling new Write Protection (CR0.WP)\n");
			break;
		case Opt_wprotectold:
			if (remount)
				goto bad_opt;
			set_opt(sbi->s_mount_opt, PROTECT_OLD);
			pmfs_info
				("PMFS: Enabling old Write Protection (PAGE RW Bit)\n");
			break;
		case Opt_hugemmap:
			if (remount)
				goto bad_opt;
			set_opt(sbi->s_mount_opt, HUGEMMAP);
			pmfs_info("PMFS: Enabling huge mappings for mmap\n");
			break;
		case Opt_nohugeioremap:
			if (remount)
				goto bad_opt;
			clear_opt(sbi->s_mount_opt, HUGEIOREMAP);
			pmfs_info("PMFS: Disabling huge ioremap\n");
			break;
		case Opt_dbgmask:
			if (match_int(&args[0], &option))
				goto bad_val;
			pmfs_dbgmask = option;
			break;
		default: {
			/* note: Opt_bs / Opt_blocksize have no case and fall
			 * through here, so those options are rejected */
			goto bad_opt;
		}
		}
	}

	return 0;

bad_val:
	printk(KERN_INFO "Bad value '%s' for mount option '%s'\n", args[0].from,
		p);
	return -EINVAL;
bad_opt:
printk(KERN_INFO "Bad mount option: \"%s\"\n", p);
	return -EINVAL;
}

/* True when `size` can hold superblock + root directory + inode table +
 * journal; used before formatting. */
static bool pmfs_check_size (struct super_block *sb, unsigned long size)
{
	struct pmfs_sb_info *sbi = PMFS_SB(sb);
	unsigned long minimum_size, num_blocks;

	/* space required for super block and root directory */
	minimum_size = 2 << sb->s_blocksize_bits;

	/* space required for inode table */
	if (sbi->num_inodes > 0)
		num_blocks = (sbi->num_inodes >>
			(sb->s_blocksize_bits - PMFS_INODE_BITS)) + 1;
	else
		num_blocks = 1;
	minimum_size += (num_blocks << sb->s_blocksize_bits);
	/* space required for journal */
	minimum_size += sbi->jsize;

	if (size < minimum_size)
		return false;

	return true;
}


/*
 * Format path (mount -o init): lay out the superblock, journal
 * descriptor and inode table inside the reserved superblock area, place
 * the journal data after the two superblock copies, initialize the free
 * block map, and create the root directory with "." and ".." entries.
 * Returns the root pmfs inode or an ERR_PTR.
 */
static struct pmfs_inode *pmfs_init(struct super_block *sb,
	unsigned long size)
{
	unsigned long blocksize;
	u64 journal_meta_start, journal_data_start, inode_table_start;
	struct pmfs_inode *root_i;
	struct pmfs_super_block *super;
	struct pmfs_sb_info *sbi = PMFS_SB(sb);
	struct pmfs_direntry *de;
	unsigned long blocknr;

	pmfs_info("creating an empty pmfs of size %lu\n", size);
	sbi->block_start = (unsigned long)0;
	sbi->block_end = ((unsigned long)(size) >> PAGE_SHIFT);
	sbi->num_free_blocks = ((unsigned long)(size) >> PAGE_SHIFT);

	if (!sbi->virt_addr) {
		printk(KERN_ERR "ioremap of the pmfs image failed(1)\n");
		return ERR_PTR(-EINVAL);
	}
#ifdef CONFIG_PMFS_TEST
	if (!first_pmfs_super)
		first_pmfs_super = sbi->virt_addr;
#endif

	pmfs_dbg_verbose("pmfs: Default block size set to 4K\n");
	blocksize = sbi->blocksize = PMFS_DEF_BLOCK_SIZE_4K;

	pmfs_set_blocksize(sb, blocksize);
	blocksize = sb->s_blocksize;

	/* keep sbi->blocksize in sync with what the VFS accepted */
	if (sbi->blocksize && sbi->blocksize != blocksize)
		sbi->blocksize = blocksize;

	if (!pmfs_check_size(sb, size)) {
		pmfs_dbg("Specified PMFS size too small 0x%lx. Either increase"
			" PMFS size, or reduce num. of inodes (minimum 32)"
			" or journal size (minimum 64KB)\n", size);
		return ERR_PTR(-EINVAL);
	}

	/* cacheline-align the journal descriptor and inode table inside
	 * the superblock area */
	journal_meta_start = sizeof(struct pmfs_super_block);
	journal_meta_start = (journal_meta_start + CACHELINE_SIZE - 1) &
		~(CACHELINE_SIZE - 1);
	inode_table_start = journal_meta_start + sizeof(pmfs_journal_t);
	inode_table_start = (inode_table_start + CACHELINE_SIZE - 1) &
		~(CACHELINE_SIZE - 1);

	if ((inode_table_start + sizeof(struct pmfs_inode)) > PMFS_SB_SIZE) {
		pmfs_dbg("PMFS super block defined too small. defined 0x%x, "
			"required 0x%llx\n", PMFS_SB_SIZE,
			inode_table_start + sizeof(struct pmfs_inode));
		return ERR_PTR(-EINVAL);
	}

	/* journal data lives after the two superblock copies, block-aligned */
	journal_data_start = PMFS_SB_SIZE * 2;
	journal_data_start = (journal_data_start + blocksize - 1) &
		~(blocksize - 1);

	pmfs_dbg_verbose("journal meta start %llx data start 0x%llx, "
		"journal size 0x%x, inode_table 0x%llx\n", journal_meta_start,
		journal_data_start, sbi->jsize, inode_table_start);
	pmfs_dbg_verbose("max file name len %d\n", (unsigned int)PMFS_NAME_LEN);

	super = pmfs_get_super(sb);
	pmfs_memunlock_range(sb, super, journal_data_start);

	/* clear out super-block and inode table */
	memset_nt(super, 0, journal_data_start);
	super->s_size = cpu_to_le64(size);
	super->s_blocksize = cpu_to_le32(blocksize);
	super->s_magic = cpu_to_le16(PMFS_SUPER_MAGIC);
	super->s_journal_offset = cpu_to_le64(journal_meta_start);
	super->s_inode_table_offset = cpu_to_le64(inode_table_start);

	pmfs_init_blockmap(sb, journal_data_start + sbi->jsize);
	pmfs_memlock_range(sb, super, journal_data_start);

	if (pmfs_journal_hard_init(sb, journal_data_start, sbi->jsize) < 0) {
		printk(KERN_ERR "Journal hard initialization failed\n");
		return ERR_PTR(-EINVAL);
	}

	if (pmfs_init_inode_table(sb) < 0)
		return ERR_PTR(-EINVAL);

	pmfs_memunlock_range(sb, super, PMFS_SB_SIZE*2);
	pmfs_sync_super(super);
	pmfs_memlock_range(sb, super, PMFS_SB_SIZE*2);

	pmfs_flush_buffer(super, PMFS_SB_SIZE, false);
	pmfs_flush_buffer((char *)super + PMFS_SB_SIZE, sizeof(*super), false);

	/* allocate the root directory's data block */
	pmfs_new_block(sb, &blocknr, PMFS_BLOCK_TYPE_4K, 1);

	root_i = pmfs_get_inode(sb, PMFS_ROOT_INO);

	pmfs_memunlock_inode(sb, root_i);
	root_i->i_mode = cpu_to_le16(sbi->mode | S_IFDIR);
	root_i->i_uid = cpu_to_le32(from_kuid(&init_user_ns, sbi->uid));
	root_i->i_gid = cpu_to_le32(from_kgid(&init_user_ns, sbi->gid));
	root_i->i_links_count = cpu_to_le16(2);
	root_i->i_blk_type = PMFS_BLOCK_TYPE_4K;
	root_i->i_flags = 0;
	root_i->i_blocks = cpu_to_le64(1);
	root_i->i_size = cpu_to_le64(sb->s_blocksize);
	root_i->i_atime = root_i->i_mtime = root_i->i_ctime =
		cpu_to_le32(get_seconds());
	root_i->root = cpu_to_le64(pmfs_get_block_off(sb, blocknr,
		PMFS_BLOCK_TYPE_4K));
	root_i->height = 0;
	/* pmfs_sync_inode(root_i); */
	pmfs_memlock_inode(sb, root_i);
	pmfs_flush_buffer(root_i, sizeof(*root_i), false);
	de = (struct pmfs_direntry *)
		pmfs_get_block(sb, pmfs_get_block_off(sb, blocknr, PMFS_BLOCK_TYPE_4K));

	/* write "." and ".."; ".." consumes the rest of the block */
	pmfs_memunlock_range(sb, de, sb->s_blocksize);
	de->ino = cpu_to_le64(PMFS_ROOT_INO);
	de->name_len = 1;
	de->de_len = cpu_to_le16(PMFS_DIR_REC_LEN(de->name_len));
	strcpy(de->name, ".");
	de = (struct pmfs_direntry *)((char *)de + le16_to_cpu(de->de_len));
	de->ino = cpu_to_le64(PMFS_ROOT_INO);
	de->de_len = cpu_to_le16(sb->s_blocksize - PMFS_DIR_REC_LEN(1));
	de->name_len = 2;
	strcpy(de->name, "..");
	pmfs_memlock_range(sb, de, sb->s_blocksize);
	pmfs_flush_buffer(de, PMFS_DIR_REC_LEN(2), false);
	PERSISTENT_MARK();
	PERSISTENT_BARRIER();
	return root_i;
}

/* Defaults applied before option parsing: huge ioremap on, errors=
 * continue, default journal size. */
static inline void set_default_opts(struct pmfs_sb_info *sbi)
{
	/* set_opt(sbi->s_mount_opt, PROTECT); */
	set_opt(sbi->s_mount_opt, HUGEIOREMAP);
	set_opt(sbi->s_mount_opt, ERRORS_CONT);
	sbi->jsize = PMFS_DEFAULT_JOURNAL_SIZE;
}

/* Sanity-check the root inode; currently only warns if it is not a
 * directory. */
static void pmfs_root_check(struct super_block *sb, struct pmfs_inode *root_pi)
{
	/*
	 * if (root_pi->i_d.d_next) {
	 *	pmfs_warn("root->next not NULL, trying to fix\n");
	 *	goto fail1;
	 * }
	 */
	if (!S_ISDIR(le16_to_cpu(root_pi->i_mode)))
		pmfs_warn("root is not a directory!\n");
#if 0
	if (pmfs_calc_checksum((u8 *)root_pi, PMFS_INODE_SIZE)) {
		pmfs_dbg("checksum error in root inode, trying to fix\n");
		goto fail3;
	}
#endif
}

/*
 * Validate the primary superblock's magic and checksum; on failure try
 * to repair it from the redundant copy stored PMFS_SB_SIZE bytes after
 * it. Returns 1 when a valid superblock is available, 0 otherwise.
 * sb may be NULL (e.g. probing before the VFS super exists), in which
 * case the write-protection toggles are skipped.
 */
int pmfs_check_integrity(struct super_block *sb,
	struct pmfs_super_block *super)
{
	struct pmfs_super_block *super_redund;

	super_redund =
		(struct pmfs_super_block *)((char *)super + PMFS_SB_SIZE);

	/* Do sanity checks on the superblock */
	if (le16_to_cpu(super->s_magic) != PMFS_SUPER_MAGIC) {
		if (le16_to_cpu(super_redund->s_magic) != PMFS_SUPER_MAGIC) {
			printk(KERN_ERR "Can't find a valid pmfs partition\n");
			goto out;
		} else {
			pmfs_warn
				("Error in super block: try to repair it with "
				"the redundant copy");
			/* Try to auto-recover the super block */
			if (sb)
				pmfs_memunlock_super(sb, super);
			memcpy(super, super_redund,
				sizeof(struct pmfs_super_block));
			if (sb)
				pmfs_memlock_super(sb, super);
			pmfs_flush_buffer(super, sizeof(*super), false);
			pmfs_flush_buffer((char *)super + PMFS_SB_SIZE,
				sizeof(*super), false);

		}
	}

	/* Read the superblock */
	if (pmfs_calc_checksum((u8 *)super, PMFS_SB_STATIC_SIZE(super))) {
		if (pmfs_calc_checksum((u8 *)super_redund,
				PMFS_SB_STATIC_SIZE(super_redund))) {
			printk(KERN_ERR "checksum error in super block\n");
			goto out;
		} else {
			pmfs_warn
				("Error in super block: try to repair it with "
				"the redundant copy");
			/* Try to auto-recover the super block */
			if (sb)
				pmfs_memunlock_super(sb, super);
			memcpy(super, super_redund,
				sizeof(struct pmfs_super_block));
			if (sb)
				pmfs_memlock_super(sb, super);
			pmfs_flush_buffer(super, sizeof(*super), false);
			pmfs_flush_buffer((char *)super + PMFS_SB_SIZE,
				sizeof(*super), false);
		}
	}

	return 1;
out:
	return 0;
}

/*
 * Replay the persistent truncate list after a crash: for each linked
 * inode, either complete the interrupted truncate (inode still linked)
 * or let iput() free the unreferenced inode, then clear the list.
 */
static void pmfs_recover_truncate_list(struct super_block *sb)
{
	struct pmfs_inode_truncate_item *head = pmfs_get_truncate_list_head(sb);
	u64 ino_next = le64_to_cpu(head->i_next_truncate);
	struct pmfs_inode *pi;
	struct pmfs_inode_truncate_item *li;
	struct inode *inode;

	if (ino_next == 0)
		return;

	while (ino_next != 0) {
		pi = pmfs_get_inode(sb, ino_next);
		/* the truncate item is stored right after the inode */
		li = (struct pmfs_inode_truncate_item *)(pi + 1);
		inode = pmfs_iget(sb, ino_next);
		if (IS_ERR(inode))
			break;
		pmfs_dbg("Recover ino %llx nlink %d sz %llx:%llx\n", ino_next,
			inode->i_nlink, pi->i_size, li->i_truncatesize);
		if (inode->i_nlink) {
			/* set allocation hint */
			pmfs_set_blocksize_hint(sb, pi,
				le64_to_cpu(li->i_truncatesize));
			pmfs_setsize(inode, le64_to_cpu(li->i_truncatesize));
			pmfs_update_isize(inode, pi);
		} else {
			/* free the inode */
			pmfs_dbg("deleting unreferenced inode %lx\n",
				inode->i_ino);
		}
		iput(inode);
		pmfs_flush_buffer(pi, CACHELINE_SIZE, false);
		ino_next = le64_to_cpu(li->i_next_truncate);
	}
PERSISTENT_MARK(); 588 | PERSISTENT_BARRIER(); 589 | /* reset the truncate_list */ 590 | pmfs_memunlock_range(sb, head, sizeof(*head)); 591 | head->i_next_truncate = 0; 592 | pmfs_memlock_range(sb, head, sizeof(*head)); 593 | pmfs_flush_buffer(head, sizeof(*head), false); 594 | PERSISTENT_MARK(); 595 | PERSISTENT_BARRIER(); 596 | } 597 | 598 | static int pmfs_fill_super(struct super_block *sb, void *data, int silent) 599 | { 600 | struct pmfs_super_block *super; 601 | struct pmfs_inode *root_pi; 602 | struct pmfs_sb_info *sbi = NULL; 603 | struct inode *root_i = NULL; 604 | unsigned long blocksize; 605 | u32 random = 0; 606 | int retval = -EINVAL; 607 | 608 | BUILD_BUG_ON(sizeof(struct pmfs_super_block) > PMFS_SB_SIZE); 609 | BUILD_BUG_ON(sizeof(struct pmfs_inode) > PMFS_INODE_SIZE); 610 | 611 | if (arch_has_pcommit()) { 612 | pmfs_info("arch has PCOMMIT support\n"); 613 | support_pcommit = 1; 614 | } else { 615 | pmfs_info("arch does not have PCOMMIT support\n"); 616 | } 617 | 618 | if (arch_has_clwb()) { 619 | pmfs_info("arch has CLWB support\n"); 620 | support_clwb = 1; 621 | } else { 622 | pmfs_info("arch does not have CLWB support\n"); 623 | } 624 | 625 | sbi = kzalloc(sizeof(struct pmfs_sb_info), GFP_KERNEL); 626 | if (!sbi) 627 | return -ENOMEM; 628 | sb->s_fs_info = sbi; 629 | 630 | set_default_opts(sbi); 631 | 632 | if (pmfs_get_block_info(sb, sbi)) 633 | goto out; 634 | 635 | get_random_bytes(&random, sizeof(u32)); 636 | atomic_set(&sbi->next_generation, random); 637 | 638 | /* Init with default values */ 639 | INIT_LIST_HEAD(&sbi->block_inuse_head); 640 | sbi->mode = (S_IRUGO | S_IXUGO | S_IWUSR); 641 | sbi->uid = current_fsuid(); 642 | sbi->gid = current_fsgid(); 643 | set_opt(sbi->s_mount_opt, XIP); 644 | clear_opt(sbi->s_mount_opt, PROTECT); 645 | set_opt(sbi->s_mount_opt, HUGEIOREMAP); 646 | 647 | INIT_LIST_HEAD(&sbi->s_truncate); 648 | mutex_init(&sbi->s_truncate_lock); 649 | mutex_init(&sbi->inode_table_mutex); 650 | mutex_init(&sbi->s_lock); 651 | 
652 | if (pmfs_parse_options(data, sbi, 0)) 653 | goto out; 654 | 655 | set_opt(sbi->s_mount_opt, MOUNTING); 656 | 657 | /* Init a new pmfs instance */ 658 | if (sbi->s_mount_opt & PMFS_MOUNT_FORMAT) { 659 | root_pi = pmfs_init(sb, sbi->initsize); 660 | if (IS_ERR(root_pi)) 661 | goto out; 662 | super = pmfs_get_super(sb); 663 | goto setup_sb; 664 | } 665 | pmfs_dbg_verbose("checking physical address 0x%016llx for pmfs image\n", 666 | (u64)sbi->phys_addr); 667 | 668 | super = pmfs_get_super(sb); 669 | 670 | if (pmfs_journal_soft_init(sb)) { 671 | retval = -EINVAL; 672 | printk(KERN_ERR "Journal initialization failed\n"); 673 | goto out; 674 | } 675 | if (pmfs_recover_journal(sb)) { 676 | retval = -EINVAL; 677 | printk(KERN_ERR "Journal recovery failed\n"); 678 | goto out; 679 | } 680 | 681 | if (pmfs_check_integrity(sb, super) == 0) { 682 | pmfs_dbg("Memory contains invalid pmfs %x:%x\n", 683 | le16_to_cpu(super->s_magic), PMFS_SUPER_MAGIC); 684 | goto out; 685 | } 686 | 687 | blocksize = le32_to_cpu(super->s_blocksize); 688 | pmfs_set_blocksize(sb, blocksize); 689 | 690 | pmfs_dbg_verbose("blocksize %lu\n", blocksize); 691 | 692 | /* Read the root inode */ 693 | root_pi = pmfs_get_inode(sb, PMFS_ROOT_INO); 694 | 695 | /* Check that the root inode is in a sane state */ 696 | pmfs_root_check(sb, root_pi); 697 | 698 | #ifdef CONFIG_PMFS_TEST 699 | if (!first_pmfs_super) 700 | first_pmfs_super = sbi->virt_addr; 701 | #endif 702 | 703 | /* Set it all up.. 
*/ 704 | setup_sb: 705 | sb->s_magic = le16_to_cpu(super->s_magic); 706 | sb->s_op = &pmfs_sops; 707 | sb->s_maxbytes = pmfs_max_size(sb->s_blocksize_bits); 708 | sb->s_time_gran = 1; 709 | sb->s_export_op = &pmfs_export_ops; 710 | sb->s_xattr = NULL; 711 | sb->s_flags |= MS_NOSEC; 712 | root_i = pmfs_iget(sb, PMFS_ROOT_INO); 713 | if (IS_ERR(root_i)) { 714 | retval = PTR_ERR(root_i); 715 | goto out; 716 | } 717 | 718 | sb->s_root = d_make_root(root_i); 719 | if (!sb->s_root) { 720 | printk(KERN_ERR "get pmfs root inode failed\n"); 721 | retval = -ENOMEM; 722 | goto out; 723 | } 724 | 725 | pmfs_recover_truncate_list(sb); 726 | /* If the FS was not formatted on this mount, scan the meta-data after 727 | * truncate list has been processed */ 728 | if ((sbi->s_mount_opt & PMFS_MOUNT_FORMAT) == 0) 729 | pmfs_setup_blocknode_map(sb); 730 | 731 | if (!(sb->s_flags & MS_RDONLY)) { 732 | u64 mnt_write_time; 733 | /* update mount time and write time atomically. */ 734 | mnt_write_time = (get_seconds() & 0xFFFFFFFF); 735 | mnt_write_time = mnt_write_time | (mnt_write_time << 32); 736 | 737 | pmfs_memunlock_range(sb, &super->s_mtime, 8); 738 | pmfs_memcpy_atomic(&super->s_mtime, &mnt_write_time, 8); 739 | pmfs_memlock_range(sb, &super->s_mtime, 8); 740 | 741 | pmfs_flush_buffer(&super->s_mtime, 8, false); 742 | PERSISTENT_MARK(); 743 | PERSISTENT_BARRIER(); 744 | } 745 | 746 | clear_opt(sbi->s_mount_opt, MOUNTING); 747 | retval = 0; 748 | return retval; 749 | out: 750 | kfree(sbi); 751 | return retval; 752 | } 753 | 754 | int pmfs_statfs(struct dentry *d, struct kstatfs *buf) 755 | { 756 | struct super_block *sb = d->d_sb; 757 | unsigned long count = 0; 758 | struct pmfs_sb_info *sbi = (struct pmfs_sb_info *)sb->s_fs_info; 759 | 760 | buf->f_type = PMFS_SUPER_MAGIC; 761 | buf->f_bsize = sb->s_blocksize; 762 | 763 | count = sbi->block_end; 764 | buf->f_blocks = sbi->block_end; 765 | buf->f_bfree = buf->f_bavail = pmfs_count_free_blocks(sb); 766 | buf->f_files = 
(sbi->s_inodes_count); 767 | buf->f_ffree = (sbi->s_free_inodes_count); 768 | buf->f_namelen = PMFS_NAME_LEN; 769 | pmfs_dbg_verbose("pmfs_stats: total 4k free blocks 0x%llx\n", 770 | buf->f_bfree); 771 | pmfs_dbg_verbose("total inodes 0x%x, free inodes 0x%x, " 772 | "blocknodes 0x%lx\n", (sbi->s_inodes_count), 773 | (sbi->s_free_inodes_count), (sbi->num_blocknode_allocated)); 774 | return 0; 775 | } 776 | 777 | static int pmfs_show_options(struct seq_file *seq, struct dentry *root) 778 | { 779 | struct pmfs_sb_info *sbi = PMFS_SB(root->d_sb); 780 | 781 | seq_printf(seq, ",physaddr=0x%016llx", (u64)sbi->phys_addr); 782 | if (sbi->initsize) 783 | seq_printf(seq, ",init=%luk", sbi->initsize >> 10); 784 | if (sbi->blocksize) 785 | seq_printf(seq, ",bs=%lu", sbi->blocksize); 786 | if (sbi->bpi) 787 | seq_printf(seq, ",bpi=%lu", sbi->bpi); 788 | if (sbi->num_inodes) 789 | seq_printf(seq, ",N=%lu", sbi->num_inodes); 790 | if (sbi->mode != (S_IRWXUGO | S_ISVTX)) 791 | seq_printf(seq, ",mode=%03o", sbi->mode); 792 | if (uid_valid(sbi->uid)) 793 | seq_printf(seq, ",uid=%u", from_kuid(&init_user_ns, sbi->uid)); 794 | if (gid_valid(sbi->gid)) 795 | seq_printf(seq, ",gid=%u", from_kgid(&init_user_ns, sbi->gid)); 796 | if (test_opt(root->d_sb, ERRORS_RO)) 797 | seq_puts(seq, ",errors=remount-ro"); 798 | if (test_opt(root->d_sb, ERRORS_PANIC)) 799 | seq_puts(seq, ",errors=panic"); 800 | /* memory protection disabled by default */ 801 | if (test_opt(root->d_sb, PROTECT)) 802 | seq_puts(seq, ",wprotect"); 803 | if (test_opt(root->d_sb, HUGEMMAP)) 804 | seq_puts(seq, ",hugemmap"); 805 | if (test_opt(root->d_sb, HUGEIOREMAP)) 806 | seq_puts(seq, ",hugeioremap"); 807 | /* xip not enabled by default */ 808 | if (test_opt(root->d_sb, XIP)) 809 | seq_puts(seq, ",xip"); 810 | 811 | return 0; 812 | } 813 | 814 | int pmfs_remount(struct super_block *sb, int *mntflags, char *data) 815 | { 816 | unsigned long old_sb_flags; 817 | unsigned long old_mount_opt; 818 | struct pmfs_super_block *ps; 
819 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 820 | int ret = -EINVAL; 821 | 822 | /* Store the old options */ 823 | mutex_lock(&sbi->s_lock); 824 | old_sb_flags = sb->s_flags; 825 | old_mount_opt = sbi->s_mount_opt; 826 | 827 | if (pmfs_parse_options(data, sbi, 1)) 828 | goto restore_opt; 829 | 830 | sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 831 | ((sbi->s_mount_opt & PMFS_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); 832 | 833 | if ((*mntflags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { 834 | u64 mnt_write_time; 835 | ps = pmfs_get_super(sb); 836 | /* update mount time and write time atomically. */ 837 | mnt_write_time = (get_seconds() & 0xFFFFFFFF); 838 | mnt_write_time = mnt_write_time | (mnt_write_time << 32); 839 | 840 | pmfs_memunlock_range(sb, &ps->s_mtime, 8); 841 | pmfs_memcpy_atomic(&ps->s_mtime, &mnt_write_time, 8); 842 | pmfs_memlock_range(sb, &ps->s_mtime, 8); 843 | 844 | pmfs_flush_buffer(&ps->s_mtime, 8, false); 845 | PERSISTENT_MARK(); 846 | PERSISTENT_BARRIER(); 847 | } 848 | 849 | mutex_unlock(&sbi->s_lock); 850 | ret = 0; 851 | return ret; 852 | 853 | restore_opt: 854 | sb->s_flags = old_sb_flags; 855 | sbi->s_mount_opt = old_mount_opt; 856 | mutex_unlock(&sbi->s_lock); 857 | return ret; 858 | } 859 | 860 | static void pmfs_put_super(struct super_block *sb) 861 | { 862 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 863 | struct pmfs_blocknode *i; 864 | struct list_head *head = &(sbi->block_inuse_head); 865 | 866 | #ifdef CONFIG_PMFS_TEST 867 | if (first_pmfs_super == sbi->virt_addr) 868 | first_pmfs_super = NULL; 869 | #endif 870 | 871 | /* It's unmount time, so unmap the pmfs memory */ 872 | if (sbi->virt_addr) { 873 | pmfs_save_blocknode_mappings(sb); 874 | pmfs_journal_uninit(sb); 875 | sbi->virt_addr = NULL; 876 | } 877 | 878 | /* Free all the pmfs_blocknodes */ 879 | while (!list_empty(head)) { 880 | i = list_first_entry(head, struct pmfs_blocknode, link); 881 | list_del(&i->link); 882 | pmfs_free_blocknode(sb, i); 883 | } 884 | sb->s_fs_info = NULL; 
885 | pmfs_dbgmask = 0; 886 | kfree(sbi); 887 | } 888 | 889 | inline void pmfs_free_transaction(pmfs_transaction_t *trans) 890 | { 891 | kmem_cache_free(pmfs_transaction_cachep, trans); 892 | } 893 | 894 | void __pmfs_free_blocknode(struct pmfs_blocknode *bnode) 895 | { 896 | kmem_cache_free(pmfs_blocknode_cachep, bnode); 897 | } 898 | 899 | void pmfs_free_blocknode(struct super_block *sb, struct pmfs_blocknode *bnode) 900 | { 901 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 902 | sbi->num_blocknode_allocated--; 903 | __pmfs_free_blocknode(bnode); 904 | } 905 | 906 | inline pmfs_transaction_t *pmfs_alloc_transaction(void) 907 | { 908 | return (pmfs_transaction_t *) 909 | kmem_cache_alloc(pmfs_transaction_cachep, GFP_NOFS); 910 | } 911 | 912 | struct pmfs_blocknode *pmfs_alloc_blocknode(struct super_block *sb) 913 | { 914 | struct pmfs_blocknode *p; 915 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 916 | p = (struct pmfs_blocknode *) 917 | kmem_cache_alloc(pmfs_blocknode_cachep, GFP_NOFS); 918 | if (p) { 919 | sbi->num_blocknode_allocated++; 920 | } 921 | return p; 922 | } 923 | 924 | static struct inode *pmfs_alloc_inode(struct super_block *sb) 925 | { 926 | struct pmfs_inode_info *vi; 927 | 928 | vi = kmem_cache_alloc(pmfs_inode_cachep, GFP_NOFS); 929 | if (!vi) 930 | return NULL; 931 | 932 | // vi->vfs_inode.i_version = 1; 933 | return &vi->vfs_inode; 934 | } 935 | 936 | static void pmfs_i_callback(struct rcu_head *head) 937 | { 938 | struct inode *inode = container_of(head, struct inode, i_rcu); 939 | 940 | kmem_cache_free(pmfs_inode_cachep, PMFS_I(inode)); 941 | } 942 | 943 | static void pmfs_destroy_inode(struct inode *inode) 944 | { 945 | call_rcu(&inode->i_rcu, pmfs_i_callback); 946 | } 947 | 948 | static void init_once(void *foo) 949 | { 950 | struct pmfs_inode_info *vi = foo; 951 | 952 | vi->i_dir_start_lookup = 0; 953 | INIT_LIST_HEAD(&vi->i_truncated); 954 | inode_init_once(&vi->vfs_inode); 955 | } 956 | 957 | 958 | static int __init init_blocknode_cache(void) 
959 | { 960 | pmfs_blocknode_cachep = kmem_cache_create("pmfs_blocknode_cache", 961 | sizeof(struct pmfs_blocknode), 962 | 0, (SLAB_RECLAIM_ACCOUNT | 963 | SLAB_MEM_SPREAD), NULL); 964 | if (pmfs_blocknode_cachep == NULL) 965 | return -ENOMEM; 966 | return 0; 967 | } 968 | 969 | 970 | static int __init init_inodecache(void) 971 | { 972 | pmfs_inode_cachep = kmem_cache_create("pmfs_inode_cache", 973 | sizeof(struct pmfs_inode_info), 974 | 0, (SLAB_RECLAIM_ACCOUNT | 975 | SLAB_MEM_SPREAD), init_once); 976 | if (pmfs_inode_cachep == NULL) 977 | return -ENOMEM; 978 | return 0; 979 | } 980 | 981 | static int __init init_transaction_cache(void) 982 | { 983 | pmfs_transaction_cachep = kmem_cache_create("pmfs_journal_transaction", 984 | sizeof(pmfs_transaction_t), 0, (SLAB_RECLAIM_ACCOUNT | 985 | SLAB_MEM_SPREAD), NULL); 986 | if (pmfs_transaction_cachep == NULL) { 987 | pmfs_dbg("PMFS: failed to init transaction cache\n"); 988 | return -ENOMEM; 989 | } 990 | return 0; 991 | } 992 | 993 | static void destroy_transaction_cache(void) 994 | { 995 | if (pmfs_transaction_cachep) 996 | kmem_cache_destroy(pmfs_transaction_cachep); 997 | pmfs_transaction_cachep = NULL; 998 | } 999 | 1000 | static void destroy_inodecache(void) 1001 | { 1002 | /* 1003 | * Make sure all delayed rcu free inodes are flushed before 1004 | * we destroy cache. 1005 | */ 1006 | rcu_barrier(); 1007 | kmem_cache_destroy(pmfs_inode_cachep); 1008 | } 1009 | 1010 | static void destroy_blocknode_cache(void) 1011 | { 1012 | kmem_cache_destroy(pmfs_blocknode_cachep); 1013 | } 1014 | 1015 | /* 1016 | * the super block writes are all done "on the fly", so the 1017 | * super block is never in a "dirty" state, so there's no need 1018 | * for write_super. 
1019 | */ 1020 | static struct super_operations pmfs_sops = { 1021 | .alloc_inode = pmfs_alloc_inode, 1022 | .destroy_inode = pmfs_destroy_inode, 1023 | .write_inode = pmfs_write_inode, 1024 | .dirty_inode = pmfs_dirty_inode, 1025 | .evict_inode = pmfs_evict_inode, 1026 | .put_super = pmfs_put_super, 1027 | .statfs = pmfs_statfs, 1028 | .remount_fs = pmfs_remount, 1029 | .show_options = pmfs_show_options, 1030 | }; 1031 | 1032 | static struct dentry *pmfs_mount(struct file_system_type *fs_type, 1033 | int flags, const char *dev_name, void *data) 1034 | { 1035 | return mount_bdev(fs_type, flags, dev_name, data, pmfs_fill_super); 1036 | } 1037 | 1038 | static struct file_system_type pmfs_fs_type = { 1039 | .owner = THIS_MODULE, 1040 | .name = "pmfs", 1041 | .mount = pmfs_mount, 1042 | .kill_sb = kill_block_super, 1043 | }; 1044 | 1045 | static struct inode *pmfs_nfs_get_inode(struct super_block *sb, 1046 | u64 ino, u32 generation) 1047 | { 1048 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 1049 | struct inode *inode; 1050 | 1051 | if (ino < PMFS_ROOT_INO) 1052 | return ERR_PTR(-ESTALE); 1053 | 1054 | if ((ino >> PMFS_INODE_BITS) > (sbi->s_inodes_count)) 1055 | return ERR_PTR(-ESTALE); 1056 | 1057 | inode = pmfs_iget(sb, ino); 1058 | if (IS_ERR(inode)) 1059 | return ERR_CAST(inode); 1060 | 1061 | if (generation && inode->i_generation != generation) { 1062 | /* we didn't find the right inode.. 
*/ 1063 | iput(inode); 1064 | return ERR_PTR(-ESTALE); 1065 | } 1066 | 1067 | return inode; 1068 | } 1069 | 1070 | static struct dentry *pmfs_fh_to_dentry(struct super_block *sb, 1071 | struct fid *fid, int fh_len, 1072 | int fh_type) 1073 | { 1074 | return generic_fh_to_dentry(sb, fid, fh_len, fh_type, 1075 | pmfs_nfs_get_inode); 1076 | } 1077 | 1078 | static struct dentry *pmfs_fh_to_parent(struct super_block *sb, 1079 | struct fid *fid, int fh_len, 1080 | int fh_type) 1081 | { 1082 | return generic_fh_to_parent(sb, fid, fh_len, fh_type, 1083 | pmfs_nfs_get_inode); 1084 | } 1085 | 1086 | static const struct export_operations pmfs_export_ops = { 1087 | .fh_to_dentry = pmfs_fh_to_dentry, 1088 | .fh_to_parent = pmfs_fh_to_parent, 1089 | .get_parent = pmfs_get_parent, 1090 | }; 1091 | 1092 | static int __init init_pmfs_fs(void) 1093 | { 1094 | int rc = 0; 1095 | 1096 | rc = init_blocknode_cache(); 1097 | if (rc) 1098 | return rc; 1099 | 1100 | rc = init_transaction_cache(); 1101 | if (rc) 1102 | goto out1; 1103 | 1104 | rc = init_inodecache(); 1105 | if (rc) 1106 | goto out2; 1107 | 1108 | rc = register_filesystem(&pmfs_fs_type); 1109 | if (rc) 1110 | goto out3; 1111 | 1112 | return 0; 1113 | 1114 | out3: 1115 | destroy_inodecache(); 1116 | out2: 1117 | destroy_transaction_cache(); 1118 | out1: 1119 | destroy_blocknode_cache(); 1120 | return rc; 1121 | } 1122 | 1123 | static void __exit exit_pmfs_fs(void) 1124 | { 1125 | unregister_filesystem(&pmfs_fs_type); 1126 | destroy_inodecache(); 1127 | destroy_blocknode_cache(); 1128 | destroy_transaction_cache(); 1129 | } 1130 | 1131 | MODULE_AUTHOR("Intel Corporation "); 1132 | MODULE_DESCRIPTION("Persistent Memory File System"); 1133 | MODULE_LICENSE("GPL"); 1134 | 1135 | module_init(init_pmfs_fs) 1136 | module_exit(exit_pmfs_fs) 1137 | --------------------------------------------------------------------------------