├── setup-pmfs.sh ├── remount-pmfs.sh ├── intel-setup-pmfs.sh ├── intel-remount-pmfs.sh ├── Makefile ├── xip.h ├── Kconfig ├── pmfs_test.c ├── pmfs_stats.c ├── wprotect.c ├── README.md ├── symlink.c ├── journal.h ├── wprotect.h ├── ioctl.c ├── balloc.c ├── dir.c ├── file.c ├── pmfs_def.h ├── bbuild.c ├── xip.c ├── pmfs.h ├── namei.c ├── journal.c └── super.c /setup-pmfs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | umount /mnt/ramdisk 4 | umount /mnt/scratch 5 | rmmod pmfs 6 | insmod pmfs.ko measure_timing=0 7 | 8 | sleep 1 9 | 10 | mount -t pmfs -o init /dev/pmem0 /mnt/ramdisk 11 | mount -t pmfs -o init /dev/pmem1 /mnt/scratch 12 | 13 | -------------------------------------------------------------------------------- /remount-pmfs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | umount /mnt/ramdisk 4 | rmmod pmfs 5 | insmod pmfs.ko measure_timing=0 6 | 7 | sleep 1 8 | 9 | mount -t pmfs /dev/pmem0 /mnt/ramdisk 10 | 11 | #cp test1 /mnt/ramdisk/ 12 | #dd if=/dev/zero of=/mnt/ramdisk/test1 bs=1M count=1024 oflag=direct 13 | -------------------------------------------------------------------------------- /intel-setup-pmfs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | umount /mnt/ramdisk 4 | rmmod pmfs 5 | rmmod pmem 6 | insmod pmfs.ko measure_timing=0 7 | 8 | sleep 1 9 | 10 | mount -t pmfs -o physaddr=0x10000000000,init=64G none /mnt/ramdisk 11 | 12 | #cp test1 /mnt/ramdisk/ 13 | #dd if=/dev/zero of=/mnt/ramdisk/test1 bs=1M count=1024 oflag=direct 14 | -------------------------------------------------------------------------------- /intel-remount-pmfs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | umount /mnt/ramdisk 4 | rmmod pmfs 5 | insmod pmfs.ko measure_timing=0 6 | 7 | sleep 1 8 | 9 | #mount -t pmfs -o physaddr=0x100000000 none 
/mnt/ramdisk 10 | mount -t pmfs -o physaddr=0x10000000000 none /mnt/ramdisk 11 | 12 | #cp test1 /mnt/ramdisk/ 13 | #dd if=/dev/zero of=/mnt/ramdisk/test1 bs=1M count=1024 oflag=direct 14 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Makefile for the linux pmfs-filesystem routines. 3 | # 4 | 5 | obj-m += pmfs.o 6 | 7 | pmfs-y := bbuild.o balloc.o dir.o file.o inode.o namei.o super.o symlink.o ioctl.o pmfs_stats.o journal.o xip.o wprotect.o 8 | 9 | all: 10 | make -C /lib/modules/$(shell uname -r)/build M=`pwd` 11 | 12 | clean: 13 | make -C /lib/modules/$(shell uname -r)/build M=`pwd` clean 14 | -------------------------------------------------------------------------------- /xip.h: -------------------------------------------------------------------------------- 1 | /* 2 | * BRIEF DESCRIPTION 3 | * 4 | * XIP operations. 5 | * 6 | * Copyright 2012-2013 Intel Corporation 7 | * Copyright 2009-2011 Marco Stornelli 8 | * This file is licensed under the terms of the GNU General Public 9 | * License version 2. This program is licensed "as is" without any 10 | * warranty of any kind, whether express or implied. 
11 | */ 12 | 13 | int pmfs_get_xip_mem(struct address_space *, pgoff_t, int, void **, 14 | unsigned long *); 15 | ssize_t pmfs_xip_file_read(struct file *filp, char __user *buf, size_t len, 16 | loff_t *ppos); 17 | ssize_t pmfs_xip_file_write(struct file *filp, const char __user *buf, 18 | size_t len, loff_t *ppos); 19 | int pmfs_xip_file_mmap(struct file *file, struct vm_area_struct *vma); 20 | 21 | static inline int pmfs_use_xip(struct super_block *sb) 22 | { 23 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 24 | 25 | return sbi->s_mount_opt & PMFS_MOUNT_XIP; 26 | } 27 | 28 | #define mapping_is_xip(map) (map->a_ops->get_xip_mem) 29 | -------------------------------------------------------------------------------- /Kconfig: -------------------------------------------------------------------------------- 1 | config PMFS 2 | tristate "Persistent and Protected PM file system support" 3 | depends on HAS_IOMEM 4 | select CRC16 5 | help 6 | If your system has a block of fast (comparable in access speed to 7 | system memory) and non-volatile byte-addressable memory and you wish to 8 | mount a light-weight, full-featured, and space-efficient filesystem over 9 | it, say Y here, and read . 10 | 11 | To compile this as a module, choose M here: the module will be 12 | called pmfs. 13 | 14 | config PMFS_XIP 15 | bool "Execute-in-place in PMFS" 16 | depends on PMFS && BLOCK 17 | help 18 | Say Y here to enable XIP feature of PMFS. 19 | 20 | config PMFS_WRITE_PROTECT 21 | bool "PMFS write protection" 22 | depends on PMFS && MMU && HAVE_SET_MEMORY_RO 23 | default y 24 | help 25 | Say Y here to enable the write protect feature of PMFS. 26 | 27 | config PMFS_TEST 28 | boolean 29 | depends on PMFS 30 | 31 | config PMFS_TEST_MODULE 32 | tristate "PMFS Test" 33 | depends on PMFS && PMFS_WRITE_PROTECT && m 34 | select PMFS_TEST 35 | help 36 | Say Y here to build a simple module to test the protection of 37 | PMFS. The module will be called pmfs_test. 
38 | -------------------------------------------------------------------------------- /pmfs_test.c: -------------------------------------------------------------------------------- 1 | /* 2 | * BRIEF DESCRIPTION 3 | * 4 | * pmfs test module. 5 | * 6 | * Copyright 2012-2013 Intel Corporation 7 | * Copyright 2009-2011 Marco Stornelli 8 | * Copyright 2003 Sony Corporation 9 | * Copyright 2003 Matsushita Electric Industrial Co., Ltd. 10 | * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam 11 | * This file is licensed under the terms of the GNU General Public 12 | * License version 2. This program is licensed "as is" without any 13 | * warranty of any kind, whether express or implied. 14 | */ 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include "pmfs.h" 20 | 21 | int __init test_pmfs_write(void) 22 | { 23 | struct pmfs_super_block *psb; 24 | 25 | psb = get_pmfs_super(); 26 | if (!psb) { 27 | printk(KERN_ERR 28 | "%s: PMFS super block not found (not mounted?)\n", 29 | __func__); 30 | return 1; 31 | } 32 | 33 | /* 34 | * Attempt an unprotected clear of checksum information in the 35 | * superblock, this should cause a kernel page protection fault. 
36 | */ 37 | printk("%s: writing to kernel VA %p\n", __func__, psb); 38 | psb->s_sum = 0; 39 | 40 | return 0; 41 | } 42 | 43 | void test_pmfs_write_cleanup(void) 44 | { 45 | } 46 | 47 | /* Module information */ 48 | MODULE_LICENSE("GPL"); 49 | module_init(test_pmfs_write); 50 | module_exit(test_pmfs_write_cleanup); 51 | -------------------------------------------------------------------------------- /pmfs_stats.c: -------------------------------------------------------------------------------- 1 | #include "pmfs.h" 2 | 3 | const char *Timingstring[TIMING_NUM] = 4 | { 5 | "create", 6 | "unlink", 7 | "readdir", 8 | "xip_read", 9 | "xip_write", 10 | "xip_write_fast", 11 | "internal_write", 12 | "memcpy_read", 13 | "memcpy_write", 14 | "alloc_blocks", 15 | "new_trans", 16 | "add_logentry", 17 | "commit_trans", 18 | "mmap_fault", 19 | "fsync", 20 | "free_tree", 21 | "evict_inode", 22 | "recovery", 23 | }; 24 | 25 | unsigned long long Timingstats[TIMING_NUM]; 26 | u64 Countstats[TIMING_NUM]; 27 | 28 | atomic64_t fsync_pages = ATOMIC_INIT(0); 29 | 30 | void pmfs_print_IO_stats(void) 31 | { 32 | printk("=========== PMFS I/O stats ===========\n"); 33 | printk("Fsync %ld pages\n", atomic64_read(&fsync_pages)); 34 | } 35 | 36 | void pmfs_print_timing_stats(void) 37 | { 38 | int i; 39 | 40 | printk("======== PMFS kernel timing stats ========\n"); 41 | for (i = 0; i < TIMING_NUM; i++) { 42 | if (measure_timing || Timingstats[i]) { 43 | printk("%s: count %llu, timing %llu, average %llu\n", 44 | Timingstring[i], 45 | Countstats[i], 46 | Timingstats[i], 47 | Countstats[i] ? 
48 | Timingstats[i] / Countstats[i] : 0); 49 | } else { 50 | printk("%s: count %llu\n", 51 | Timingstring[i], 52 | Countstats[i]); 53 | } 54 | } 55 | 56 | pmfs_print_IO_stats(); 57 | } 58 | 59 | void pmfs_clear_stats(void) 60 | { 61 | int i; 62 | 63 | printk("======== Clear PMFS kernel timing stats ========\n"); 64 | for (i = 0; i < TIMING_NUM; i++) { 65 | Countstats[i] = 0; 66 | Timingstats[i] = 0; 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /wprotect.c: -------------------------------------------------------------------------------- 1 | /* 2 | * BRIEF DESCRIPTION 3 | * 4 | * Write protection for the filesystem pages. 5 | * 6 | * Copyright 2012-2013 Intel Corporation 7 | * Copyright 2009-2011 Marco Stornelli 8 | * Copyright 2003 Sony Corporation 9 | * Copyright 2003 Matsushita Electric Industrial Co., Ltd. 10 | * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam 11 | * This file is licensed under the terms of the GNU General Public 12 | * License version 2. This program is licensed "as is" without any 13 | * warranty of any kind, whether express or implied. 14 | */ 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include "pmfs.h" 21 | 22 | static inline void wprotect_disable(void) 23 | { 24 | unsigned long cr0_val; 25 | 26 | cr0_val = read_cr0(); 27 | cr0_val &= (~X86_CR0_WP); 28 | write_cr0(cr0_val); 29 | } 30 | 31 | static inline void wprotect_enable(void) 32 | { 33 | unsigned long cr0_val; 34 | 35 | cr0_val = read_cr0(); 36 | cr0_val |= X86_CR0_WP; 37 | write_cr0(cr0_val); 38 | } 39 | 40 | /* FIXME: Assumes that we are always called in the right order. 
41 | * pmfs_writeable(vaddr, size, 1); 42 | * pmfs_writeable(vaddr, size, 0); 43 | */ 44 | int pmfs_writeable(void *vaddr, unsigned long size, int rw) 45 | { 46 | static unsigned long flags; 47 | if (rw) { 48 | local_irq_save(flags); 49 | wprotect_disable(); 50 | } else { 51 | wprotect_enable(); 52 | local_irq_restore(flags); 53 | } 54 | return 0; 55 | } 56 | 57 | int pmfs_xip_mem_protect(struct super_block *sb, void *vaddr, 58 | unsigned long size, int rw) 59 | { 60 | if (!pmfs_is_wprotected(sb)) 61 | return 0; 62 | return pmfs_writeable(vaddr, size, rw); 63 | } 64 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Porting PMFS to the latest Linux kernel 2 | 3 | ## Introduction 4 | 5 | PMFS is a file system for persistent memory, developed by Intel. 6 | For more details about PMFS, please check the git repository: 7 | 8 | https://github.com/linux-pmfs/pmfs 9 | 10 | This project ports PMFS to the latest Linux kernel so developers can compare PMFS to other file systems on the new kernel. 11 | 12 | ## Building PMFS 13 | The master branch works on the 4.15 version of x86-64 Linux kernel. 14 | 15 | To build PMFS, simply run a 16 | 17 | ~~~ 18 | #make 19 | ~~~ 20 | 21 | command. 22 | 23 | ## Running PMFS 24 | PMFS runs on a physically contiguous memory region that is not used by the Linux kernel, and relies on the kernel NVDIMM support. 25 | 26 | To run PMFS, first build up your kernel with NVDIMM support enabled (`CONFIG_BLK_DEV_PMEM`), and then you can 27 | reserve the memory space by booting the kernel with `memmap` command line option. 28 | 29 | For instance, adding `memmap=16G!8G` to the kernel boot parameters will reserve 16GB memory starting from 8GB address, and the kernel will create a `pmem0` block device under the `/dev` directory. 
30 | 31 | After the OS has booted, you can initialize a PMFS instance with the following commands: 32 | 33 | 34 | ~~~ 35 | #insmod pmfs.ko 36 | #mount -t pmfs -o init /dev/pmem0 /mnt/ramdisk 37 | ~~~ 38 | 39 | The above commands create a PMFS instance on pmem0 device, and mount on `/mnt/ramdisk`. 40 | 41 | To recover an existing PMFS instance, mount PMFS without the init option, for example: 42 | 43 | ~~~ 44 | #mount -t pmfs /dev/pmem0 /mnt/ramdisk 45 | ~~~ 46 | 47 | There are two scripts provided in the source code, `setup-pmfs.sh` and `remount-pmfs.sh` to help setup PMFS. 48 | 49 | ## Current limitations 50 | 51 | * PMFS only works on x86-64 kernels. 52 | * PMFS does not currently support extended attributes or ACL. 53 | * PMFS requires the underlying block device to support DAX (Direct Access) feature. 54 | * This project cuts some features of the original PMFS, such as memory protection and huge mmap support. If you need these features, please turn to the original PMFS. 55 | -------------------------------------------------------------------------------- /symlink.c: -------------------------------------------------------------------------------- 1 | /* 2 | * BRIEF DESCRIPTION 3 | * 4 | * Symlink operations 5 | * 6 | * Copyright 2012-2013 Intel Corporation 7 | * Copyright 2009-2011 Marco Stornelli 8 | * Copyright 2003 Sony Corporation 9 | * Copyright 2003 Matsushita Electric Industrial Co., Ltd. 10 | * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam 11 | * This file is licensed under the terms of the GNU General Public 12 | * License version 2. This program is licensed "as is" without any 13 | * warranty of any kind, whether express or implied. 
14 | */ 15 | 16 | #include 17 | #include 18 | #include "pmfs.h" 19 | 20 | int pmfs_block_symlink(struct inode *inode, const char *symname, int len) 21 | { 22 | struct super_block *sb = inode->i_sb; 23 | u64 block; 24 | char *blockp; 25 | int err; 26 | 27 | err = pmfs_alloc_blocks(NULL, inode, 0, 1, false); 28 | if (err) 29 | return err; 30 | 31 | block = pmfs_find_data_block(inode, 0); 32 | blockp = pmfs_get_block(sb, block); 33 | 34 | pmfs_memunlock_block(sb, blockp); 35 | memcpy(blockp, symname, len); 36 | blockp[len] = '\0'; 37 | pmfs_memlock_block(sb, blockp); 38 | pmfs_flush_buffer(blockp, len+1, false); 39 | return 0; 40 | } 41 | 42 | /* FIXME: Temporary workaround */ 43 | static int pmfs_readlink_copy(char __user *buffer, int buflen, const char *link) 44 | { 45 | int len = PTR_ERR(link); 46 | if (IS_ERR(link)) 47 | goto out; 48 | 49 | len = strlen(link); 50 | if (len > (unsigned) buflen) 51 | len = buflen; 52 | if (copy_to_user(buffer, link, len)) 53 | len = -EFAULT; 54 | out: 55 | return len; 56 | } 57 | 58 | static int pmfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) 59 | { 60 | struct inode *inode = dentry->d_inode; 61 | struct super_block *sb = inode->i_sb; 62 | u64 block; 63 | char *blockp; 64 | 65 | block = pmfs_find_data_block(inode, 0); 66 | blockp = pmfs_get_block(sb, block); 67 | return pmfs_readlink_copy(buffer, buflen, blockp); 68 | } 69 | 70 | static const char *pmfs_get_link(struct dentry *dentry, struct inode *inode, 71 | struct delayed_call *done) 72 | { 73 | struct super_block *sb = inode->i_sb; 74 | off_t block; 75 | char *blockp; 76 | 77 | block = pmfs_find_data_block(inode, 0); 78 | blockp = pmfs_get_block(sb, block); 79 | return blockp; 80 | } 81 | 82 | const struct inode_operations pmfs_symlink_inode_operations = { 83 | .readlink = pmfs_readlink, 84 | .get_link = pmfs_get_link, 85 | .setattr = pmfs_notify_change, 86 | }; 87 | -------------------------------------------------------------------------------- 
/journal.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Persistent Memory File System 3 | * Copyright (c) 2012-2013, Intel Corporation. 4 | * 5 | * This program is free software; you can redistribute it and/or modify it 6 | * under the terms and conditions of the GNU General Public License, 7 | * version 2, as published by the Free Software Foundation. 8 | * 9 | * This program is distributed in the hope it will be useful, but WITHOUT 10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12 | * more details. 13 | * 14 | * You should have received a copy of the GNU General Public License along with 15 | * this program; if not, write to the Free Software Foundation, Inc., 16 | * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 17 | */ 18 | #ifndef __PMFS_JOURNAL_H__ 19 | #define __PMFS_JOURNAL_H__ 20 | #include 21 | 22 | /* default pmfs journal size 4MB */ 23 | #define PMFS_DEFAULT_JOURNAL_SIZE (4 << 20) 24 | /* minimum pmfs journal size 64KB */ 25 | #define PMFS_MINIMUM_JOURNAL_SIZE (1 << 16) 26 | 27 | #define CACHELINE_SIZE (64) 28 | #define CLINE_SHIFT (6) 29 | #define CACHELINE_MASK (~(CACHELINE_SIZE - 1)) 30 | #define CACHELINE_ALIGN(addr) (((addr)+CACHELINE_SIZE-1) & CACHELINE_MASK) 31 | 32 | #define LOGENTRY_SIZE CACHELINE_SIZE 33 | #define LESIZE_SHIFT CLINE_SHIFT 34 | 35 | #define MAX_INODE_LENTRIES (2) 36 | #define MAX_SB_LENTRIES (2) 37 | /* 1 le for dir entry and 1 le for potentially allocating a new dir block */ 38 | #define MAX_DIRENTRY_LENTRIES (2) 39 | /* 2 le for adding or removing the inode from truncate list. 
used to log 40 | * potential changes to inode table's i_next_truncate and i_sum */ 41 | #define MAX_TRUNCATE_LENTRIES (2) 42 | #define MAX_DATA_PER_LENTRY 48 43 | /* blocksize * max_btree_height */ 44 | #define MAX_METABLOCK_LENTRIES \ 45 | ((PMFS_DEF_BLOCK_SIZE_4K * 3)/MAX_DATA_PER_LENTRY) 46 | 47 | #define MAX_PTRS_PER_LENTRY (MAX_DATA_PER_LENTRY / sizeof(u64)) 48 | 49 | #define TRANS_RUNNING 1 50 | #define TRANS_COMMITTED 2 51 | #define TRANS_ABORTED 3 52 | 53 | #define LE_DATA 0 54 | #define LE_START 1 55 | #define LE_COMMIT 2 56 | #define LE_ABORT 4 57 | 58 | #define MAX_GEN_ID ((uint16_t)-1) 59 | 60 | /* persistent data structure to describe a single log-entry */ 61 | /* every log entry is max CACHELINE_SIZE bytes in size */ 62 | typedef struct { 63 | __le64 addr_offset; 64 | __le32 transaction_id; 65 | __le16 gen_id; 66 | u8 type; /* normal, commit, or abort */ 67 | u8 size; 68 | char data[48]; 69 | } pmfs_logentry_t; 70 | 71 | /* volatile data structure to describe a transaction */ 72 | typedef struct pmfs_transaction { 73 | u32 transaction_id; 74 | u16 num_entries; 75 | u16 num_used; 76 | u16 gen_id; 77 | u16 status; 78 | pmfs_journal_t *t_journal; 79 | pmfs_logentry_t *start_addr; 80 | struct pmfs_transaction *parent; 81 | } pmfs_transaction_t; 82 | 83 | extern pmfs_transaction_t *pmfs_alloc_transaction(void); 84 | extern void pmfs_free_transaction(pmfs_transaction_t *trans); 85 | 86 | extern int pmfs_journal_soft_init(struct super_block *sb); 87 | extern int pmfs_journal_hard_init(struct super_block *sb, 88 | uint64_t base, uint32_t size); 89 | extern int pmfs_journal_uninit(struct super_block *sb); 90 | extern pmfs_transaction_t *pmfs_new_transaction(struct super_block *sb, 91 | int nclines); 92 | extern pmfs_transaction_t *pmfs_current_transaction(void); 93 | extern int pmfs_add_logentry(struct super_block *sb, 94 | pmfs_transaction_t *trans, void *addr, uint16_t size, u8 type); 95 | extern int pmfs_commit_transaction(struct super_block *sb, 96 | 
pmfs_transaction_t *trans); 97 | extern int pmfs_abort_transaction(struct super_block *sb, 98 | pmfs_transaction_t *trans); 99 | extern int pmfs_recover_journal(struct super_block *sb); 100 | 101 | #endif /* __PMFS_JOURNAL_H__ */ 102 | -------------------------------------------------------------------------------- /wprotect.h: -------------------------------------------------------------------------------- 1 | /* 2 | * BRIEF DESCRIPTION 3 | * 4 | * Memory protection definitions for the PMFS filesystem. 5 | * 6 | * Copyright 2012-2013 Intel Corporation 7 | * Copyright 2010-2011 Marco Stornelli 8 | * This file is licensed under the terms of the GNU General Public 9 | * License version 2. This program is licensed "as is" without any 10 | * warranty of any kind, whether express or implied. 11 | */ 12 | 13 | #ifndef __WPROTECT_H 14 | #define __WPROTECT_H 15 | 16 | #include 17 | #include "pmfs_def.h" 18 | 19 | /* pmfs_memunlock_super() before calling! */ 20 | static inline void pmfs_sync_super(struct pmfs_super_block *ps) 21 | { 22 | u16 crc = 0; 23 | 24 | ps->s_wtime = cpu_to_le32(get_seconds()); 25 | ps->s_sum = 0; 26 | crc = crc16(~0, (__u8 *)ps + sizeof(__le16), 27 | PMFS_SB_STATIC_SIZE(ps) - sizeof(__le16)); 28 | ps->s_sum = cpu_to_le16(crc); 29 | /* Keep sync redundant super block */ 30 | memcpy((void *)ps + PMFS_SB_SIZE, (void *)ps, 31 | sizeof(struct pmfs_super_block)); 32 | } 33 | 34 | #if 0 35 | /* pmfs_memunlock_inode() before calling! 
*/ 36 | static inline void pmfs_sync_inode(struct pmfs_inode *pi) 37 | { 38 | u16 crc = 0; 39 | 40 | pi->i_sum = 0; 41 | crc = crc16(~0, (__u8 *)pi + sizeof(__le16), PMFS_INODE_SIZE - 42 | sizeof(__le16)); 43 | pi->i_sum = cpu_to_le16(crc); 44 | } 45 | #endif 46 | 47 | extern int pmfs_writeable(void *vaddr, unsigned long size, int rw); 48 | extern int pmfs_xip_mem_protect(struct super_block *sb, 49 | void *vaddr, unsigned long size, int rw); 50 | 51 | static inline int pmfs_is_protected(struct super_block *sb) 52 | { 53 | struct pmfs_sb_info *sbi = (struct pmfs_sb_info *)sb->s_fs_info; 54 | 55 | return sbi->s_mount_opt & PMFS_MOUNT_PROTECT; 56 | } 57 | 58 | static inline int pmfs_is_wprotected(struct super_block *sb) 59 | { 60 | return pmfs_is_protected(sb); 61 | } 62 | 63 | static inline void 64 | __pmfs_memunlock_range(void *p, unsigned long len) 65 | { 66 | /* 67 | * NOTE: Ideally we should lock all the kernel to be memory safe 68 | * and avoid to write in the protected memory, 69 | * obviously it's not possible, so we only serialize 70 | * the operations at fs level. We can't disable the interrupts 71 | * because we could have a deadlock in this path. 
72 | */ 73 | pmfs_writeable(p, len, 1); 74 | } 75 | 76 | static inline void 77 | __pmfs_memlock_range(void *p, unsigned long len) 78 | { 79 | pmfs_writeable(p, len, 0); 80 | } 81 | 82 | static inline void pmfs_memunlock_range(struct super_block *sb, void *p, 83 | unsigned long len) 84 | { 85 | if (pmfs_is_protected(sb)) 86 | __pmfs_memunlock_range(p, len); 87 | } 88 | 89 | static inline void pmfs_memlock_range(struct super_block *sb, void *p, 90 | unsigned long len) 91 | { 92 | if (pmfs_is_protected(sb)) 93 | __pmfs_memlock_range(p, len); 94 | } 95 | 96 | static inline void pmfs_memunlock_super(struct super_block *sb, 97 | struct pmfs_super_block *ps) 98 | { 99 | if (pmfs_is_protected(sb)) 100 | __pmfs_memunlock_range(ps, PMFS_SB_SIZE); 101 | } 102 | 103 | static inline void pmfs_memlock_super(struct super_block *sb, 104 | struct pmfs_super_block *ps) 105 | { 106 | pmfs_sync_super(ps); 107 | if (pmfs_is_protected(sb)) 108 | __pmfs_memlock_range(ps, PMFS_SB_SIZE); 109 | } 110 | 111 | static inline void pmfs_memunlock_inode(struct super_block *sb, 112 | struct pmfs_inode *pi) 113 | { 114 | if (pmfs_is_protected(sb)) 115 | __pmfs_memunlock_range(pi, PMFS_SB_SIZE); 116 | } 117 | 118 | static inline void pmfs_memlock_inode(struct super_block *sb, 119 | struct pmfs_inode *pi) 120 | { 121 | /* pmfs_sync_inode(pi); */ 122 | if (pmfs_is_protected(sb)) 123 | __pmfs_memlock_range(pi, PMFS_SB_SIZE); 124 | } 125 | 126 | static inline void pmfs_memunlock_block(struct super_block *sb, void *bp) 127 | { 128 | if (pmfs_is_protected(sb)) 129 | __pmfs_memunlock_range(bp, sb->s_blocksize); 130 | } 131 | 132 | static inline void pmfs_memlock_block(struct super_block *sb, void *bp) 133 | { 134 | if (pmfs_is_protected(sb)) 135 | __pmfs_memlock_range(bp, sb->s_blocksize); 136 | } 137 | 138 | #endif 139 | -------------------------------------------------------------------------------- /ioctl.c: -------------------------------------------------------------------------------- 1 | /* 2 | * 
BRIEF DESCRIPTION 3 | * 4 | * Ioctl operations. 5 | * 6 | * Copyright 2012-2013 Intel Corporation 7 | * Copyright 2010-2011 Marco Stornelli 8 | * 9 | * This file is licensed under the terms of the GNU General Public 10 | * License version 2. This program is licensed "as is" without any 11 | * warranty of any kind, whether express or implied. 12 | */ 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include "pmfs.h" 20 | 21 | #define FS_PMFS_FSYNC 0xBCD0000E 22 | 23 | struct sync_range 24 | { 25 | off_t offset; 26 | size_t length; 27 | }; 28 | 29 | long pmfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 30 | { 31 | struct address_space *mapping = filp->f_mapping; 32 | struct inode *inode = mapping->host; 33 | struct pmfs_inode *pi; 34 | struct super_block *sb = inode->i_sb; 35 | unsigned int flags; 36 | int ret; 37 | pmfs_transaction_t *trans; 38 | 39 | pi = pmfs_get_inode(sb, inode->i_ino); 40 | if (!pi) 41 | return -EACCES; 42 | 43 | switch (cmd) { 44 | case FS_IOC_GETFLAGS: 45 | flags = le32_to_cpu(pi->i_flags) & PMFS_FL_USER_VISIBLE; 46 | return put_user(flags, (int __user *)arg); 47 | case FS_IOC_SETFLAGS: { 48 | unsigned int oldflags; 49 | 50 | ret = mnt_want_write_file(filp); 51 | if (ret) 52 | return ret; 53 | 54 | if (!inode_owner_or_capable(inode)) { 55 | ret = -EPERM; 56 | goto flags_out; 57 | } 58 | 59 | if (get_user(flags, (int __user *)arg)) { 60 | ret = -EFAULT; 61 | goto flags_out; 62 | } 63 | 64 | inode_lock(inode); 65 | oldflags = le32_to_cpu(pi->i_flags); 66 | 67 | if ((flags ^ oldflags) & 68 | (FS_APPEND_FL | FS_IMMUTABLE_FL)) { 69 | if (!capable(CAP_LINUX_IMMUTABLE)) { 70 | inode_unlock(inode); 71 | ret = -EPERM; 72 | goto flags_out; 73 | } 74 | } 75 | 76 | if (!S_ISDIR(inode->i_mode)) 77 | flags &= ~FS_DIRSYNC_FL; 78 | 79 | flags = flags & FS_FL_USER_MODIFIABLE; 80 | flags |= oldflags & ~FS_FL_USER_MODIFIABLE; 81 | inode->i_ctime = current_time(inode); 82 | trans = pmfs_new_transaction(sb, 
MAX_INODE_LENTRIES); 83 | if (IS_ERR(trans)) { 84 | ret = PTR_ERR(trans); 85 | goto out; 86 | } 87 | pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA); 88 | pmfs_memunlock_inode(sb, pi); 89 | pi->i_flags = cpu_to_le32(flags); 90 | pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); 91 | pmfs_set_inode_flags(inode, pi); 92 | pmfs_memlock_inode(sb, pi); 93 | pmfs_commit_transaction(sb, trans); 94 | out: 95 | inode_unlock(inode); 96 | flags_out: 97 | mnt_drop_write_file(filp); 98 | return ret; 99 | } 100 | case FS_IOC_GETVERSION: 101 | return put_user(inode->i_generation, (int __user *)arg); 102 | case FS_IOC_SETVERSION: { 103 | __u32 generation; 104 | if (!inode_owner_or_capable(inode)) 105 | return -EPERM; 106 | ret = mnt_want_write_file(filp); 107 | if (ret) 108 | return ret; 109 | if (get_user(generation, (int __user *)arg)) { 110 | ret = -EFAULT; 111 | goto setversion_out; 112 | } 113 | inode_lock(inode); 114 | trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES); 115 | if (IS_ERR(trans)) { 116 | ret = PTR_ERR(trans); 117 | goto out; 118 | } 119 | pmfs_add_logentry(sb, trans, pi, sizeof(*pi), LE_DATA); 120 | inode->i_ctime = current_time(inode); 121 | inode->i_generation = generation; 122 | pmfs_memunlock_inode(sb, pi); 123 | pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); 124 | pi->i_generation = cpu_to_le32(inode->i_generation); 125 | pmfs_memlock_inode(sb, pi); 126 | pmfs_commit_transaction(sb, trans); 127 | inode_unlock(inode); 128 | setversion_out: 129 | mnt_drop_write_file(filp); 130 | return ret; 131 | } 132 | case FS_PMFS_FSYNC: { 133 | struct sync_range packet; 134 | copy_from_user(&packet, (void *)arg, sizeof(struct sync_range)); 135 | pmfs_fsync(filp, packet.offset, packet.offset + packet.length, 1); 136 | return 0; 137 | } 138 | case PMFS_PRINT_TIMING: { 139 | pmfs_print_timing_stats(); 140 | return 0; 141 | } 142 | case PMFS_CLEAR_STATS: { 143 | pmfs_clear_stats(); 144 | return 0; 145 | } 146 | default: 147 | return -ENOTTY; 148 | } 149 
| } 150 | 151 | #ifdef CONFIG_COMPAT 152 | long pmfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 153 | { 154 | switch (cmd) { 155 | case FS_IOC32_GETFLAGS: 156 | cmd = FS_IOC_GETFLAGS; 157 | break; 158 | case FS_IOC32_SETFLAGS: 159 | cmd = FS_IOC_SETFLAGS; 160 | break; 161 | case FS_IOC32_GETVERSION: 162 | cmd = FS_IOC_GETVERSION; 163 | break; 164 | case FS_IOC32_SETVERSION: 165 | cmd = FS_IOC_SETVERSION; 166 | break; 167 | default: 168 | return -ENOIOCTLCMD; 169 | } 170 | return pmfs_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); 171 | } 172 | #endif 173 | -------------------------------------------------------------------------------- /balloc.c: -------------------------------------------------------------------------------- 1 | /* 2 | * PMFS emulated persistence. This file contains code to 3 | * handle data blocks of various sizes efficiently. 4 | * 5 | * Persistent Memory File System 6 | * Copyright (c) 2012-2013, Intel Corporation. 7 | * 8 | * This program is free software; you can redistribute it and/or modify it 9 | * under the terms and conditions of the GNU General Public License, 10 | * version 2, as published by the Free Software Foundation. 11 | * 12 | * This program is distributed in the hope it will be useful, but WITHOUT 13 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 15 | * more details. 16 | * 17 | * You should have received a copy of the GNU General Public License along with 18 | * this program; if not, write to the Free Software Foundation, Inc., 19 | * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
20 | */ 21 | 22 | #include 23 | #include 24 | #include "pmfs.h" 25 | 26 | void pmfs_init_blockmap(struct super_block *sb, unsigned long init_used_size) 27 | { 28 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 29 | unsigned long num_used_block; 30 | struct pmfs_blocknode *blknode; 31 | 32 | num_used_block = (init_used_size + sb->s_blocksize - 1) >> 33 | sb->s_blocksize_bits; 34 | 35 | blknode = pmfs_alloc_blocknode(sb); 36 | if (blknode == NULL) 37 | PMFS_ASSERT(0); 38 | blknode->block_low = sbi->block_start; 39 | blknode->block_high = sbi->block_start + num_used_block - 1; 40 | sbi->num_free_blocks -= num_used_block; 41 | list_add(&blknode->link, &sbi->block_inuse_head); 42 | } 43 | 44 | static struct pmfs_blocknode *pmfs_next_blocknode(struct pmfs_blocknode *i, 45 | struct list_head *head) 46 | { 47 | if (list_is_last(&i->link, head)) 48 | return NULL; 49 | return list_first_entry(&i->link, typeof(*i), link); 50 | } 51 | 52 | /* Caller must hold the super_block lock. If start_hint is provided, it is 53 | * only valid until the caller releases the super_block lock. 
*/ 54 | void __pmfs_free_block(struct super_block *sb, unsigned long blocknr, 55 | unsigned short btype, struct pmfs_blocknode **start_hint) 56 | { 57 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 58 | struct list_head *head = &(sbi->block_inuse_head); 59 | unsigned long new_block_low; 60 | unsigned long new_block_high; 61 | unsigned long num_blocks = 0; 62 | struct pmfs_blocknode *i; 63 | struct pmfs_blocknode *free_blocknode= NULL; 64 | struct pmfs_blocknode *curr_node; 65 | 66 | num_blocks = pmfs_get_numblocks(btype); 67 | new_block_low = blocknr; 68 | new_block_high = blocknr + num_blocks - 1; 69 | 70 | BUG_ON(list_empty(head)); 71 | 72 | if (start_hint && *start_hint && 73 | new_block_low >= (*start_hint)->block_low) 74 | i = *start_hint; 75 | else 76 | i = list_first_entry(head, typeof(*i), link); 77 | 78 | list_for_each_entry_from(i, head, link) { 79 | 80 | if (new_block_low > i->block_high) { 81 | /* skip to next blocknode */ 82 | continue; 83 | } 84 | 85 | if ((new_block_low == i->block_low) && 86 | (new_block_high == i->block_high)) { 87 | /* fits entire datablock */ 88 | if (start_hint) 89 | *start_hint = pmfs_next_blocknode(i, head); 90 | list_del(&i->link); 91 | free_blocknode = i; 92 | sbi->num_blocknode_allocated--; 93 | sbi->num_free_blocks += num_blocks; 94 | goto block_found; 95 | } 96 | if ((new_block_low == i->block_low) && 97 | (new_block_high < i->block_high)) { 98 | /* Aligns left */ 99 | i->block_low = new_block_high + 1; 100 | sbi->num_free_blocks += num_blocks; 101 | if (start_hint) 102 | *start_hint = i; 103 | goto block_found; 104 | } 105 | if ((new_block_low > i->block_low) && 106 | (new_block_high == i->block_high)) { 107 | /* Aligns right */ 108 | i->block_high = new_block_low - 1; 109 | sbi->num_free_blocks += num_blocks; 110 | if (start_hint) 111 | *start_hint = pmfs_next_blocknode(i, head); 112 | goto block_found; 113 | } 114 | if ((new_block_low > i->block_low) && 115 | (new_block_high < i->block_high)) { 116 | /* Aligns somewhere in 
the middle */ 117 | curr_node = pmfs_alloc_blocknode(sb); 118 | PMFS_ASSERT(curr_node); 119 | if (curr_node == NULL) { 120 | /* returning without freeing the block*/ 121 | goto block_found; 122 | } 123 | curr_node->block_low = new_block_high + 1; 124 | curr_node->block_high = i->block_high; 125 | i->block_high = new_block_low - 1; 126 | list_add(&curr_node->link, &i->link); 127 | sbi->num_free_blocks += num_blocks; 128 | if (start_hint) 129 | *start_hint = curr_node; 130 | goto block_found; 131 | } 132 | } 133 | 134 | pmfs_error_mng(sb, "Unable to free block %ld\n", blocknr); 135 | 136 | block_found: 137 | 138 | if (free_blocknode) 139 | __pmfs_free_blocknode(free_blocknode); 140 | } 141 | 142 | void pmfs_free_block(struct super_block *sb, unsigned long blocknr, 143 | unsigned short btype) 144 | { 145 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 146 | mutex_lock(&sbi->s_lock); 147 | __pmfs_free_block(sb, blocknr, btype, NULL); 148 | mutex_unlock(&sbi->s_lock); 149 | } 150 | 151 | int pmfs_new_block(struct super_block *sb, unsigned long *blocknr, 152 | unsigned short btype, int zero) 153 | { 154 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 155 | struct list_head *head = &(sbi->block_inuse_head); 156 | struct pmfs_blocknode *i, *next_i; 157 | struct pmfs_blocknode *free_blocknode= NULL; 158 | void *bp; 159 | unsigned long num_blocks = 0; 160 | struct pmfs_blocknode *curr_node; 161 | int errval = 0; 162 | bool found = 0; 163 | unsigned long next_block_low; 164 | unsigned long new_block_low; 165 | unsigned long new_block_high; 166 | 167 | num_blocks = pmfs_get_numblocks(btype); 168 | 169 | mutex_lock(&sbi->s_lock); 170 | 171 | list_for_each_entry(i, head, link) { 172 | if (i->link.next == head) { 173 | next_i = NULL; 174 | next_block_low = sbi->block_end; 175 | } else { 176 | next_i = list_entry(i->link.next, typeof(*i), link); 177 | next_block_low = next_i->block_low; 178 | } 179 | 180 | new_block_low = (i->block_high + num_blocks) & ~(num_blocks - 1); 181 | new_block_high 
= new_block_low + num_blocks - 1; 182 | 183 | if (new_block_high >= next_block_low) { 184 | /* Does not fit - skip to next blocknode */ 185 | continue; 186 | } 187 | 188 | if ((new_block_low == (i->block_high + 1)) && 189 | (new_block_high == (next_block_low - 1))) 190 | { 191 | /* Fill the gap completely */ 192 | if (next_i) { 193 | i->block_high = next_i->block_high; 194 | list_del(&next_i->link); 195 | free_blocknode = next_i; 196 | sbi->num_blocknode_allocated--; 197 | } else { 198 | i->block_high = new_block_high; 199 | } 200 | found = 1; 201 | break; 202 | } 203 | 204 | if ((new_block_low == (i->block_high + 1)) && 205 | (new_block_high < (next_block_low - 1))) { 206 | /* Aligns to left */ 207 | i->block_high = new_block_high; 208 | found = 1; 209 | break; 210 | } 211 | 212 | if ((new_block_low > (i->block_high + 1)) && 213 | (new_block_high == (next_block_low - 1))) { 214 | /* Aligns to right */ 215 | if (next_i) { 216 | /* right node exist */ 217 | next_i->block_low = new_block_low; 218 | } else { 219 | /* right node does NOT exist */ 220 | curr_node = pmfs_alloc_blocknode(sb); 221 | PMFS_ASSERT(curr_node); 222 | if (curr_node == NULL) { 223 | errval = -ENOSPC; 224 | break; 225 | } 226 | curr_node->block_low = new_block_low; 227 | curr_node->block_high = new_block_high; 228 | list_add(&curr_node->link, &i->link); 229 | } 230 | found = 1; 231 | break; 232 | } 233 | 234 | if ((new_block_low > (i->block_high + 1)) && 235 | (new_block_high < (next_block_low - 1))) { 236 | /* Aligns somewhere in the middle */ 237 | curr_node = pmfs_alloc_blocknode(sb); 238 | PMFS_ASSERT(curr_node); 239 | if (curr_node == NULL) { 240 | errval = -ENOSPC; 241 | break; 242 | } 243 | curr_node->block_low = new_block_low; 244 | curr_node->block_high = new_block_high; 245 | list_add(&curr_node->link, &i->link); 246 | found = 1; 247 | break; 248 | } 249 | } 250 | 251 | if (found == 1) { 252 | sbi->num_free_blocks -= num_blocks; 253 | } 254 | 255 | mutex_unlock(&sbi->s_lock); 256 | 257 | 
if (free_blocknode) 258 | __pmfs_free_blocknode(free_blocknode); 259 | 260 | if (found == 0) { 261 | return -ENOSPC; 262 | } 263 | 264 | if (zero) { 265 | size_t size; 266 | bp = pmfs_get_block(sb, pmfs_get_block_off(sb, new_block_low, btype)); 267 | pmfs_memunlock_block(sb, bp); //TBDTBD: Need to fix this 268 | if (btype == PMFS_BLOCK_TYPE_4K) 269 | size = 0x1 << 12; 270 | else if (btype == PMFS_BLOCK_TYPE_2M) 271 | size = 0x1 << 21; 272 | else 273 | size = 0x1 << 30; 274 | memset_nt(bp, 0, size); 275 | pmfs_memlock_block(sb, bp); 276 | } 277 | *blocknr = new_block_low; 278 | 279 | return errval; 280 | } 281 | 282 | unsigned long pmfs_count_free_blocks(struct super_block *sb) 283 | { 284 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 285 | return sbi->num_free_blocks; 286 | } 287 | -------------------------------------------------------------------------------- /dir.c: -------------------------------------------------------------------------------- 1 | /* 2 | * BRIEF DESCRIPTION 3 | * 4 | * File operations for directories. 5 | * 6 | * Copyright 2012-2013 Intel Corporation 7 | * Copyright 2009-2011 Marco Stornelli 8 | * Copyright 2003 Sony Corporation 9 | * Copyright 2003 Matsushita Electric Industrial Co., Ltd. 10 | * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam 11 | * This file is licensed under the terms of the GNU General Public 12 | * License version 2. This program is licensed "as is" without any 13 | * warranty of any kind, whether express or implied. 14 | */ 15 | 16 | #include 17 | #include 18 | #include "pmfs.h" 19 | 20 | /* 21 | * Parent is locked. 
22 | */ 23 | 24 | #define DT2IF(dt) (((dt) << 12) & S_IFMT) 25 | #define IF2DT(sif) (((sif) & S_IFMT) >> 12) 26 | 27 | static int pmfs_add_dirent_to_buf(pmfs_transaction_t *trans, 28 | struct dentry *dentry, struct inode *inode, 29 | struct pmfs_direntry *de, u8 *blk_base, struct pmfs_inode *pidir) 30 | { 31 | struct inode *dir = dentry->d_parent->d_inode; 32 | const char *name = dentry->d_name.name; 33 | int namelen = dentry->d_name.len; 34 | unsigned short reclen; 35 | int nlen, rlen; 36 | char *top; 37 | 38 | reclen = PMFS_DIR_REC_LEN(namelen); 39 | if (!de) { 40 | de = (struct pmfs_direntry *)blk_base; 41 | top = blk_base + dir->i_sb->s_blocksize - reclen; 42 | while ((char *)de <= top) { 43 | #if 0 44 | if (!pmfs_check_dir_entry("pmfs_add_dirent_to_buf", 45 | dir, de, blk_base, offset)) 46 | return -EIO; 47 | if (pmfs_match(namelen, name, de)) 48 | return -EEXIST; 49 | #endif 50 | rlen = le16_to_cpu(de->de_len); 51 | if (de->ino) { 52 | nlen = PMFS_DIR_REC_LEN(de->name_len); 53 | if ((rlen - nlen) >= reclen) 54 | break; 55 | } else if (rlen >= reclen) 56 | break; 57 | de = (struct pmfs_direntry *)((char *)de + rlen); 58 | } 59 | if ((char *)de > top) 60 | return -ENOSPC; 61 | } 62 | rlen = le16_to_cpu(de->de_len); 63 | 64 | if (de->ino) { 65 | struct pmfs_direntry *de1; 66 | pmfs_add_logentry(dir->i_sb, trans, &de->de_len, 67 | sizeof(de->de_len), LE_DATA); 68 | nlen = PMFS_DIR_REC_LEN(de->name_len); 69 | de1 = (struct pmfs_direntry *)((char *)de + nlen); 70 | pmfs_memunlock_block(dir->i_sb, blk_base); 71 | de1->de_len = cpu_to_le16(rlen - nlen); 72 | de->de_len = cpu_to_le16(nlen); 73 | pmfs_memlock_block(dir->i_sb, blk_base); 74 | de = de1; 75 | } else { 76 | pmfs_add_logentry(dir->i_sb, trans, &de->ino, 77 | sizeof(de->ino), LE_DATA); 78 | } 79 | pmfs_memunlock_block(dir->i_sb, blk_base); 80 | /*de->file_type = 0;*/ 81 | if (inode) { 82 | de->ino = cpu_to_le64(inode->i_ino); 83 | /*de->file_type = IF2DT(inode->i_mode); */ 84 | } else { 85 | de->ino = 0; 86 
| } 87 | de->name_len = namelen; 88 | memcpy(de->name, name, namelen); 89 | pmfs_memlock_block(dir->i_sb, blk_base); 90 | pmfs_flush_buffer(de, reclen, false); 91 | /* 92 | * XXX shouldn't update any times until successful 93 | * completion of syscall, but too many callers depend 94 | * on this. 95 | */ 96 | dir->i_mtime = dir->i_ctime = current_time(dir); 97 | /*dir->i_version++; */ 98 | 99 | pmfs_memunlock_inode(dir->i_sb, pidir); 100 | pidir->i_mtime = cpu_to_le32(dir->i_mtime.tv_sec); 101 | pidir->i_ctime = cpu_to_le32(dir->i_ctime.tv_sec); 102 | pmfs_memlock_inode(dir->i_sb, pidir); 103 | return 0; 104 | } 105 | 106 | /* adds a directory entry pointing to the inode. assumes the inode has 107 | * already been logged for consistency 108 | */ 109 | int pmfs_add_entry(pmfs_transaction_t *trans, struct dentry *dentry, 110 | struct inode *inode) 111 | { 112 | struct inode *dir = dentry->d_parent->d_inode; 113 | struct super_block *sb = dir->i_sb; 114 | int retval = -EINVAL; 115 | unsigned long block, blocks; 116 | struct pmfs_direntry *de; 117 | char *blk_base; 118 | struct pmfs_inode *pidir; 119 | 120 | if (!dentry->d_name.len) 121 | return -EINVAL; 122 | 123 | pidir = pmfs_get_inode(sb, dir->i_ino); 124 | pmfs_add_logentry(sb, trans, pidir, MAX_DATA_PER_LENTRY, LE_DATA); 125 | 126 | blocks = dir->i_size >> sb->s_blocksize_bits; 127 | for (block = 0; block < blocks; block++) { 128 | blk_base = 129 | pmfs_get_block(sb, pmfs_find_data_block(dir, block)); 130 | if (!blk_base) { 131 | retval = -EIO; 132 | goto out; 133 | } 134 | retval = pmfs_add_dirent_to_buf(trans, dentry, inode, 135 | NULL, blk_base, pidir); 136 | if (retval != -ENOSPC) 137 | goto out; 138 | } 139 | retval = pmfs_alloc_blocks(trans, dir, blocks, 1, false); 140 | if (retval) 141 | goto out; 142 | 143 | dir->i_size += dir->i_sb->s_blocksize; 144 | pmfs_update_isize(dir, pidir); 145 | 146 | blk_base = pmfs_get_block(sb, pmfs_find_data_block(dir, blocks)); 147 | if (!blk_base) { 148 | retval = -ENOSPC; 
149 | goto out; 150 | } 151 | /* No need to log the changes to this de because its a new block */ 152 | de = (struct pmfs_direntry *)blk_base; 153 | pmfs_memunlock_block(sb, blk_base); 154 | de->ino = 0; 155 | de->de_len = cpu_to_le16(sb->s_blocksize); 156 | pmfs_memlock_block(sb, blk_base); 157 | /* Since this is a new block, no need to log changes to this block */ 158 | retval = pmfs_add_dirent_to_buf(NULL, dentry, inode, de, blk_base, 159 | pidir); 160 | out: 161 | return retval; 162 | } 163 | 164 | /* removes a directory entry pointing to the inode. assumes the inode has 165 | * already been logged for consistency 166 | */ 167 | int pmfs_remove_entry(pmfs_transaction_t *trans, struct dentry *de, 168 | struct inode *inode) 169 | { 170 | struct super_block *sb = inode->i_sb; 171 | struct inode *dir = de->d_parent->d_inode; 172 | struct pmfs_inode *pidir; 173 | struct qstr *entry = &de->d_name; 174 | struct pmfs_direntry *res_entry, *prev_entry; 175 | int retval = -EINVAL; 176 | unsigned long blocks, block; 177 | char *blk_base = NULL; 178 | 179 | if (!de->d_name.len) 180 | return -EINVAL; 181 | 182 | blocks = dir->i_size >> sb->s_blocksize_bits; 183 | 184 | for (block = 0; block < blocks; block++) { 185 | blk_base = 186 | pmfs_get_block(sb, pmfs_find_data_block(dir, block)); 187 | if (!blk_base) 188 | goto out; 189 | if (pmfs_search_dirblock(blk_base, dir, entry, 190 | block << sb->s_blocksize_bits, 191 | &res_entry, &prev_entry) == 1) 192 | break; 193 | } 194 | 195 | if (block == blocks) 196 | goto out; 197 | if (prev_entry) { 198 | pmfs_add_logentry(sb, trans, &prev_entry->de_len, 199 | sizeof(prev_entry->de_len), LE_DATA); 200 | pmfs_memunlock_block(sb, blk_base); 201 | prev_entry->de_len = 202 | cpu_to_le16(le16_to_cpu(prev_entry->de_len) + 203 | le16_to_cpu(res_entry->de_len)); 204 | pmfs_memlock_block(sb, blk_base); 205 | } else { 206 | pmfs_add_logentry(sb, trans, &res_entry->ino, 207 | sizeof(res_entry->ino), LE_DATA); 208 | pmfs_memunlock_block(sb, 
blk_base); 209 | res_entry->ino = 0; 210 | pmfs_memlock_block(sb, blk_base); 211 | } 212 | /*dir->i_version++; */ 213 | dir->i_ctime = dir->i_mtime = current_time(dir); 214 | 215 | pidir = pmfs_get_inode(sb, dir->i_ino); 216 | pmfs_add_logentry(sb, trans, pidir, MAX_DATA_PER_LENTRY, LE_DATA); 217 | 218 | pmfs_memunlock_inode(sb, pidir); 219 | pidir->i_mtime = cpu_to_le32(dir->i_mtime.tv_sec); 220 | pidir->i_ctime = cpu_to_le32(dir->i_ctime.tv_sec); 221 | pmfs_memlock_inode(sb, pidir); 222 | retval = 0; 223 | out: 224 | return retval; 225 | } 226 | 227 | static int pmfs_readdir(struct file *file, struct dir_context *ctx) 228 | { 229 | struct inode *inode = file_inode(file); 230 | struct super_block *sb = inode->i_sb; 231 | struct pmfs_inode *pi; 232 | char *blk_base; 233 | unsigned long offset; 234 | struct pmfs_direntry *de; 235 | ino_t ino; 236 | timing_t readdir_time; 237 | 238 | PMFS_START_TIMING(readdir_t, readdir_time); 239 | 240 | offset = ctx->pos & (sb->s_blocksize - 1); 241 | while (ctx->pos < inode->i_size) { 242 | unsigned long blk = ctx->pos >> sb->s_blocksize_bits; 243 | 244 | blk_base = 245 | pmfs_get_block(sb, pmfs_find_data_block(inode, blk)); 246 | if (!blk_base) { 247 | pmfs_dbg("directory %lu contains a hole at offset %lld\n", 248 | inode->i_ino, ctx->pos); 249 | ctx->pos += sb->s_blocksize - offset; 250 | continue; 251 | } 252 | #if 0 253 | if (file->f_version != inode->i_version) { 254 | for (i = 0; i < sb->s_blocksize && i < offset; ) { 255 | de = (struct pmfs_direntry *)(blk_base + i); 256 | /* It's too expensive to do a full 257 | * dirent test each time round this 258 | * loop, but we do have to test at 259 | * least that it is non-zero. A 260 | * failure will be detected in the 261 | * dirent test below. 
*/ 262 | if (le16_to_cpu(de->de_len) < 263 | PMFS_DIR_REC_LEN(1)) 264 | break; 265 | i += le16_to_cpu(de->de_len); 266 | } 267 | offset = i; 268 | ctx->pos = 269 | (ctx->pos & ~(sb->s_blocksize - 1)) | offset; 270 | file->f_version = inode->i_version; 271 | } 272 | #endif 273 | while (ctx->pos < inode->i_size 274 | && offset < sb->s_blocksize) { 275 | de = (struct pmfs_direntry *)(blk_base + offset); 276 | if (!pmfs_check_dir_entry("pmfs_readdir", inode, de, 277 | blk_base, offset)) { 278 | /* On error, skip to the next block. */ 279 | ctx->pos = ALIGN(ctx->pos, sb->s_blocksize); 280 | break; 281 | } 282 | offset += le16_to_cpu(de->de_len); 283 | if (de->ino) { 284 | ino = le64_to_cpu(de->ino); 285 | pi = pmfs_get_inode(sb, ino); 286 | if (!dir_emit(ctx, de->name, de->name_len, 287 | ino, IF2DT(le16_to_cpu(pi->i_mode)))) 288 | return 0; 289 | } 290 | ctx->pos += le16_to_cpu(de->de_len); 291 | } 292 | offset = 0; 293 | } 294 | PMFS_END_TIMING(readdir_t, readdir_time); 295 | return 0; 296 | } 297 | 298 | const struct file_operations pmfs_dir_operations = { 299 | .read = generic_read_dir, 300 | .iterate = pmfs_readdir, 301 | .fsync = noop_fsync, 302 | .unlocked_ioctl = pmfs_ioctl, 303 | #ifdef CONFIG_COMPAT 304 | .compat_ioctl = pmfs_compat_ioctl, 305 | #endif 306 | }; 307 | -------------------------------------------------------------------------------- /file.c: -------------------------------------------------------------------------------- 1 | /* 2 | * BRIEF DESCRIPTION 3 | * 4 | * File operations for files. 5 | * 6 | * Copyright 2012-2013 Intel Corporation 7 | * Copyright 2009-2011 Marco Stornelli 8 | * Copyright 2003 Sony Corporation 9 | * Copyright 2003 Matsushita Electric Industrial Co., Ltd. 10 | * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam 11 | * This file is licensed under the terms of the GNU General Public 12 | * License version 2. This program is licensed "as is" without any 13 | * warranty of any kind, whether express or implied. 
14 | */ 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include "pmfs.h" 25 | #include "xip.h" 26 | 27 | static inline int pmfs_can_set_blocksize_hint(struct pmfs_inode *pi, 28 | loff_t new_size) 29 | { 30 | /* Currently, we don't deallocate data blocks till the file is deleted. 31 | * So no changing blocksize hints once allocation is done. */ 32 | if (le64_to_cpu(pi->root)) 33 | return 0; 34 | return 1; 35 | } 36 | 37 | int pmfs_set_blocksize_hint(struct super_block *sb, struct pmfs_inode *pi, 38 | loff_t new_size) 39 | { 40 | unsigned short block_type; 41 | 42 | if (!pmfs_can_set_blocksize_hint(pi, new_size)) 43 | return 0; 44 | 45 | if (new_size >= 0x40000000) { /* 1G */ 46 | block_type = PMFS_BLOCK_TYPE_1G; 47 | goto hint_set; 48 | } 49 | 50 | if (new_size >= 0x200000) { /* 2M */ 51 | block_type = PMFS_BLOCK_TYPE_2M; 52 | goto hint_set; 53 | } 54 | 55 | /* defaulting to 4K */ 56 | block_type = PMFS_BLOCK_TYPE_4K; 57 | 58 | hint_set: 59 | pmfs_dbg_verbose( 60 | "Hint: new_size 0x%llx, i_size 0x%llx, root 0x%llx\n", 61 | new_size, pi->i_size, le64_to_cpu(pi->root)); 62 | pmfs_dbg_verbose("Setting the hint to 0x%x\n", block_type); 63 | pmfs_memunlock_inode(sb, pi); 64 | pi->i_blk_type = block_type; 65 | pmfs_memlock_inode(sb, pi); 66 | return 0; 67 | } 68 | 69 | static long pmfs_fallocate(struct file *file, int mode, loff_t offset, 70 | loff_t len) 71 | { 72 | struct inode *inode = file->f_path.dentry->d_inode; 73 | struct super_block *sb = inode->i_sb; 74 | long ret = 0; 75 | unsigned long blocknr, blockoff; 76 | int num_blocks, blocksize_mask; 77 | struct pmfs_inode *pi; 78 | pmfs_transaction_t *trans; 79 | loff_t new_size; 80 | 81 | /* We only support the FALLOC_FL_KEEP_SIZE mode */ 82 | if (mode & ~FALLOC_FL_KEEP_SIZE) 83 | return -EOPNOTSUPP; 84 | 85 | if (S_ISDIR(inode->i_mode)) 86 | return -ENODEV; 87 | 88 | inode_lock(inode); 89 | 90 | new_size = len + offset; 91 | if (!(mode & 
FALLOC_FL_KEEP_SIZE) && new_size > inode->i_size) { 92 | ret = inode_newsize_ok(inode, new_size); 93 | if (ret) 94 | goto out; 95 | } 96 | 97 | pi = pmfs_get_inode(sb, inode->i_ino); 98 | if (!pi) { 99 | ret = -EACCES; 100 | goto out; 101 | } 102 | trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES + 103 | MAX_METABLOCK_LENTRIES); 104 | if (IS_ERR(trans)) { 105 | ret = PTR_ERR(trans); 106 | goto out; 107 | } 108 | pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA); 109 | 110 | /* Set the block size hint */ 111 | pmfs_set_blocksize_hint(sb, pi, new_size); 112 | 113 | blocksize_mask = sb->s_blocksize - 1; 114 | blocknr = offset >> sb->s_blocksize_bits; 115 | blockoff = offset & blocksize_mask; 116 | num_blocks = (blockoff + len + blocksize_mask) >> sb->s_blocksize_bits; 117 | ret = pmfs_alloc_blocks(trans, inode, blocknr, num_blocks, true); 118 | 119 | inode->i_mtime = inode->i_ctime = current_time(inode); 120 | 121 | pmfs_memunlock_inode(sb, pi); 122 | if (ret || (mode & FALLOC_FL_KEEP_SIZE)) { 123 | pi->i_flags |= cpu_to_le32(PMFS_EOFBLOCKS_FL); 124 | } 125 | 126 | if (!(mode & FALLOC_FL_KEEP_SIZE) && new_size > inode->i_size) { 127 | inode->i_size = new_size; 128 | pi->i_size = cpu_to_le64(inode->i_size); 129 | } 130 | pi->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); 131 | pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); 132 | pmfs_memlock_inode(sb, pi); 133 | 134 | pmfs_commit_transaction(sb, trans); 135 | 136 | out: 137 | inode_unlock(inode); 138 | return ret; 139 | } 140 | 141 | static loff_t pmfs_llseek(struct file *file, loff_t offset, int origin) 142 | { 143 | struct inode *inode = file->f_path.dentry->d_inode; 144 | int retval; 145 | 146 | if (origin != SEEK_DATA && origin != SEEK_HOLE) 147 | return generic_file_llseek(file, offset, origin); 148 | 149 | inode_lock(inode); 150 | switch (origin) { 151 | case SEEK_DATA: 152 | retval = pmfs_find_region(inode, &offset, 0); 153 | if (retval) { 154 | inode_unlock(inode); 155 | return retval; 156 | } 
157 | break; 158 | case SEEK_HOLE: 159 | retval = pmfs_find_region(inode, &offset, 1); 160 | if (retval) { 161 | inode_unlock(inode); 162 | return retval; 163 | } 164 | break; 165 | } 166 | 167 | if ((offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) || 168 | offset > inode->i_sb->s_maxbytes) { 169 | inode_unlock(inode); 170 | return -EINVAL; 171 | } 172 | 173 | if (offset != file->f_pos) { 174 | file->f_pos = offset; 175 | file->f_version = 0; 176 | } 177 | 178 | inode_unlock(inode); 179 | return offset; 180 | } 181 | 182 | /* This function is called by both msync() and fsync(). 183 | * TODO: Check if we can avoid calling pmfs_flush_buffer() for fsync. We use 184 | * movnti to write data to files, so we may want to avoid doing unnecessary 185 | * pmfs_flush_buffer() on fsync() */ 186 | int pmfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) 187 | { 188 | /* Sync from start to end[inclusive] */ 189 | struct address_space *mapping = file->f_mapping; 190 | struct inode *inode = mapping->host; 191 | loff_t isize; 192 | timing_t fsync_time; 193 | 194 | PMFS_START_TIMING(fsync_t, fsync_time); 195 | /* if the file is not mmap'ed, there is no need to do clflushes */ 196 | if (mapping_mapped(mapping) == 0) 197 | goto persist; 198 | 199 | end += 1; /* end is inclusive. We like our indices normal please ! 
*/ 200 | 201 | isize = i_size_read(inode); 202 | 203 | if ((unsigned long)end > (unsigned long)isize) 204 | end = isize; 205 | if (!isize || (start >= end)) 206 | { 207 | pmfs_dbg_verbose("[%s:%d] : (ERR) isize(%llx), start(%llx)," 208 | " end(%llx)\n", __func__, __LINE__, isize, start, end); 209 | PMFS_END_TIMING(fsync_t, fsync_time); 210 | return -ENODATA; 211 | } 212 | 213 | /* Align start and end to cacheline boundaries */ 214 | start = start & CACHELINE_MASK; 215 | end = CACHELINE_ALIGN(end); 216 | do { 217 | sector_t block = 0; 218 | void *xip_mem; 219 | pgoff_t pgoff; 220 | loff_t offset; 221 | unsigned long nr_flush_bytes; 222 | 223 | pgoff = start >> PAGE_SHIFT; 224 | offset = start & ~PAGE_MASK; 225 | 226 | nr_flush_bytes = PAGE_SIZE - offset; 227 | if (nr_flush_bytes > (end - start)) 228 | nr_flush_bytes = end - start; 229 | 230 | block = pmfs_find_data_block(inode, (sector_t)pgoff); 231 | 232 | if (block) { 233 | xip_mem = pmfs_get_block(inode->i_sb, block); 234 | /* flush the range */ 235 | atomic64_inc(&fsync_pages); 236 | pmfs_flush_buffer(xip_mem + offset, nr_flush_bytes, 0); 237 | } else { 238 | /* sparse files could have such holes */ 239 | pmfs_dbg_verbose("[%s:%d] : start(%llx), end(%llx)," 240 | " pgoff(%lx)\n", __func__, __LINE__, start, end, pgoff); 241 | break; 242 | } 243 | 244 | start += nr_flush_bytes; 245 | } while (start < end); 246 | persist: 247 | PERSISTENT_MARK(); 248 | PERSISTENT_BARRIER(); 249 | PMFS_END_TIMING(fsync_t, fsync_time); 250 | return 0; 251 | } 252 | 253 | /* This callback is called when a file is closed */ 254 | static int pmfs_flush(struct file *file, fl_owner_t id) 255 | { 256 | int ret = 0; 257 | /* if the file was opened for writing, make it persistent. 258 | * TODO: Should we be more smart to check if the file was modified? 
*/ 259 | if (file->f_mode & FMODE_WRITE) { 260 | PERSISTENT_MARK(); 261 | PERSISTENT_BARRIER(); 262 | } 263 | 264 | return ret; 265 | } 266 | 267 | #if 0 268 | static unsigned long 269 | pmfs_get_unmapped_area(struct file *file, unsigned long addr, 270 | unsigned long len, unsigned long pgoff, 271 | unsigned long flags) 272 | { 273 | unsigned long align_size; 274 | struct vm_area_struct *vma; 275 | struct mm_struct *mm = current->mm; 276 | struct inode *inode = file->f_mapping->host; 277 | struct pmfs_inode *pi = pmfs_get_inode(inode->i_sb, inode->i_ino); 278 | struct vm_unmapped_area_info info; 279 | 280 | if (len > TASK_SIZE) 281 | return -ENOMEM; 282 | 283 | if (pi->i_blk_type == PMFS_BLOCK_TYPE_1G) 284 | align_size = PUD_SIZE; 285 | else if (pi->i_blk_type == PMFS_BLOCK_TYPE_2M) 286 | align_size = PMD_SIZE; 287 | else 288 | align_size = PAGE_SIZE; 289 | 290 | if (flags & MAP_FIXED) { 291 | /* FIXME: We could use 4K mappings as fallback. */ 292 | if (len & (align_size - 1)) 293 | return -EINVAL; 294 | if (addr & (align_size - 1)) 295 | return -EINVAL; 296 | return addr; 297 | } 298 | 299 | if (addr) { 300 | addr = ALIGN(addr, align_size); 301 | vma = find_vma(mm, addr); 302 | if (TASK_SIZE - len >= addr && 303 | (!vma || addr + len <= vma->vm_start)) 304 | return addr; 305 | } 306 | 307 | /* 308 | * FIXME: Using the following values for low_limit and high_limit 309 | * implicitly disables ASLR. Awaiting a better way to have this fixed. 
310 | */ 311 | info.flags = 0; 312 | info.length = len; 313 | info.low_limit = TASK_UNMAPPED_BASE; 314 | info.high_limit = TASK_SIZE; 315 | info.align_mask = align_size - 1; 316 | info.align_offset = 0; 317 | return vm_unmapped_area(&info); 318 | } 319 | #endif 320 | 321 | const struct file_operations pmfs_xip_file_operations = { 322 | .llseek = pmfs_llseek, 323 | .read = pmfs_xip_file_read, 324 | .write = pmfs_xip_file_write, 325 | // .aio_read = xip_file_aio_read, 326 | // .aio_write = xip_file_aio_write, 327 | // .read_iter = generic_file_read_iter, 328 | // .write_iter = generic_file_write_iter, 329 | .mmap = pmfs_xip_file_mmap, 330 | .open = generic_file_open, 331 | .fsync = pmfs_fsync, 332 | .flush = pmfs_flush, 333 | // .get_unmapped_area = pmfs_get_unmapped_area, 334 | .unlocked_ioctl = pmfs_ioctl, 335 | .fallocate = pmfs_fallocate, 336 | #ifdef CONFIG_COMPAT 337 | .compat_ioctl = pmfs_compat_ioctl, 338 | #endif 339 | }; 340 | 341 | const struct inode_operations pmfs_file_inode_operations = { 342 | .setattr = pmfs_notify_change, 343 | .getattr = pmfs_getattr, 344 | .get_acl = NULL, 345 | }; 346 | -------------------------------------------------------------------------------- /pmfs_def.h: -------------------------------------------------------------------------------- 1 | /* 2 | * FILE NAME include/linux/pmfs_fs.h 3 | * 4 | * BRIEF DESCRIPTION 5 | * 6 | * Definitions for the PMFS filesystem. 7 | * 8 | * Copyright 2012-2013 Intel Corporation 9 | * Copyright 2009-2011 Marco Stornelli 10 | * Copyright 2003 Sony Corporation 11 | * Copyright 2003 Matsushita Electric Industrial Co., Ltd. 12 | * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam 13 | * This file is licensed under the terms of the GNU General Public 14 | * License version 2. This program is licensed "as is" without any 15 | * warranty of any kind, whether express or implied. 
16 | */ 17 | #ifndef _LINUX_PMFS_DEF_H 18 | #define _LINUX_PMFS_DEF_H 19 | 20 | #include 21 | #include 22 | 23 | #define PMFS_SUPER_MAGIC 0xEFFC 24 | 25 | /* 26 | * The PMFS filesystem constants/structures 27 | */ 28 | 29 | /* 30 | * Mount flags 31 | */ 32 | #define PMFS_MOUNT_PROTECT 0x000001 /* wprotect CR0.WP */ 33 | #define PMFS_MOUNT_XATTR_USER 0x000002 /* Extended user attributes */ 34 | #define PMFS_MOUNT_POSIX_ACL 0x000004 /* POSIX Access Control Lists */ 35 | #define PMFS_MOUNT_XIP 0x000008 /* Execute in place */ 36 | #define PMFS_MOUNT_ERRORS_CONT 0x000010 /* Continue on errors */ 37 | #define PMFS_MOUNT_ERRORS_RO 0x000020 /* Remount fs ro on errors */ 38 | #define PMFS_MOUNT_ERRORS_PANIC 0x000040 /* Panic on errors */ 39 | #define PMFS_MOUNT_HUGEMMAP 0x000080 /* Huge mappings with mmap */ 40 | #define PMFS_MOUNT_HUGEIOREMAP 0x000100 /* Huge mappings with ioremap */ 41 | #define PMFS_MOUNT_PROTECT_OLD 0x000200 /* wprotect PAGE RW Bit */ 42 | #define PMFS_MOUNT_FORMAT 0x000400 /* was FS formatted on mount? */ 43 | #define PMFS_MOUNT_MOUNTING 0x000800 /* FS currently being mounted */ 44 | 45 | /* 46 | * Maximal count of links to a file 47 | */ 48 | #define PMFS_LINK_MAX 32000 49 | 50 | #define PMFS_DEF_BLOCK_SIZE_4K 4096 51 | 52 | #define PMFS_INODE_SIZE 128 /* must be power of two */ 53 | #define PMFS_INODE_BITS 7 54 | 55 | #define PMFS_NAME_LEN 255 56 | /* 57 | * Structure of a directory entry in PMFS. 
58 | */ 59 | struct pmfs_direntry { 60 | __le64 ino; /* inode no pointed to by this entry */ 61 | __le16 de_len; /* length of this directory entry */ 62 | u8 name_len; /* length of the directory entry name */ 63 | u8 file_type; /* file type */ 64 | char name[PMFS_NAME_LEN]; /* File name */ 65 | }; 66 | 67 | #define PMFS_DIR_PAD 4 68 | #define PMFS_DIR_ROUND (PMFS_DIR_PAD - 1) 69 | #define PMFS_DIR_REC_LEN(name_len) (((name_len) + 12 + PMFS_DIR_ROUND) & \ 70 | ~PMFS_DIR_ROUND) 71 | 72 | /* PMFS supported data blocks */ 73 | #define PMFS_BLOCK_TYPE_4K 0 74 | #define PMFS_BLOCK_TYPE_2M 1 75 | #define PMFS_BLOCK_TYPE_1G 2 76 | #define PMFS_BLOCK_TYPE_MAX 3 77 | 78 | #define META_BLK_SHIFT 9 79 | 80 | /* 81 | * Play with this knob to change the default block type. 82 | * By changing the PMFS_DEFAULT_BLOCK_TYPE to 2M or 1G, 83 | * we should get pretty good coverage in testing. 84 | */ 85 | #define PMFS_DEFAULT_BLOCK_TYPE PMFS_BLOCK_TYPE_4K 86 | 87 | /* 88 | * Structure of an inode in PMFS. Things to keep in mind when modifying it. 89 | * 1) Keep the inode size to within 96 bytes if possible. This is because 90 | * a 64 byte log-entry can store 48 bytes of data and we would like 91 | * to log an inode using only 2 log-entries 92 | * 2) root must be immediately after the qw containing height because we update 93 | * root and height atomically using cmpxchg16b in pmfs_decrease_btree_height 94 | * 3) i_size, i_ctime, and i_mtime must be in that order and i_size must be at 95 | * 16 byte aligned offset from the start of the inode. We use cmpxchg16b to 96 | * update these three fields atomically. 97 | */ 98 | struct pmfs_inode { 99 | /* first 48 bytes */ 100 | __le16 i_rsvd; /* reserved. used to be checksum */ 101 | u8 height; /* height of data b-tree; max 3 for now */ 102 | u8 i_blk_type; /* data block size this inode uses */ 103 | __le32 i_flags; /* Inode flags */ 104 | __le64 root; /* btree root. 
must be below qw w/ height */ 105 | __le64 i_size; /* Size of data in bytes */ 106 | __le32 i_ctime; /* Inode modification time */ 107 | __le32 i_mtime; /* Inode b-tree Modification time */ 108 | __le32 i_dtime; /* Deletion Time */ 109 | __le16 i_mode; /* File mode */ 110 | __le16 i_links_count; /* Links count */ 111 | __le64 i_blocks; /* Blocks count */ 112 | 113 | /* second 48 bytes */ 114 | __le64 i_xattr; /* Extended attribute block */ 115 | __le32 i_uid; /* Owner Uid */ 116 | __le32 i_gid; /* Group Id */ 117 | __le32 i_generation; /* File version (for NFS) */ 118 | __le32 i_atime; /* Access time */ 119 | 120 | struct { 121 | __le32 rdev; /* major/minor # */ 122 | } dev; /* device inode */ 123 | __le32 padding; /* pad to ensure truncate_item starts 8-byte aligned */ 124 | }; 125 | 126 | /* This is a per-inode structure and follows immediately after the 127 | * struct pmfs_inode. It is used to implement the truncate linked list and is 128 | * by pmfs_truncate_add(), pmfs_truncate_del(), and pmfs_recover_truncate_list() 129 | * functions to manage the truncate list */ 130 | struct pmfs_inode_truncate_item { 131 | __le64 i_truncatesize; /* Size of truncated inode */ 132 | __le64 i_next_truncate; /* inode num of the next truncated inode */ 133 | }; 134 | 135 | /* 136 | * #define PMFS_NAME_LEN (PMFS_INODE_SIZE - offsetof(struct pmfs_inode, 137 | * i_d.d_name) - 1) 138 | */ 139 | 140 | /* #define PMFS_SB_SIZE 128 */ /* must be power of two */ 141 | #define PMFS_SB_SIZE 512 /* must be power of two */ 142 | 143 | typedef struct pmfs_journal { 144 | __le64 base; 145 | __le32 size; 146 | __le32 head; 147 | /* the next three fields must be in the same order and together. 
148 | * tail and gen_id must fall in the same 8-byte quadword */ 149 | __le32 tail; 150 | __le16 gen_id; /* generation id of the log */ 151 | __le16 pad; 152 | __le16 redo_logging; 153 | } pmfs_journal_t; 154 | 155 | 156 | /* 157 | * Structure of the super block in PMFS 158 | * The fields are partitioned into static and dynamic fields. The static fields 159 | * never change after file system creation. This was primarily done because 160 | * pmfs_get_block() returns NULL if the block offset is 0 (helps in catching 161 | * bugs). So if we modify any field using journaling (for consistency), we 162 | * will have to modify s_sum which is at offset 0. So journaling code fails. 163 | * This (static+dynamic fields) is a temporary solution and can be avoided 164 | * once the file system becomes stable and pmfs_get_block() returns correct 165 | * pointers even for offset 0. 166 | */ 167 | struct pmfs_super_block { 168 | /* static fields. they never change after file system creation. 169 | * checksum only validates up to s_start_dynamic field below */ 170 | __le16 s_sum; /* checksum of this sb */ 171 | __le16 s_magic; /* magic signature */ 172 | __le32 s_blocksize; /* blocksize in bytes */ 173 | __le64 s_size; /* total size of fs in bytes */ 174 | char s_volume_name[16]; /* volume name */ 175 | /* points to the location of pmfs_journal_t */ 176 | __le64 s_journal_offset; 177 | /* points to the location of struct pmfs_inode for the inode table */ 178 | __le64 s_inode_table_offset; 179 | 180 | __le64 s_start_dynamic; 181 | 182 | /* all the dynamic fields should go here */ 183 | /* s_mtime and s_wtime should be together and their order should not be 184 | * changed. we use an 8 byte write to update both of them atomically */ 185 | __le32 s_mtime; /* mount time */ 186 | __le32 s_wtime; /* write time */ 187 | /* fields for fast mount support. 
Always keep them together */ 188 | __le64 s_num_blocknode_allocated; 189 | __le64 s_num_free_blocks; 190 | __le32 s_inodes_count; 191 | __le32 s_free_inodes_count; 192 | __le32 s_inodes_used_count; 193 | __le32 s_free_inode_hint; 194 | }; 195 | 196 | #define PMFS_SB_STATIC_SIZE(ps) ((u64)&ps->s_start_dynamic - (u64)ps) 197 | 198 | /* the above fast mount fields take total 32 bytes in the super block */ 199 | #define PMFS_FAST_MOUNT_FIELD_SIZE (36) 200 | 201 | /* The root inode follows immediately after the redundant super block */ 202 | #define PMFS_ROOT_INO (PMFS_INODE_SIZE) 203 | #define PMFS_BLOCKNODE_IN0 (PMFS_ROOT_INO + PMFS_INODE_SIZE) 204 | 205 | /* INODE HINT START at 3 */ 206 | #define PMFS_FREE_INODE_HINT_START (3) 207 | 208 | /* ======================= Write ordering ========================= */ 209 | 210 | #define CACHELINE_SIZE (64) 211 | #define CACHELINE_MASK (~(CACHELINE_SIZE - 1)) 212 | #define CACHELINE_ALIGN(addr) (((addr)+CACHELINE_SIZE-1) & CACHELINE_MASK) 213 | 214 | #define X86_FEATURE_PCOMMIT ( 9*32+22) /* PCOMMIT instruction */ 215 | #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ 216 | #define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ 217 | 218 | static inline bool arch_has_pcommit(void) 219 | { 220 | return static_cpu_has(X86_FEATURE_PCOMMIT); 221 | } 222 | 223 | static inline bool arch_has_clwb(void) 224 | { 225 | return static_cpu_has(X86_FEATURE_CLWB); 226 | } 227 | 228 | extern int support_clwb; 229 | extern int support_pcommit; 230 | 231 | #define _mm_clflush(addr)\ 232 | asm volatile("clflush %0" : "+m" (*(volatile char *)(addr))) 233 | #define _mm_clflushopt(addr)\ 234 | asm volatile(".byte 0x66; clflush %0" : "+m" (*(volatile char *)(addr))) 235 | #define _mm_clwb(addr)\ 236 | asm volatile(".byte 0x66; xsaveopt %0" : "+m" (*(volatile char *)(addr))) 237 | #define _mm_pcommit()\ 238 | asm volatile(".byte 0x66, 0x0f, 0xae, 0xf8") 239 | 240 | /* Provides ordering from all previous clflush too */ 241 | 
static inline void PERSISTENT_MARK(void) 242 | { 243 | /* TODO: Fix me. */ 244 | } 245 | 246 | static inline void PERSISTENT_BARRIER(void) 247 | { 248 | asm volatile ("sfence\n" : : ); 249 | if (support_pcommit) { 250 | /* Do nothing */ 251 | } 252 | } 253 | 254 | static inline void pmfs_flush_buffer(void *buf, uint32_t len, bool fence) 255 | { 256 | uint32_t i; 257 | len = len + ((unsigned long)(buf) & (CACHELINE_SIZE - 1)); 258 | if (support_clwb) { 259 | for (i = 0; i < len; i += CACHELINE_SIZE) 260 | _mm_clwb(buf + i); 261 | } else { 262 | for (i = 0; i < len; i += CACHELINE_SIZE) 263 | _mm_clflush(buf + i); 264 | } 265 | /* Do a fence only if asked. We often don't need to do a fence 266 | * immediately after clflush because even if we get context switched 267 | * between clflush and subsequent fence, the context switch operation 268 | * provides implicit fence. */ 269 | if (fence) 270 | PERSISTENT_BARRIER(); 271 | } 272 | 273 | #endif /* _LINUX_PMFS_DEF_H */ 274 | -------------------------------------------------------------------------------- /bbuild.c: -------------------------------------------------------------------------------- 1 | /* 2 | * PMFS emulated persistence. This file contains code to 3 | * handle data blocks of various sizes efficiently. 4 | * 5 | * Persistent Memory File System 6 | * Copyright (c) 2012-2013, Intel Corporation. 7 | * 8 | * This program is free software; you can redistribute it and/or modify it 9 | * under the terms and conditions of the GNU General Public License, 10 | * version 2, as published by the Free Software Foundation. 11 | * 12 | * This program is distributed in the hope it will be useful, but WITHOUT 13 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 15 | * more details. 
16 | * 17 | * You should have received a copy of the GNU General Public License along with 18 | * this program; if not, write to the Free Software Foundation, Inc., 19 | * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 20 | */ 21 | 22 | #include 23 | #include 24 | #include 25 | #include "pmfs.h" 26 | 27 | struct scan_bitmap { 28 | unsigned long bitmap_4k_size; 29 | unsigned long bitmap_2M_size; 30 | unsigned long bitmap_1G_size; 31 | unsigned long *bitmap_4k; 32 | unsigned long *bitmap_2M; 33 | unsigned long *bitmap_1G; 34 | }; 35 | 36 | static void pmfs_clear_datablock_inode(struct super_block *sb) 37 | { 38 | struct pmfs_inode *pi = pmfs_get_inode(sb, PMFS_BLOCKNODE_IN0); 39 | pmfs_transaction_t *trans; 40 | 41 | /* 2 log entry for inode */ 42 | trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES); 43 | if (IS_ERR(trans)) 44 | return; 45 | pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA); 46 | 47 | pmfs_memunlock_inode(sb, pi); 48 | memset(pi, 0, MAX_DATA_PER_LENTRY); 49 | pmfs_memlock_inode(sb, pi); 50 | 51 | /* commit the transaction */ 52 | pmfs_commit_transaction(sb, trans); 53 | } 54 | 55 | static void pmfs_init_blockmap_from_inode(struct super_block *sb) 56 | { 57 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 58 | struct pmfs_inode *pi = pmfs_get_inode(sb, PMFS_BLOCKNODE_IN0); 59 | struct pmfs_blocknode_lowhigh *p = NULL; 60 | struct pmfs_blocknode *blknode; 61 | unsigned long index; 62 | unsigned long blocknr; 63 | unsigned long i; 64 | unsigned long num_blocknode; 65 | u64 bp; 66 | 67 | num_blocknode = sbi->num_blocknode_allocated; 68 | sbi->num_blocknode_allocated = 0; 69 | for (i=0; i> 8; /* 256 Entries in a block */ 74 | bp = __pmfs_find_data_block(sb, pi, blocknr); 75 | p = pmfs_get_block(sb, bp); 76 | } 77 | PMFS_ASSERT(p); 78 | blknode = pmfs_alloc_blocknode(sb); 79 | if (blknode == NULL) 80 | PMFS_ASSERT(0); 81 | blknode->block_low = le64_to_cpu(p[index].block_low); 82 | blknode->block_high = le64_to_cpu(p[index].block_high); 83 
| list_add_tail(&blknode->link, &sbi->block_inuse_head); 84 | } 85 | } 86 | 87 | static bool pmfs_can_skip_full_scan(struct super_block *sb) 88 | { 89 | struct pmfs_inode *pi = pmfs_get_inode(sb, PMFS_BLOCKNODE_IN0); 90 | struct pmfs_super_block *super = pmfs_get_super(sb); 91 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 92 | __le64 root; 93 | unsigned int height, btype; 94 | unsigned long last_blocknr; 95 | 96 | if (!pi->root) 97 | return false; 98 | 99 | sbi->num_blocknode_allocated = 100 | le64_to_cpu(super->s_num_blocknode_allocated); 101 | sbi->num_free_blocks = le64_to_cpu(super->s_num_free_blocks); 102 | sbi->s_inodes_count = le32_to_cpu(super->s_inodes_count); 103 | sbi->s_free_inodes_count = le32_to_cpu(super->s_free_inodes_count); 104 | sbi->s_inodes_used_count = le32_to_cpu(super->s_inodes_used_count); 105 | sbi->s_free_inode_hint = le32_to_cpu(super->s_free_inode_hint); 106 | 107 | pmfs_init_blockmap_from_inode(sb); 108 | 109 | root = pi->root; 110 | height = pi->height; 111 | btype = pi->i_blk_type; 112 | /* pi->i_size can not be zero */ 113 | last_blocknr = (le64_to_cpu(pi->i_size) - 1) >> 114 | pmfs_inode_blk_shift(pi); 115 | 116 | /* Clearing the datablock inode */ 117 | pmfs_clear_datablock_inode(sb); 118 | 119 | pmfs_free_inode_subtree(sb, root, height, btype, last_blocknr); 120 | 121 | return true; 122 | } 123 | 124 | 125 | static int pmfs_allocate_datablock_block_inode(pmfs_transaction_t *trans, 126 | struct super_block *sb, struct pmfs_inode *pi, unsigned long num_blocks) 127 | { 128 | int errval; 129 | 130 | pmfs_memunlock_inode(sb, pi); 131 | pi->i_mode = 0; 132 | pi->i_links_count = cpu_to_le16(1); 133 | pi->i_blk_type = PMFS_BLOCK_TYPE_4K; 134 | pi->i_flags = 0; 135 | pi->height = 0; 136 | pi->i_dtime = 0; 137 | pi->i_size = cpu_to_le64(num_blocks << sb->s_blocksize_bits); 138 | pmfs_memlock_inode(sb, pi); 139 | 140 | errval = __pmfs_alloc_blocks(trans, sb, pi, 0, num_blocks, false); 141 | 142 | return errval; 143 | } 144 | 145 | void 
pmfs_save_blocknode_mappings(struct super_block *sb) 146 | { 147 | unsigned long num_blocks, blocknr; 148 | struct pmfs_inode *pi = pmfs_get_inode(sb, PMFS_BLOCKNODE_IN0); 149 | struct pmfs_blocknode_lowhigh *p; 150 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 151 | struct list_head *head = &(sbi->block_inuse_head); 152 | struct pmfs_blocknode *i; 153 | struct pmfs_super_block *super; 154 | pmfs_transaction_t *trans; 155 | u64 bp; 156 | int j, k; 157 | int errval; 158 | 159 | num_blocks = ((sbi->num_blocknode_allocated * sizeof(struct 160 | pmfs_blocknode_lowhigh) - 1) >> sb->s_blocksize_bits) + 1; 161 | 162 | /* 2 log entry for inode, 2 lentry for super-block */ 163 | trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES + MAX_SB_LENTRIES); 164 | if (IS_ERR(trans)) 165 | return; 166 | 167 | pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA); 168 | 169 | errval = pmfs_allocate_datablock_block_inode(trans, sb, pi, num_blocks); 170 | 171 | if (errval != 0) { 172 | pmfs_dbg("Error saving the blocknode mappings: %d\n", errval); 173 | pmfs_abort_transaction(sb, trans); 174 | return; 175 | } 176 | 177 | j = 0; 178 | k = 0; 179 | p = NULL; 180 | list_for_each_entry(i, head, link) { 181 | blocknr = k >> 8; 182 | if (j == 0) { 183 | /* Find, get and unlock new data block */ 184 | bp = __pmfs_find_data_block(sb, pi, blocknr); 185 | p = pmfs_get_block(sb, bp); 186 | pmfs_memunlock_block(sb, p); 187 | } 188 | p[j].block_low = cpu_to_le64(i->block_low); 189 | p[j].block_high = cpu_to_le64(i->block_high); 190 | j++; 191 | 192 | if (j == 256) { 193 | j = 0; 194 | /* Lock the data block */ 195 | pmfs_memlock_block(sb, p); 196 | pmfs_flush_buffer(p, 4096, false); 197 | } 198 | 199 | k++; 200 | } 201 | 202 | /* Lock the block */ 203 | if (j) { 204 | pmfs_flush_buffer(p, j << 4, false); 205 | pmfs_memlock_block(sb, p); 206 | } 207 | 208 | /* 209 | * save the total allocated blocknode mappings 210 | * in super block 211 | */ 212 | super = pmfs_get_super(sb); 213 | 
pmfs_add_logentry(sb, trans, &super->s_wtime, 214 | PMFS_FAST_MOUNT_FIELD_SIZE, LE_DATA); 215 | 216 | pmfs_memunlock_range(sb, &super->s_wtime, PMFS_FAST_MOUNT_FIELD_SIZE); 217 | 218 | super->s_wtime = cpu_to_le32(get_seconds()); 219 | super->s_num_blocknode_allocated = 220 | cpu_to_le64(sbi->num_blocknode_allocated); 221 | super->s_num_free_blocks = cpu_to_le64(sbi->num_free_blocks); 222 | super->s_inodes_count = cpu_to_le32(sbi->s_inodes_count); 223 | super->s_free_inodes_count = cpu_to_le32(sbi->s_free_inodes_count); 224 | super->s_inodes_used_count = cpu_to_le32(sbi->s_inodes_used_count); 225 | super->s_free_inode_hint = cpu_to_le32(sbi->s_free_inode_hint); 226 | 227 | pmfs_memlock_range(sb, &super->s_wtime, PMFS_FAST_MOUNT_FIELD_SIZE); 228 | /* commit the transaction */ 229 | pmfs_commit_transaction(sb, trans); 230 | } 231 | 232 | static void pmfs_inode_crawl_recursive(struct super_block *sb, 233 | struct scan_bitmap *bm, unsigned long block, 234 | u32 height, u8 btype) 235 | { 236 | __le64 *node; 237 | unsigned int i; 238 | 239 | if (height == 0) { 240 | /* This is the data block */ 241 | if (btype == PMFS_BLOCK_TYPE_4K) { 242 | set_bit(block >> PAGE_SHIFT, bm->bitmap_4k); 243 | } else if (btype == PMFS_BLOCK_TYPE_2M) { 244 | set_bit(block >> PAGE_SHIFT_2M, bm->bitmap_2M); 245 | } else { 246 | set_bit(block >> PAGE_SHIFT_1G, bm->bitmap_1G); 247 | } 248 | return; 249 | } 250 | 251 | node = pmfs_get_block(sb, block); 252 | set_bit(block >> PAGE_SHIFT, bm->bitmap_4k); 253 | for (i = 0; i < (1 << META_BLK_SHIFT); i++) { 254 | if (node[i] == 0) 255 | continue; 256 | pmfs_inode_crawl_recursive(sb, bm, 257 | le64_to_cpu(node[i]), height - 1, btype); 258 | } 259 | } 260 | 261 | static inline void pmfs_inode_crawl(struct super_block *sb, 262 | struct scan_bitmap *bm, struct pmfs_inode *pi) 263 | { 264 | if (pi->root == 0) 265 | return; 266 | pmfs_inode_crawl_recursive(sb, bm, le64_to_cpu(pi->root), pi->height, 267 | pi->i_blk_type); 268 | } 269 | 270 | static void 
pmfs_inode_table_crawl_recursive(struct super_block *sb, 271 | struct scan_bitmap *bm, unsigned long block, 272 | u32 height, u32 btype) 273 | { 274 | __le64 *node; 275 | unsigned int i; 276 | struct pmfs_inode *pi; 277 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 278 | 279 | node = pmfs_get_block(sb, block); 280 | 281 | if (height == 0) { 282 | unsigned int inodes_per_block = INODES_PER_BLOCK(btype); 283 | if (likely(btype == PMFS_BLOCK_TYPE_2M)) 284 | set_bit(block >> PAGE_SHIFT_2M, bm->bitmap_2M); 285 | else 286 | set_bit(block >> PAGE_SHIFT, bm->bitmap_4k); 287 | 288 | sbi->s_inodes_count += inodes_per_block; 289 | for (i = 0; i < inodes_per_block; i++) { 290 | pi = (struct pmfs_inode *)((void *)node + 291 | PMFS_INODE_SIZE * i); 292 | if (le16_to_cpu(pi->i_links_count) == 0 && 293 | (le16_to_cpu(pi->i_mode) == 0 || 294 | le32_to_cpu(pi->i_dtime))) { 295 | /* Empty inode */ 296 | continue; 297 | } 298 | sbi->s_inodes_used_count++; 299 | pmfs_inode_crawl(sb, bm, pi); 300 | } 301 | return; 302 | } 303 | 304 | set_bit(block >> PAGE_SHIFT, bm->bitmap_4k); 305 | for (i = 0; i < (1 << META_BLK_SHIFT); i++) { 306 | if (node[i] == 0) 307 | continue; 308 | pmfs_inode_table_crawl_recursive(sb, bm, 309 | le64_to_cpu(node[i]), height - 1, btype); 310 | } 311 | } 312 | 313 | static int pmfs_alloc_insert_blocknode_map(struct super_block *sb, 314 | unsigned long low, unsigned long high) 315 | { 316 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 317 | struct list_head *head = &(sbi->block_inuse_head); 318 | struct pmfs_blocknode *i, *next_i; 319 | struct pmfs_blocknode *free_blocknode= NULL; 320 | unsigned long num_blocks = 0; 321 | struct pmfs_blocknode *curr_node; 322 | int errval = 0; 323 | bool found = 0; 324 | unsigned long next_block_low; 325 | unsigned long new_block_low; 326 | unsigned long new_block_high; 327 | 328 | //num_blocks = pmfs_get_numblocks(btype); 329 | 330 | new_block_low = low; 331 | new_block_high = high; 332 | num_blocks = high - low + 1; 333 | 334 | 
list_for_each_entry(i, head, link) { 335 | if (i->link.next == head) { 336 | next_i = NULL; 337 | next_block_low = sbi->block_end; 338 | } else { 339 | next_i = list_entry(i->link.next, typeof(*i), link); 340 | next_block_low = next_i->block_low; 341 | } 342 | 343 | 344 | if (new_block_high >= next_block_low) { 345 | /* Does not fit - skip to next blocknode */ 346 | continue; 347 | } 348 | 349 | if ((new_block_low == (i->block_high + 1)) && 350 | (new_block_high == (next_block_low - 1))) 351 | { 352 | /* Fill the gap completely */ 353 | if (next_i) { 354 | i->block_high = next_i->block_high; 355 | list_del(&next_i->link); 356 | free_blocknode = next_i; 357 | } else { 358 | i->block_high = new_block_high; 359 | } 360 | found = 1; 361 | break; 362 | } 363 | 364 | if ((new_block_low == (i->block_high + 1)) && 365 | (new_block_high < (next_block_low - 1))) { 366 | /* Aligns to left */ 367 | i->block_high = new_block_high; 368 | found = 1; 369 | break; 370 | } 371 | 372 | if ((new_block_low > (i->block_high + 1)) && 373 | (new_block_high == (next_block_low - 1))) { 374 | /* Aligns to right */ 375 | if (next_i) { 376 | /* right node exist */ 377 | next_i->block_low = new_block_low; 378 | } else { 379 | /* right node does NOT exist */ 380 | curr_node = pmfs_alloc_blocknode(sb); 381 | PMFS_ASSERT(curr_node); 382 | if (curr_node == NULL) { 383 | errval = -ENOSPC; 384 | break; 385 | } 386 | curr_node->block_low = new_block_low; 387 | curr_node->block_high = new_block_high; 388 | list_add(&curr_node->link, &i->link); 389 | } 390 | found = 1; 391 | break; 392 | } 393 | 394 | if ((new_block_low > (i->block_high + 1)) && 395 | (new_block_high < (next_block_low - 1))) { 396 | /* Aligns somewhere in the middle */ 397 | curr_node = pmfs_alloc_blocknode(sb); 398 | PMFS_ASSERT(curr_node); 399 | if (curr_node == NULL) { 400 | errval = -ENOSPC; 401 | break; 402 | } 403 | curr_node->block_low = new_block_low; 404 | curr_node->block_high = new_block_high; 405 | list_add(&curr_node->link, 
&i->link); 406 | found = 1; 407 | break; 408 | } 409 | } 410 | 411 | if (found == 1) { 412 | sbi->num_free_blocks -= num_blocks; 413 | } 414 | 415 | if (free_blocknode) 416 | pmfs_free_blocknode(sb, free_blocknode); 417 | 418 | if (found == 0) { 419 | return -ENOSPC; 420 | } 421 | 422 | 423 | return errval; 424 | } 425 | 426 | static int __pmfs_build_blocknode_map(struct super_block *sb, 427 | unsigned long *bitmap, unsigned long bsize, unsigned long scale) 428 | { 429 | unsigned long next = 1; 430 | unsigned long low = 0; 431 | 432 | while (1) { 433 | next = find_next_bit(bitmap, bsize, next); 434 | if (next == bsize) 435 | break; 436 | low = next; 437 | next = find_next_zero_bit(bitmap, bsize, next); 438 | if (pmfs_alloc_insert_blocknode_map(sb, low << scale , 439 | (next << scale) - 1)) { 440 | printk("PMFS: Error could not insert 0x%lx-0x%lx\n", 441 | low << scale, ((next << scale) - 1)); 442 | } 443 | if (next == bsize) 444 | break; 445 | } 446 | return 0; 447 | } 448 | 449 | static void pmfs_build_blocknode_map(struct super_block *sb, 450 | struct scan_bitmap *bm) 451 | { 452 | __pmfs_build_blocknode_map(sb, bm->bitmap_4k, bm->bitmap_4k_size * 8, 453 | PAGE_SHIFT - 12); 454 | __pmfs_build_blocknode_map(sb, bm->bitmap_2M, bm->bitmap_2M_size * 8, 455 | PAGE_SHIFT_2M - 12); 456 | __pmfs_build_blocknode_map(sb, bm->bitmap_1G, bm->bitmap_1G_size * 8, 457 | PAGE_SHIFT_1G - 12); 458 | } 459 | 460 | int pmfs_setup_blocknode_map(struct super_block *sb) 461 | { 462 | struct pmfs_super_block *super = pmfs_get_super(sb); 463 | struct pmfs_inode *pi = pmfs_get_inode_table(sb); 464 | pmfs_journal_t *journal = pmfs_get_journal(sb); 465 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 466 | struct scan_bitmap bm; 467 | unsigned long initsize = le64_to_cpu(super->s_size); 468 | bool value = false; 469 | timing_t start, end; 470 | 471 | /* Always check recovery time */ 472 | if (measure_timing == 0) 473 | getrawmonotonic(&start); 474 | 475 | PMFS_START_TIMING(recovery_t, start); 476 
| 477 | mutex_init(&sbi->inode_table_mutex); 478 | sbi->block_start = (unsigned long)0; 479 | sbi->block_end = ((unsigned long)(initsize) >> PAGE_SHIFT); 480 | 481 | value = pmfs_can_skip_full_scan(sb); 482 | if (value) { 483 | pmfs_dbg_verbose("PMFS: Skipping full scan of inodes...\n"); 484 | goto end; 485 | } 486 | 487 | pmfs_dbg("PMFS: Performing failure recovery\n"); 488 | bm.bitmap_4k_size = (initsize >> (PAGE_SHIFT + 0x3)) + 1; 489 | bm.bitmap_2M_size = (initsize >> (PAGE_SHIFT_2M + 0x3)) + 1; 490 | bm.bitmap_1G_size = (initsize >> (PAGE_SHIFT_1G + 0x3)) + 1; 491 | 492 | /* Alloc memory to hold the block alloc bitmap */ 493 | bm.bitmap_4k = kzalloc(bm.bitmap_4k_size, GFP_KERNEL); 494 | bm.bitmap_2M = kzalloc(bm.bitmap_2M_size, GFP_KERNEL); 495 | bm.bitmap_1G = kzalloc(bm.bitmap_1G_size, GFP_KERNEL); 496 | 497 | if (!bm.bitmap_4k || !bm.bitmap_2M || !bm.bitmap_1G) 498 | goto skip; 499 | 500 | /* Clearing the datablock inode */ 501 | pmfs_clear_datablock_inode(sb); 502 | 503 | pmfs_inode_table_crawl_recursive(sb, &bm, le64_to_cpu(pi->root), 504 | pi->height, pi->i_blk_type); 505 | 506 | /* Reserving tow inodes - Inode 0 and Inode for datablock */ 507 | sbi->s_free_inodes_count = sbi->s_inodes_count - 508 | (sbi->s_inodes_used_count + 2); 509 | 510 | /* set the block 0 as this is used */ 511 | sbi->s_free_inode_hint = PMFS_FREE_INODE_HINT_START; 512 | 513 | /* initialize the num_free_blocks to */ 514 | sbi->num_free_blocks = ((unsigned long)(initsize) >> PAGE_SHIFT); 515 | pmfs_init_blockmap(sb, le64_to_cpu(journal->base) + sbi->jsize); 516 | 517 | pmfs_build_blocknode_map(sb, &bm); 518 | 519 | skip: 520 | 521 | kfree(bm.bitmap_4k); 522 | kfree(bm.bitmap_2M); 523 | kfree(bm.bitmap_1G); 524 | 525 | end: 526 | PMFS_END_TIMING(recovery_t, start); 527 | if (measure_timing == 0) { 528 | getrawmonotonic(&end); 529 | Timingstats[recovery_t] += 530 | (end.tv_sec - start.tv_sec) * 1000000000 + 531 | (end.tv_nsec - start.tv_nsec); 532 | } 533 | 534 | return 0; 535 | } 536 
| -------------------------------------------------------------------------------- /xip.c: -------------------------------------------------------------------------------- 1 | /* 2 | * BRIEF DESCRIPTION 3 | * 4 | * XIP operations. 5 | * 6 | * Copyright 2012-2013 Intel Corporation 7 | * Copyright 2009-2011 Marco Stornelli 8 | * This file is licensed under the terms of the GNU General Public 9 | * License version 2. This program is licensed "as is" without any 10 | * warranty of any kind, whether express or implied. 11 | */ 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include "pmfs.h" 20 | #include "xip.h" 21 | 22 | static ssize_t 23 | do_xip_mapping_read(struct address_space *mapping, 24 | struct file_ra_state *_ra, 25 | struct file *filp, 26 | char __user *buf, 27 | size_t len, 28 | loff_t *ppos) 29 | { 30 | struct inode *inode = mapping->host; 31 | pgoff_t index, end_index; 32 | unsigned long offset; 33 | loff_t isize, pos; 34 | size_t copied = 0, error = 0; 35 | timing_t memcpy_time; 36 | 37 | pos = *ppos; 38 | index = pos >> PAGE_SHIFT; 39 | offset = pos & ~PAGE_MASK; 40 | 41 | isize = i_size_read(inode); 42 | if (!isize) 43 | goto out; 44 | 45 | end_index = (isize - 1) >> PAGE_SHIFT; 46 | do { 47 | unsigned long nr, left; 48 | void *xip_mem; 49 | unsigned long xip_pfn; 50 | int zero = 0; 51 | 52 | /* nr is the maximum number of bytes to copy from this page */ 53 | nr = PAGE_SIZE; 54 | if (index >= end_index) { 55 | if (index > end_index) 56 | goto out; 57 | nr = ((isize - 1) & ~PAGE_MASK) + 1; 58 | if (nr <= offset) { 59 | goto out; 60 | } 61 | } 62 | nr = nr - offset; 63 | if (nr > len - copied) 64 | nr = len - copied; 65 | 66 | error = pmfs_get_xip_mem(mapping, index, 0, 67 | &xip_mem, &xip_pfn); 68 | if (unlikely(error)) { 69 | if (error == -ENODATA) { 70 | /* sparse */ 71 | zero = 1; 72 | } else 73 | goto out; 74 | } 75 | 76 | /* If users can be writing to this page using arbitrary 77 | * virtual addresses, 
take care about potential aliasing 78 | * before reading the page on the kernel side. 79 | */ 80 | if (mapping_writably_mapped(mapping)) 81 | /* address based flush */ ; 82 | 83 | /* 84 | * Ok, we have the mem, so now we can copy it to user space... 85 | * 86 | * The actor routine returns how many bytes were actually used.. 87 | * NOTE! This may not be the same as how much of a user buffer 88 | * we filled up (we may be padding etc), so we can only update 89 | * "pos" here (the actor routine has to update the user buffer 90 | * pointers and the remaining count). 91 | */ 92 | PMFS_START_TIMING(memcpy_r_t, memcpy_time); 93 | if (!zero) 94 | left = __copy_to_user(buf+copied, xip_mem+offset, nr); 95 | else 96 | left = __clear_user(buf + copied, nr); 97 | PMFS_END_TIMING(memcpy_r_t, memcpy_time); 98 | 99 | if (left) { 100 | error = -EFAULT; 101 | goto out; 102 | } 103 | 104 | copied += (nr - left); 105 | offset += (nr - left); 106 | index += offset >> PAGE_SHIFT; 107 | offset &= ~PAGE_MASK; 108 | } while (copied < len); 109 | 110 | out: 111 | *ppos = pos + copied; 112 | if (filp) 113 | file_accessed(filp); 114 | 115 | return (copied ? copied : error); 116 | } 117 | 118 | ssize_t 119 | xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) 120 | { 121 | if (!access_ok(VERIFY_WRITE, buf, len)) 122 | return -EFAULT; 123 | 124 | return do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp, 125 | buf, len, ppos); 126 | } 127 | 128 | /* 129 | * Wrappers. We need to use the rcu read lock to avoid 130 | * concurrent truncate operation. No problem for write because we held 131 | * i_mutex. 
132 | */ 133 | ssize_t pmfs_xip_file_read(struct file *filp, char __user *buf, 134 | size_t len, loff_t *ppos) 135 | { 136 | ssize_t res; 137 | timing_t xip_read_time; 138 | 139 | PMFS_START_TIMING(xip_read_t, xip_read_time); 140 | // rcu_read_lock(); 141 | res = xip_file_read(filp, buf, len, ppos); 142 | // rcu_read_unlock(); 143 | PMFS_END_TIMING(xip_read_t, xip_read_time); 144 | return res; 145 | } 146 | 147 | static inline void pmfs_flush_edge_cachelines(loff_t pos, ssize_t len, 148 | void *start_addr) 149 | { 150 | if (unlikely(pos & 0x7)) 151 | pmfs_flush_buffer(start_addr, 1, false); 152 | if (unlikely(((pos + len) & 0x7) && ((pos & (CACHELINE_SIZE - 1)) != 153 | ((pos + len) & (CACHELINE_SIZE - 1))))) 154 | pmfs_flush_buffer(start_addr + len, 1, false); 155 | } 156 | 157 | static inline size_t memcpy_to_nvmm(char *kmem, loff_t offset, 158 | const char __user *buf, size_t bytes) 159 | { 160 | size_t copied; 161 | 162 | if (support_clwb) { 163 | copied = bytes - __copy_from_user(kmem + offset, buf, bytes); 164 | pmfs_flush_buffer(kmem + offset, copied, 0); 165 | } else { 166 | copied = bytes - __copy_from_user_inatomic_nocache(kmem + 167 | offset, buf, bytes); 168 | } 169 | 170 | return copied; 171 | } 172 | 173 | static ssize_t 174 | __pmfs_xip_file_write(struct address_space *mapping, const char __user *buf, 175 | size_t count, loff_t pos, loff_t *ppos) 176 | { 177 | struct inode *inode = mapping->host; 178 | struct super_block *sb = inode->i_sb; 179 | long status = 0; 180 | size_t bytes; 181 | ssize_t written = 0; 182 | struct pmfs_inode *pi; 183 | timing_t memcpy_time, write_time; 184 | 185 | PMFS_START_TIMING(internal_write_t, write_time); 186 | pi = pmfs_get_inode(sb, inode->i_ino); 187 | do { 188 | unsigned long index; 189 | unsigned long offset; 190 | size_t copied; 191 | void *xmem; 192 | unsigned long xpfn; 193 | 194 | offset = (pos & (sb->s_blocksize - 1)); /* Within page */ 195 | index = pos >> sb->s_blocksize_bits; 196 | bytes = sb->s_blocksize - 
offset; 197 | if (bytes > count) 198 | bytes = count; 199 | 200 | status = pmfs_get_xip_mem(mapping, index, 1, &xmem, &xpfn); 201 | if (status) 202 | break; 203 | 204 | PMFS_START_TIMING(memcpy_w_t, memcpy_time); 205 | pmfs_xip_mem_protect(sb, xmem + offset, bytes, 1); 206 | copied = memcpy_to_nvmm((char *)xmem, offset, buf, bytes); 207 | pmfs_xip_mem_protect(sb, xmem + offset, bytes, 0); 208 | PMFS_END_TIMING(memcpy_w_t, memcpy_time); 209 | 210 | /* if start or end dest address is not 8 byte aligned, 211 | * __copy_from_user_inatomic_nocache uses cacheable instructions 212 | * (instead of movnti) to write. So flush those cachelines. */ 213 | pmfs_flush_edge_cachelines(pos, copied, xmem + offset); 214 | 215 | if (likely(copied > 0)) { 216 | status = copied; 217 | 218 | if (status >= 0) { 219 | written += status; 220 | count -= status; 221 | pos += status; 222 | buf += status; 223 | } 224 | } 225 | if (unlikely(copied != bytes)) 226 | if (status >= 0) 227 | status = -EFAULT; 228 | if (status < 0) 229 | break; 230 | } while (count); 231 | *ppos = pos; 232 | /* 233 | * No need to use i_size_read() here, the i_size 234 | * cannot change under us because we hold i_mutex. 235 | */ 236 | if (pos > inode->i_size) { 237 | i_size_write(inode, pos); 238 | pmfs_update_isize(inode, pi); 239 | } 240 | 241 | PMFS_END_TIMING(internal_write_t, write_time); 242 | return written ? written : status; 243 | } 244 | 245 | /* optimized path for file write that doesn't require a transaction. In this 246 | * path we don't need to allocate any new data blocks. 
So the only meta-data 247 | * modified in path is inode's i_size, i_ctime, and i_mtime fields */ 248 | static ssize_t pmfs_file_write_fast(struct super_block *sb, struct inode *inode, 249 | struct pmfs_inode *pi, const char __user *buf, size_t count, loff_t pos, 250 | loff_t *ppos, u64 block) 251 | { 252 | void *xmem = pmfs_get_block(sb, block); 253 | size_t copied, ret = 0, offset; 254 | timing_t memcpy_time; 255 | 256 | offset = pos & (sb->s_blocksize - 1); 257 | 258 | PMFS_START_TIMING(memcpy_w_t, memcpy_time); 259 | pmfs_xip_mem_protect(sb, xmem + offset, count, 1); 260 | copied = memcpy_to_nvmm((char *)xmem, offset, buf, count); 261 | pmfs_xip_mem_protect(sb, xmem + offset, count, 0); 262 | PMFS_END_TIMING(memcpy_w_t, memcpy_time); 263 | 264 | pmfs_flush_edge_cachelines(pos, copied, xmem + offset); 265 | 266 | if (likely(copied > 0)) { 267 | pos += copied; 268 | ret = copied; 269 | } 270 | if (unlikely(copied != count && copied == 0)) 271 | ret = -EFAULT; 272 | *ppos = pos; 273 | inode->i_ctime = inode->i_mtime = current_time(inode); 274 | if (pos > inode->i_size) { 275 | /* make sure written data is persistent before updating 276 | * time and size */ 277 | PERSISTENT_MARK(); 278 | i_size_write(inode, pos); 279 | PERSISTENT_BARRIER(); 280 | pmfs_memunlock_inode(sb, pi); 281 | pmfs_update_time_and_size(inode, pi); 282 | pmfs_memlock_inode(sb, pi); 283 | } else { 284 | u64 c_m_time; 285 | /* update c_time and m_time atomically. We don't need to make the data 286 | * persistent because the expectation is that the close() or an explicit 287 | * fsync will do that. 
*/ 288 | c_m_time = (inode->i_ctime.tv_sec & 0xFFFFFFFF); 289 | c_m_time = c_m_time | (c_m_time << 32); 290 | pmfs_memunlock_inode(sb, pi); 291 | pmfs_memcpy_atomic(&pi->i_ctime, &c_m_time, 8); 292 | pmfs_memlock_inode(sb, pi); 293 | } 294 | pmfs_flush_buffer(pi, 1, false); 295 | return ret; 296 | } 297 | 298 | /* 299 | * blk_off is used in different ways depending on whether the edge block is 300 | * at the beginning or end of the write. If it is at the beginning, we zero from 301 | * start-of-block to 'blk_off'. If it is the end block, we zero from 'blk_off' to 302 | * end-of-block 303 | */ 304 | static inline void pmfs_clear_edge_blk (struct super_block *sb, struct 305 | pmfs_inode *pi, bool new_blk, unsigned long block, size_t blk_off, 306 | bool is_end_blk) 307 | { 308 | void *ptr; 309 | size_t count; 310 | unsigned long blknr; 311 | 312 | if (new_blk) { 313 | blknr = block >> (pmfs_inode_blk_shift(pi) - 314 | sb->s_blocksize_bits); 315 | ptr = pmfs_get_block(sb, __pmfs_find_data_block(sb, pi, blknr)); 316 | if (ptr != NULL) { 317 | if (is_end_blk) { 318 | ptr = ptr + blk_off - (blk_off % 8); 319 | count = pmfs_inode_blk_size(pi) - 320 | blk_off + (blk_off % 8); 321 | } else 322 | count = blk_off + (8 - (blk_off % 8)); 323 | pmfs_memunlock_range(sb, ptr, pmfs_inode_blk_size(pi)); 324 | memset_nt(ptr, 0, count); 325 | pmfs_memlock_range(sb, ptr, pmfs_inode_blk_size(pi)); 326 | } 327 | } 328 | } 329 | 330 | ssize_t pmfs_xip_file_write(struct file *filp, const char __user *buf, 331 | size_t len, loff_t *ppos) 332 | { 333 | struct address_space *mapping = filp->f_mapping; 334 | struct inode *inode = mapping->host; 335 | struct super_block *sb = inode->i_sb; 336 | pmfs_transaction_t *trans; 337 | struct pmfs_inode *pi; 338 | ssize_t written = 0; 339 | loff_t pos; 340 | u64 block; 341 | bool new_sblk = false, new_eblk = false; 342 | size_t count, offset, eblk_offset, ret; 343 | unsigned long start_blk, end_blk, num_blocks, max_logentries; 344 | bool same_block; 345 
| timing_t xip_write_time, xip_write_fast_time; 346 | 347 | PMFS_START_TIMING(xip_write_t, xip_write_time); 348 | 349 | sb_start_write(inode->i_sb); 350 | inode_lock(inode); 351 | 352 | if (!access_ok(VERIFY_READ, buf, len)) { 353 | ret = -EFAULT; 354 | goto out; 355 | } 356 | pos = *ppos; 357 | count = len; 358 | if (count == 0) { 359 | ret = 0; 360 | goto out; 361 | } 362 | 363 | pi = pmfs_get_inode(sb, inode->i_ino); 364 | 365 | offset = pos & (sb->s_blocksize - 1); 366 | num_blocks = ((count + offset - 1) >> sb->s_blocksize_bits) + 1; 367 | /* offset in the actual block size block */ 368 | offset = pos & (pmfs_inode_blk_size(pi) - 1); 369 | start_blk = pos >> sb->s_blocksize_bits; 370 | end_blk = start_blk + num_blocks - 1; 371 | 372 | block = pmfs_find_data_block(inode, start_blk); 373 | 374 | /* Referring to the inode's block size, not 4K */ 375 | same_block = (((count + offset - 1) >> 376 | pmfs_inode_blk_shift(pi)) == 0) ? 1 : 0; 377 | if (block && same_block) { 378 | PMFS_START_TIMING(xip_write_fast_t, xip_write_fast_time); 379 | ret = pmfs_file_write_fast(sb, inode, pi, buf, count, pos, 380 | ppos, block); 381 | PMFS_END_TIMING(xip_write_fast_t, xip_write_fast_time); 382 | goto out; 383 | } 384 | max_logentries = num_blocks / MAX_PTRS_PER_LENTRY + 2; 385 | if (max_logentries > MAX_METABLOCK_LENTRIES) 386 | max_logentries = MAX_METABLOCK_LENTRIES; 387 | 388 | trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES + max_logentries); 389 | if (IS_ERR(trans)) { 390 | ret = PTR_ERR(trans); 391 | goto out; 392 | } 393 | pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA); 394 | 395 | ret = file_remove_privs(filp); 396 | if (ret) { 397 | pmfs_abort_transaction(sb, trans); 398 | goto out; 399 | } 400 | inode->i_ctime = inode->i_mtime = current_time(inode); 401 | pmfs_update_time(inode, pi); 402 | 403 | /* We avoid zeroing the alloc'd range, which is going to be overwritten 404 | * by this system call anyway */ 405 | if (offset != 0) { 406 | if 
(pmfs_find_data_block(inode, start_blk) == 0) 407 | new_sblk = true; 408 | } 409 | 410 | eblk_offset = (pos + count) & (pmfs_inode_blk_size(pi) - 1); 411 | if ((eblk_offset != 0) && 412 | (pmfs_find_data_block(inode, end_blk) == 0)) 413 | new_eblk = true; 414 | 415 | /* don't zero-out the allocated blocks */ 416 | pmfs_alloc_blocks(trans, inode, start_blk, num_blocks, false); 417 | 418 | /* now zero out the edge blocks which will be partially written */ 419 | pmfs_clear_edge_blk(sb, pi, new_sblk, start_blk, offset, false); 420 | pmfs_clear_edge_blk(sb, pi, new_eblk, end_blk, eblk_offset, true); 421 | 422 | written = __pmfs_xip_file_write(mapping, buf, count, pos, ppos); 423 | if (written < 0 || written != count) 424 | pmfs_dbg_verbose("write incomplete/failed: written %ld len %ld" 425 | " pos %llx start_blk %lx num_blocks %lx\n", 426 | written, count, pos, start_blk, num_blocks); 427 | 428 | pmfs_commit_transaction(sb, trans); 429 | ret = written; 430 | out: 431 | inode_unlock(inode); 432 | sb_end_write(inode->i_sb); 433 | PMFS_END_TIMING(xip_write_t, xip_write_time); 434 | return ret; 435 | } 436 | 437 | /* OOM err return with xip file fault handlers doesn't mean anything. 438 | * It would just cause the OS to go an unnecessary killing spree ! 439 | */ 440 | static int __pmfs_xip_file_fault(struct vm_area_struct *vma, 441 | struct vm_fault *vmf) 442 | { 443 | struct address_space *mapping = vma->vm_file->f_mapping; 444 | struct inode *inode = mapping->host; 445 | pgoff_t size; 446 | void *xip_mem; 447 | unsigned long xip_pfn; 448 | int err; 449 | 450 | size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; 451 | if (vmf->pgoff >= size) { 452 | pmfs_dbg("[%s:%d] pgoff >= size(SIGBUS). 
vm_start(0x%lx)," 453 | " vm_end(0x%lx), pgoff(0x%lx), VA(%lx), size 0x%lx\n", 454 | __func__, __LINE__, vma->vm_start, vma->vm_end, 455 | vmf->pgoff, (unsigned long)vmf->address, size); 456 | return VM_FAULT_SIGBUS; 457 | } 458 | 459 | err = pmfs_get_xip_mem(mapping, vmf->pgoff, 1, &xip_mem, &xip_pfn); 460 | if (unlikely(err)) { 461 | pmfs_dbg("[%s:%d] get_xip_mem failed(OOM). vm_start(0x%lx)," 462 | " vm_end(0x%lx), pgoff(0x%lx), VA(%lx)\n", 463 | __func__, __LINE__, vma->vm_start, vma->vm_end, 464 | vmf->pgoff, (unsigned long)vmf->address); 465 | return VM_FAULT_SIGBUS; 466 | } 467 | 468 | pmfs_dbg_mmapv("[%s:%d] vm_start(0x%lx), vm_end(0x%lx), pgoff(0x%lx), " 469 | "BlockSz(0x%lx), VA(0x%lx)->PA(0x%lx)\n", __func__, 470 | __LINE__, vma->vm_start, vma->vm_end, vmf->pgoff, 471 | PAGE_SIZE, (unsigned long)vmf->address, 472 | (unsigned long)xip_pfn << PAGE_SHIFT); 473 | 474 | err = vm_insert_mixed(vma, (unsigned long)vmf->address, 475 | pfn_to_pfn_t(xip_pfn)); 476 | 477 | if (err == -ENOMEM) 478 | return VM_FAULT_SIGBUS; 479 | /* 480 | * err == -EBUSY is fine, we've raced against another thread 481 | * that faulted-in the same page 482 | */ 483 | if (err != -EBUSY) 484 | BUG_ON(err); 485 | return VM_FAULT_NOPAGE; 486 | } 487 | 488 | static int pmfs_xip_file_fault(struct vm_fault *vmf) 489 | { 490 | int ret = 0; 491 | timing_t fault_time; 492 | 493 | PMFS_START_TIMING(mmap_fault_t, fault_time); 494 | rcu_read_lock(); 495 | ret = __pmfs_xip_file_fault(vmf->vma, vmf); 496 | rcu_read_unlock(); 497 | PMFS_END_TIMING(mmap_fault_t, fault_time); 498 | return ret; 499 | } 500 | 501 | static int pmfs_find_and_alloc_blocks(struct inode *inode, sector_t iblock, 502 | sector_t *data_block, int create) 503 | { 504 | int err = -EIO; 505 | u64 block; 506 | pmfs_transaction_t *trans; 507 | struct pmfs_inode *pi; 508 | 509 | block = pmfs_find_data_block(inode, iblock); 510 | 511 | if (!block) { 512 | struct super_block *sb = inode->i_sb; 513 | if (!create) { 514 | err = -ENODATA; 
515 | goto err; 516 | } 517 | 518 | pi = pmfs_get_inode(sb, inode->i_ino); 519 | trans = pmfs_current_transaction(); 520 | if (trans) { 521 | err = pmfs_alloc_blocks(trans, inode, iblock, 1, true); 522 | if (err) { 523 | pmfs_dbg_verbose("[%s:%d] Alloc failed!\n", 524 | __func__, __LINE__); 525 | goto err; 526 | } 527 | } else { 528 | /* 1 lentry for inode, 1 lentry for inode's b-tree */ 529 | trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES); 530 | if (IS_ERR(trans)) { 531 | err = PTR_ERR(trans); 532 | goto err; 533 | } 534 | 535 | rcu_read_unlock(); 536 | inode_lock(inode); 537 | 538 | pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, 539 | LE_DATA); 540 | err = pmfs_alloc_blocks(trans, inode, iblock, 1, true); 541 | 542 | pmfs_commit_transaction(sb, trans); 543 | 544 | inode_unlock(inode); 545 | rcu_read_lock(); 546 | if (err) { 547 | pmfs_dbg_verbose("[%s:%d] Alloc failed!\n", 548 | __func__, __LINE__); 549 | goto err; 550 | } 551 | } 552 | block = pmfs_find_data_block(inode, iblock); 553 | if (!block) { 554 | pmfs_dbg("[%s:%d] But alloc didn't fail!\n", 555 | __func__, __LINE__); 556 | err = -ENODATA; 557 | goto err; 558 | } 559 | } 560 | pmfs_dbg_mmapvv("iblock 0x%lx allocated_block 0x%llx\n", iblock, 561 | block); 562 | 563 | *data_block = block; 564 | err = 0; 565 | 566 | err: 567 | return err; 568 | } 569 | 570 | static inline int __pmfs_get_block(struct inode *inode, pgoff_t pgoff, 571 | int create, sector_t *result) 572 | { 573 | int rc = 0; 574 | 575 | rc = pmfs_find_and_alloc_blocks(inode, (sector_t)pgoff, result, 576 | create); 577 | return rc; 578 | } 579 | 580 | int pmfs_get_xip_mem(struct address_space *mapping, pgoff_t pgoff, int create, 581 | void **kmem, unsigned long *pfn) 582 | { 583 | int rc; 584 | sector_t block = 0; 585 | struct inode *inode = mapping->host; 586 | 587 | rc = __pmfs_get_block(inode, pgoff, create, &block); 588 | if (rc) { 589 | pmfs_dbg1("[%s:%d] rc(%d), sb->physaddr(0x%llx), block(0x%llx)," 590 | " pgoff(0x%lx), 
flag(0x%x), PFN(0x%lx)\n", __func__, 591 | __LINE__, rc, PMFS_SB(inode->i_sb)->phys_addr, 592 | block, pgoff, create, *pfn); 593 | return rc; 594 | } 595 | 596 | *kmem = pmfs_get_block(inode->i_sb, block); 597 | *pfn = pmfs_get_pfn(inode->i_sb, block); 598 | 599 | pmfs_dbg_mmapvv("[%s:%d] sb->physaddr(0x%llx), block(0x%lx)," 600 | " pgoff(0x%lx), flag(0x%x), PFN(0x%lx)\n", __func__, __LINE__, 601 | PMFS_SB(inode->i_sb)->phys_addr, block, pgoff, create, *pfn); 602 | return 0; 603 | } 604 | 605 | static const struct vm_operations_struct pmfs_xip_vm_ops = { 606 | .fault = pmfs_xip_file_fault, 607 | }; 608 | 609 | int pmfs_xip_file_mmap(struct file *file, struct vm_area_struct *vma) 610 | { 611 | // BUG_ON(!file->f_mapping->a_ops->get_xip_mem); 612 | 613 | file_accessed(file); 614 | 615 | vma->vm_flags |= VM_MIXEDMAP; 616 | 617 | vma->vm_ops = &pmfs_xip_vm_ops; 618 | pmfs_dbg_mmap4k("[%s:%d] MMAP 4KPAGE vm_start(0x%lx)," 619 | " vm_end(0x%lx), vm_flags(0x%lx), " 620 | "vm_page_prot(0x%lx)\n", __func__, 621 | __LINE__, vma->vm_start, vma->vm_end, 622 | vma->vm_flags, pgprot_val(vma->vm_page_prot)); 623 | 624 | return 0; 625 | } 626 | -------------------------------------------------------------------------------- /pmfs.h: -------------------------------------------------------------------------------- 1 | /* 2 | * BRIEF DESCRIPTION 3 | * 4 | * Definitions for the PMFS filesystem. 5 | * 6 | * Copyright 2012-2013 Intel Corporation 7 | * Copyright 2009-2011 Marco Stornelli 8 | * Copyright 2003 Sony Corporation 9 | * Copyright 2003 Matsushita Electric Industrial Co., Ltd. 10 | * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam 11 | * This file is licensed under the terms of the GNU General Public 12 | * License version 2. This program is licensed "as is" without any 13 | * warranty of any kind, whether express or implied. 
14 | */ 15 | #ifndef __PMFS_H 16 | #define __PMFS_H 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | #include "pmfs_def.h" 28 | #include "journal.h" 29 | 30 | #define PAGE_SHIFT_2M 21 31 | #define PAGE_SHIFT_1G 30 32 | 33 | #define PMFS_ASSERT(x) \ 34 | if (!(x)) { \ 35 | printk(KERN_WARNING "assertion failed %s:%d: %s\n", \ 36 | __FILE__, __LINE__, #x); \ 37 | } 38 | 39 | /* 40 | * Debug code 41 | */ 42 | #ifdef pr_fmt 43 | #undef pr_fmt 44 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 45 | #endif 46 | 47 | /* #define pmfs_dbg(s, args...) pr_debug(s, ## args) */ 48 | #define pmfs_dbg(s, args ...) pr_info(s, ## args) 49 | #define pmfs_dbg1(s, args ...) 50 | #define pmfs_err(sb, s, args ...) pmfs_error_mng(sb, s, ## args) 51 | #define pmfs_warn(s, args ...) pr_warning(s, ## args) 52 | #define pmfs_info(s, args ...) pr_info(s, ## args) 53 | 54 | extern unsigned int pmfs_dbgmask; 55 | #define PMFS_DBGMASK_MMAPHUGE (0x00000001) 56 | #define PMFS_DBGMASK_MMAP4K (0x00000002) 57 | #define PMFS_DBGMASK_MMAPVERBOSE (0x00000004) 58 | #define PMFS_DBGMASK_MMAPVVERBOSE (0x00000008) 59 | #define PMFS_DBGMASK_VERBOSE (0x00000010) 60 | #define PMFS_DBGMASK_TRANSACTION (0x00000020) 61 | 62 | #define pmfs_dbg_mmaphuge(s, args ...) \ 63 | ((pmfs_dbgmask & PMFS_DBGMASK_MMAPHUGE) ? pmfs_dbg(s, args) : 0) 64 | #define pmfs_dbg_mmap4k(s, args ...) \ 65 | ((pmfs_dbgmask & PMFS_DBGMASK_MMAP4K) ? pmfs_dbg(s, args) : 0) 66 | #define pmfs_dbg_mmapv(s, args ...) \ 67 | ((pmfs_dbgmask & PMFS_DBGMASK_MMAPVERBOSE) ? pmfs_dbg(s, args) : 0) 68 | #define pmfs_dbg_mmapvv(s, args ...) \ 69 | ((pmfs_dbgmask & PMFS_DBGMASK_MMAPVVERBOSE) ? pmfs_dbg(s, args) : 0) 70 | 71 | #define pmfs_dbg_verbose(s, args ...) \ 72 | ((pmfs_dbgmask & PMFS_DBGMASK_VERBOSE) ? pmfs_dbg(s, ##args) : 0) 73 | #define pmfs_dbg_trans(s, args ...) \ 74 | ((pmfs_dbgmask & PMFS_DBGMASK_TRANSACTION) ? 
pmfs_dbg(s, ##args) : 0) 75 | 76 | #define pmfs_set_bit __test_and_set_bit_le 77 | #define pmfs_clear_bit __test_and_clear_bit_le 78 | #define pmfs_find_next_zero_bit find_next_zero_bit_le 79 | 80 | #define clear_opt(o, opt) (o &= ~PMFS_MOUNT_ ## opt) 81 | #define set_opt(o, opt) (o |= PMFS_MOUNT_ ## opt) 82 | #define test_opt(sb, opt) (PMFS_SB(sb)->s_mount_opt & PMFS_MOUNT_ ## opt) 83 | 84 | #define PMFS_LARGE_INODE_TABLE_SIZE (0x200000) 85 | /* PMFS size threshold for using 2M blocks for inode table */ 86 | #define PMFS_LARGE_INODE_TABLE_THREASHOLD (0x20000000) 87 | /* 88 | * pmfs inode flags 89 | * 90 | * PMFS_EOFBLOCKS_FL There are blocks allocated beyond eof 91 | */ 92 | #define PMFS_EOFBLOCKS_FL 0x20000000 93 | /* Flags that should be inherited by new inodes from their parent. */ 94 | #define PMFS_FL_INHERITED (FS_SECRM_FL | FS_UNRM_FL | FS_COMPR_FL | \ 95 | FS_SYNC_FL | FS_NODUMP_FL | FS_NOATIME_FL | \ 96 | FS_COMPRBLK_FL | FS_NOCOMP_FL | FS_JOURNAL_DATA_FL | \ 97 | FS_NOTAIL_FL | FS_DIRSYNC_FL) 98 | /* Flags that are appropriate for regular files (all but dir-specific ones). */ 99 | #define PMFS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) 100 | /* Flags that are appropriate for non-directories/regular files. 
*/ 101 | #define PMFS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) 102 | #define PMFS_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | PMFS_EOFBLOCKS_FL) 103 | 104 | #define INODES_PER_BLOCK(bt) (1 << (blk_type_to_shift[bt] - PMFS_INODE_BITS)) 105 | 106 | extern unsigned int blk_type_to_shift[PMFS_BLOCK_TYPE_MAX]; 107 | extern unsigned int blk_type_to_size[PMFS_BLOCK_TYPE_MAX]; 108 | 109 | /* ======================= Timing ========================= */ 110 | enum timing_category { 111 | create_t, 112 | unlink_t, 113 | readdir_t, 114 | xip_read_t, 115 | xip_write_t, 116 | xip_write_fast_t, 117 | internal_write_t, 118 | memcpy_r_t, 119 | memcpy_w_t, 120 | alloc_blocks_t, 121 | new_trans_t, 122 | add_log_t, 123 | commit_trans_t, 124 | mmap_fault_t, 125 | fsync_t, 126 | free_tree_t, 127 | evict_inode_t, 128 | recovery_t, 129 | TIMING_NUM, 130 | }; 131 | 132 | extern const char *Timingstring[TIMING_NUM]; 133 | extern unsigned long long Timingstats[TIMING_NUM]; 134 | extern u64 Countstats[TIMING_NUM]; 135 | 136 | extern int measure_timing; 137 | extern int support_clwb; 138 | 139 | extern atomic64_t fsync_pages; 140 | 141 | typedef struct timespec timing_t; 142 | 143 | #define PMFS_START_TIMING(name, start) \ 144 | {if (measure_timing) getrawmonotonic(&start);} 145 | 146 | #define PMFS_END_TIMING(name, start) \ 147 | {if (measure_timing) { \ 148 | timing_t end; \ 149 | getrawmonotonic(&end); \ 150 | Timingstats[name] += \ 151 | (end.tv_sec - start.tv_sec) * 1000000000 + \ 152 | (end.tv_nsec - start.tv_nsec); \ 153 | } \ 154 | Countstats[name]++; \ 155 | } 156 | 157 | /* Function Prototypes */ 158 | extern void pmfs_error_mng(struct super_block *sb, const char *fmt, ...); 159 | 160 | /* file.c */ 161 | extern int pmfs_mmap(struct file *file, struct vm_area_struct *vma); 162 | 163 | /* balloc.c */ 164 | int pmfs_setup_blocknode_map(struct super_block *sb); 165 | extern struct pmfs_blocknode *pmfs_alloc_blocknode(struct super_block *sb); 166 | extern void pmfs_free_blocknode(struct 
super_block *sb, struct pmfs_blocknode *bnode); 167 | extern void pmfs_init_blockmap(struct super_block *sb, 168 | unsigned long init_used_size); 169 | extern void pmfs_free_block(struct super_block *sb, unsigned long blocknr, 170 | unsigned short btype); 171 | extern void __pmfs_free_block(struct super_block *sb, unsigned long blocknr, 172 | unsigned short btype, struct pmfs_blocknode **start_hint); 173 | extern int pmfs_new_block(struct super_block *sb, unsigned long *blocknr, 174 | unsigned short btype, int zero); 175 | extern unsigned long pmfs_count_free_blocks(struct super_block *sb); 176 | 177 | /* dir.c */ 178 | extern int pmfs_add_entry(pmfs_transaction_t *trans, 179 | struct dentry *dentry, struct inode *inode); 180 | extern int pmfs_remove_entry(pmfs_transaction_t *trans, 181 | struct dentry *dentry, struct inode *inode); 182 | 183 | /* namei.c */ 184 | extern struct dentry *pmfs_get_parent(struct dentry *child); 185 | 186 | /* inode.c */ 187 | extern unsigned int pmfs_free_inode_subtree(struct super_block *sb, 188 | __le64 root, u32 height, u32 btype, unsigned long last_blocknr); 189 | extern int __pmfs_alloc_blocks(pmfs_transaction_t *trans, 190 | struct super_block *sb, struct pmfs_inode *pi, 191 | unsigned long file_blocknr, unsigned int num, bool zero); 192 | extern int pmfs_init_inode_table(struct super_block *sb); 193 | extern int pmfs_alloc_blocks(pmfs_transaction_t *trans, struct inode *inode, 194 | unsigned long file_blocknr, unsigned int num, bool zero); 195 | extern u64 pmfs_find_data_block(struct inode *inode, 196 | unsigned long file_blocknr); 197 | int pmfs_set_blocksize_hint(struct super_block *sb, struct pmfs_inode *pi, 198 | loff_t new_size); 199 | void pmfs_setsize(struct inode *inode, loff_t newsize); 200 | 201 | extern struct inode *pmfs_iget(struct super_block *sb, unsigned long ino); 202 | extern void pmfs_put_inode(struct inode *inode); 203 | extern void pmfs_evict_inode(struct inode *inode); 204 | extern struct inode 
*pmfs_new_inode(pmfs_transaction_t *trans, 205 | struct inode *dir, umode_t mode, const struct qstr *qstr); 206 | extern void pmfs_update_isize(struct inode *inode, struct pmfs_inode *pi); 207 | extern void pmfs_update_nlink(struct inode *inode, struct pmfs_inode *pi); 208 | extern void pmfs_update_time(struct inode *inode, struct pmfs_inode *pi); 209 | extern int pmfs_write_inode(struct inode *inode, struct writeback_control *wbc); 210 | extern void pmfs_dirty_inode(struct inode *inode, int flags); 211 | extern int pmfs_notify_change(struct dentry *dentry, struct iattr *attr); 212 | int pmfs_getattr(const struct path *path, struct kstat *stat, 213 | u32 request_mask, unsigned int flags); 214 | extern void pmfs_set_inode_flags(struct inode *inode, struct pmfs_inode *pi); 215 | extern void pmfs_get_inode_flags(struct inode *inode, struct pmfs_inode *pi); 216 | extern unsigned long pmfs_find_region(struct inode *inode, loff_t *offset, 217 | int hole); 218 | extern void pmfs_truncate_del(struct inode *inode); 219 | extern void pmfs_truncate_add(struct inode *inode, u64 truncate_size); 220 | 221 | /* ioctl.c */ 222 | extern long pmfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); 223 | #ifdef CONFIG_COMPAT 224 | extern long pmfs_compat_ioctl(struct file *file, unsigned int cmd, 225 | unsigned long arg); 226 | #endif 227 | 228 | /* super.c */ 229 | #ifdef CONFIG_PMFS_TEST 230 | extern struct pmfs_super_block *get_pmfs_super(void); 231 | #endif 232 | extern void __pmfs_free_blocknode(struct pmfs_blocknode *bnode); 233 | extern struct super_block *pmfs_read_super(struct super_block *sb, void *data, 234 | int silent); 235 | extern int pmfs_statfs(struct dentry *d, struct kstatfs *buf); 236 | extern int pmfs_remount(struct super_block *sb, int *flags, char *data); 237 | 238 | /* symlink.c */ 239 | extern int pmfs_block_symlink(struct inode *inode, const char *symname, 240 | int len); 241 | 242 | /* Inline functions start here */ 243 | 244 | /* Mask out flags 
that are inappropriate for the given type of inode. */ 245 | static inline __le32 pmfs_mask_flags(umode_t mode, __le32 flags) 246 | { 247 | flags &= cpu_to_le32(PMFS_FL_INHERITED); 248 | if (S_ISDIR(mode)) 249 | return flags; 250 | else if (S_ISREG(mode)) 251 | return flags & cpu_to_le32(PMFS_REG_FLMASK); 252 | else 253 | return flags & cpu_to_le32(PMFS_OTHER_FLMASK); 254 | } 255 | 256 | static inline int pmfs_calc_checksum(u8 *data, int n) 257 | { 258 | u16 crc = 0; 259 | 260 | crc = crc16(~0, (__u8 *)data + sizeof(__le16), n - sizeof(__le16)); 261 | if (*((__le16 *)data) == cpu_to_le16(crc)) 262 | return 0; 263 | else 264 | return 1; 265 | } 266 | 267 | struct pmfs_blocknode_lowhigh { 268 | __le64 block_low; 269 | __le64 block_high; 270 | }; 271 | 272 | struct pmfs_blocknode { 273 | struct list_head link; 274 | unsigned long block_low; 275 | unsigned long block_high; 276 | }; 277 | 278 | struct pmfs_inode_info { 279 | __u32 i_dir_start_lookup; 280 | struct list_head i_truncated; 281 | struct inode vfs_inode; 282 | }; 283 | 284 | /* 285 | * PMFS super-block data in memory 286 | */ 287 | struct pmfs_sb_info { 288 | /* 289 | * base physical and virtual address of PMFS (which is also 290 | * the pointer to the super block) 291 | */ 292 | struct block_device *s_bdev; 293 | phys_addr_t phys_addr; 294 | void *virt_addr; 295 | struct list_head block_inuse_head; 296 | unsigned long block_start; 297 | unsigned long block_end; 298 | unsigned long num_free_blocks; 299 | struct mutex s_lock; /* protects the SB's buffer-head */ 300 | 301 | /* 302 | * Backing store option: 303 | * 1 = no load, 2 = no store, 304 | * else do both 305 | */ 306 | unsigned int pmfs_backing_option; 307 | 308 | /* Mount options */ 309 | unsigned long bpi; 310 | unsigned long num_inodes; 311 | unsigned long blocksize; 312 | unsigned long initsize; 313 | unsigned long s_mount_opt; 314 | kuid_t uid; /* Mount uid for root directory */ 315 | kgid_t gid; /* Mount gid for root directory */ 316 | umode_t 
mode; /* Mount mode for root directory */ 317 | atomic_t next_generation; 318 | /* inode tracking */ 319 | struct mutex inode_table_mutex; 320 | unsigned int s_inodes_count; /* total inodes count (used or free) */ 321 | unsigned int s_free_inodes_count; /* free inodes count */ 322 | unsigned int s_inodes_used_count; 323 | unsigned int s_free_inode_hint; 324 | 325 | unsigned long num_blocknode_allocated; 326 | 327 | /* Journaling related structures */ 328 | uint32_t next_transaction_id; 329 | uint32_t jsize; 330 | void *journal_base_addr; 331 | struct mutex journal_mutex; 332 | struct task_struct *log_cleaner_thread; 333 | wait_queue_head_t log_cleaner_wait; 334 | bool redo_log; 335 | 336 | /* truncate list related structures */ 337 | struct list_head s_truncate; 338 | struct mutex s_truncate_lock; 339 | }; 340 | 341 | static inline struct pmfs_sb_info *PMFS_SB(struct super_block *sb) 342 | { 343 | return sb->s_fs_info; 344 | } 345 | 346 | static inline struct pmfs_inode_info *PMFS_I(struct inode *inode) 347 | { 348 | return container_of(inode, struct pmfs_inode_info, vfs_inode); 349 | } 350 | 351 | /* If this is part of a read-modify-write of the super block, 352 | * pmfs_memunlock_super() before calling! 
*/ 353 | static inline struct pmfs_super_block *pmfs_get_super(struct super_block *sb) 354 | { 355 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 356 | 357 | return (struct pmfs_super_block *)sbi->virt_addr; 358 | } 359 | 360 | static inline pmfs_journal_t *pmfs_get_journal(struct super_block *sb) 361 | { 362 | struct pmfs_super_block *ps = pmfs_get_super(sb); 363 | 364 | return (pmfs_journal_t *)((char *)ps + 365 | le64_to_cpu(ps->s_journal_offset)); 366 | } 367 | 368 | static inline struct pmfs_inode *pmfs_get_inode_table(struct super_block *sb) 369 | { 370 | struct pmfs_super_block *ps = pmfs_get_super(sb); 371 | 372 | return (struct pmfs_inode *)((char *)ps + 373 | le64_to_cpu(ps->s_inode_table_offset)); 374 | } 375 | 376 | static inline struct pmfs_super_block *pmfs_get_redund_super(struct super_block *sb) 377 | { 378 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 379 | 380 | return (struct pmfs_super_block *)(sbi->virt_addr + PMFS_SB_SIZE); 381 | } 382 | 383 | /* If this is part of a read-modify-write of the block, 384 | * pmfs_memunlock_block() before calling! */ 385 | static inline void *pmfs_get_block(struct super_block *sb, u64 block) 386 | { 387 | struct pmfs_super_block *ps = pmfs_get_super(sb); 388 | 389 | return block ? 
((void *)ps + block) : NULL; 390 | } 391 | 392 | /* uses CPU instructions to atomically write up to 8 bytes */ 393 | static inline void pmfs_memcpy_atomic (void *dst, const void *src, u8 size) 394 | { 395 | switch (size) { 396 | case 1: { 397 | volatile u8 *daddr = dst; 398 | const u8 *saddr = src; 399 | *daddr = *saddr; 400 | break; 401 | } 402 | case 2: { 403 | volatile __le16 *daddr = dst; 404 | const u16 *saddr = src; 405 | *daddr = cpu_to_le16(*saddr); 406 | break; 407 | } 408 | case 4: { 409 | volatile __le32 *daddr = dst; 410 | const u32 *saddr = src; 411 | *daddr = cpu_to_le32(*saddr); 412 | break; 413 | } 414 | case 8: { 415 | volatile __le64 *daddr = dst; 416 | const u64 *saddr = src; 417 | *daddr = cpu_to_le64(*saddr); 418 | break; 419 | } 420 | default: 421 | pmfs_dbg("error: memcpy_atomic called with %d bytes\n", size); 422 | //BUG(); 423 | } 424 | } 425 | 426 | static inline void pmfs_update_time_and_size(struct inode *inode, 427 | struct pmfs_inode *pi) 428 | { 429 | __le32 words[2]; 430 | __le64 new_pi_size = cpu_to_le64(i_size_read(inode)); 431 | 432 | /* pi->i_size, pi->i_ctime, and pi->i_mtime need to be atomically updated. 433 | * So use cmpxchg16b here. */ 434 | words[0] = cpu_to_le32(inode->i_ctime.tv_sec); 435 | words[1] = cpu_to_le32(inode->i_mtime.tv_sec); 436 | /* TODO: the following function assumes cmpxchg16b instruction writes 437 | * 16 bytes atomically. Confirm if it is really true. 
*/ 438 | cmpxchg_double_local(&pi->i_size, (u64 *)&pi->i_ctime, pi->i_size, 439 | *(u64 *)&pi->i_ctime, new_pi_size, *(u64 *)words); 440 | } 441 | 442 | /* assumes the length to be 4-byte aligned */ 443 | static inline void memset_nt(void *dest, uint32_t dword, size_t length) 444 | { 445 | uint64_t dummy1, dummy2; 446 | uint64_t qword = ((uint64_t)dword << 32) | dword; 447 | 448 | asm volatile ("movl %%edx,%%ecx\n" 449 | "andl $63,%%edx\n" 450 | "shrl $6,%%ecx\n" 451 | "jz 9f\n" 452 | "1: movnti %%rax,(%%rdi)\n" 453 | "2: movnti %%rax,1*8(%%rdi)\n" 454 | "3: movnti %%rax,2*8(%%rdi)\n" 455 | "4: movnti %%rax,3*8(%%rdi)\n" 456 | "5: movnti %%rax,4*8(%%rdi)\n" 457 | "8: movnti %%rax,5*8(%%rdi)\n" 458 | "7: movnti %%rax,6*8(%%rdi)\n" 459 | "8: movnti %%rax,7*8(%%rdi)\n" 460 | "leaq 64(%%rdi),%%rdi\n" 461 | "decl %%ecx\n" 462 | "jnz 1b\n" 463 | "9: movl %%edx,%%ecx\n" 464 | "andl $7,%%edx\n" 465 | "shrl $3,%%ecx\n" 466 | "jz 11f\n" 467 | "10: movnti %%rax,(%%rdi)\n" 468 | "leaq 8(%%rdi),%%rdi\n" 469 | "decl %%ecx\n" 470 | "jnz 10b\n" 471 | "11: movl %%edx,%%ecx\n" 472 | "shrl $2,%%ecx\n" 473 | "jz 12f\n" 474 | "movnti %%eax,(%%rdi)\n" 475 | "12:\n" 476 | : "=D"(dummy1), "=d" (dummy2) : "D" (dest), "a" (qword), "d" (length) : "memory", "rcx"); 477 | } 478 | 479 | static inline u64 __pmfs_find_data_block(struct super_block *sb, 480 | struct pmfs_inode *pi, unsigned long blocknr) 481 | { 482 | __le64 *level_ptr; 483 | u64 bp = 0; 484 | u32 height, bit_shift; 485 | unsigned int idx; 486 | 487 | height = pi->height; 488 | bp = le64_to_cpu(pi->root); 489 | 490 | while (height > 0) { 491 | level_ptr = pmfs_get_block(sb, bp); 492 | bit_shift = (height - 1) * META_BLK_SHIFT; 493 | idx = blocknr >> bit_shift; 494 | bp = le64_to_cpu(level_ptr[idx]); 495 | if (bp == 0) 496 | return 0; 497 | blocknr = blocknr & ((1 << bit_shift) - 1); 498 | height--; 499 | } 500 | return bp; 501 | } 502 | 503 | static inline unsigned int pmfs_inode_blk_shift (struct pmfs_inode *pi) 504 | { 505 | 
return blk_type_to_shift[pi->i_blk_type]; 506 | } 507 | 508 | static inline uint32_t pmfs_inode_blk_size (struct pmfs_inode *pi) 509 | { 510 | return blk_type_to_size[pi->i_blk_type]; 511 | } 512 | 513 | /* If this is part of a read-modify-write of the inode metadata, 514 | * pmfs_memunlock_inode() before calling! */ 515 | static inline struct pmfs_inode *pmfs_get_inode(struct super_block *sb, 516 | u64 ino) 517 | { 518 | struct pmfs_super_block *ps = pmfs_get_super(sb); 519 | struct pmfs_inode *inode_table = pmfs_get_inode_table(sb); 520 | u64 bp, block, ino_offset; 521 | 522 | if (ino == 0) 523 | return NULL; 524 | 525 | block = ino >> pmfs_inode_blk_shift(inode_table); 526 | bp = __pmfs_find_data_block(sb, inode_table, block); 527 | 528 | if (bp == 0) 529 | return NULL; 530 | ino_offset = (ino & (pmfs_inode_blk_size(inode_table) - 1)); 531 | return (struct pmfs_inode *)((void *)ps + bp + ino_offset); 532 | } 533 | 534 | static inline u64 535 | pmfs_get_addr_off(struct pmfs_sb_info *sbi, void *addr) 536 | { 537 | PMFS_ASSERT((addr >= sbi->virt_addr) && 538 | (addr < (sbi->virt_addr + sbi->initsize))); 539 | return (u64)(addr - sbi->virt_addr); 540 | } 541 | 542 | static inline u64 543 | pmfs_get_block_off(struct super_block *sb, unsigned long blocknr, 544 | unsigned short btype) 545 | { 546 | return (u64)blocknr << PAGE_SHIFT; 547 | } 548 | 549 | static inline unsigned long 550 | pmfs_get_numblocks(unsigned short btype) 551 | { 552 | unsigned long num_blocks; 553 | 554 | if (btype == PMFS_BLOCK_TYPE_4K) { 555 | num_blocks = 1; 556 | } else if (btype == PMFS_BLOCK_TYPE_2M) { 557 | num_blocks = 512; 558 | } else { 559 | //btype == PMFS_BLOCK_TYPE_1G 560 | num_blocks = 0x40000; 561 | } 562 | return num_blocks; 563 | } 564 | 565 | static inline unsigned long 566 | pmfs_get_blocknr(struct super_block *sb, u64 block, unsigned short btype) 567 | { 568 | return block >> PAGE_SHIFT; 569 | } 570 | 571 | static inline unsigned long pmfs_get_pfn(struct super_block *sb, u64 
block) 572 | { 573 | return (PMFS_SB(sb)->phys_addr + block) >> PAGE_SHIFT; 574 | } 575 | 576 | static inline int pmfs_is_mounting(struct super_block *sb) 577 | { 578 | struct pmfs_sb_info *sbi = (struct pmfs_sb_info *)sb->s_fs_info; 579 | return sbi->s_mount_opt & PMFS_MOUNT_MOUNTING; 580 | } 581 | 582 | static inline struct pmfs_inode_truncate_item * pmfs_get_truncate_item (struct 583 | super_block *sb, u64 ino) 584 | { 585 | struct pmfs_inode *pi = pmfs_get_inode(sb, ino); 586 | return (struct pmfs_inode_truncate_item *)(pi + 1); 587 | } 588 | 589 | static inline struct pmfs_inode_truncate_item * pmfs_get_truncate_list_head ( 590 | struct super_block *sb) 591 | { 592 | struct pmfs_inode *pi = pmfs_get_inode_table(sb); 593 | return (struct pmfs_inode_truncate_item *)(pi + 1); 594 | } 595 | 596 | static inline void check_eof_blocks(struct super_block *sb, 597 | struct pmfs_inode *pi, loff_t size) 598 | { 599 | if ((pi->i_flags & cpu_to_le32(PMFS_EOFBLOCKS_FL)) && 600 | (size + sb->s_blocksize) > (le64_to_cpu(pi->i_blocks) 601 | << sb->s_blocksize_bits)) 602 | pi->i_flags &= cpu_to_le32(~PMFS_EOFBLOCKS_FL); 603 | } 604 | 605 | #include "wprotect.h" 606 | 607 | /* 608 | * Inodes and files operations 609 | */ 610 | 611 | /* dir.c */ 612 | extern const struct file_operations pmfs_dir_operations; 613 | 614 | /* file.c */ 615 | extern const struct inode_operations pmfs_file_inode_operations; 616 | extern const struct file_operations pmfs_xip_file_operations; 617 | int pmfs_fsync(struct file *file, loff_t start, loff_t end, int datasync); 618 | 619 | /* inode.c */ 620 | extern const struct address_space_operations pmfs_aops_xip; 621 | 622 | /* bbuild.c */ 623 | void pmfs_save_blocknode_mappings(struct super_block *sb); 624 | 625 | /* namei.c */ 626 | extern const struct inode_operations pmfs_dir_inode_operations; 627 | extern const struct inode_operations pmfs_special_inode_operations; 628 | 629 | /* symlink.c */ 630 | extern const struct inode_operations 
pmfs_symlink_inode_operations; 631 | 632 | int pmfs_check_integrity(struct super_block *sb, 633 | struct pmfs_super_block *super); 634 | void *pmfs_ioremap(struct super_block *sb, phys_addr_t phys_addr, 635 | ssize_t size); 636 | 637 | int pmfs_check_dir_entry(const char *function, struct inode *dir, 638 | struct pmfs_direntry *de, u8 *base, 639 | unsigned long offset); 640 | 641 | static inline int pmfs_match(int len, const char *const name, 642 | struct pmfs_direntry *de) 643 | { 644 | if (len == de->name_len && de->ino && !memcmp(de->name, name, len)) 645 | return 1; 646 | return 0; 647 | } 648 | 649 | int pmfs_search_dirblock(u8 *blk_base, struct inode *dir, struct qstr *child, 650 | unsigned long offset, 651 | struct pmfs_direntry **res_dir, 652 | struct pmfs_direntry **prev_dir); 653 | 654 | /* pmfs_stats.c */ 655 | #define PMFS_PRINT_TIMING 0xBCD00010 656 | #define PMFS_CLEAR_STATS 0xBCD00011 657 | void pmfs_print_timing_stats(void); 658 | void pmfs_clear_stats(void); 659 | 660 | #endif /* __PMFS_H */ 661 | -------------------------------------------------------------------------------- /namei.c: -------------------------------------------------------------------------------- 1 | /* 2 | * BRIEF DESCRIPTION 3 | * 4 | * Inode operations for directories. 5 | * 6 | * Copyright 2012-2013 Intel Corporation 7 | * Copyright 2009-2011 Marco Stornelli 8 | * Copyright 2003 Sony Corporation 9 | * Copyright 2003 Matsushita Electric Industrial Co., Ltd. 10 | * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam 11 | * This file is licensed under the terms of the GNU General Public 12 | * License version 2. This program is licensed "as is" without any 13 | * warranty of any kind, whether express or implied. 14 | */ 15 | #include 16 | #include 17 | #include "pmfs.h" 18 | #include "xip.h" 19 | 20 | /* 21 | * Couple of helper functions - make the code slightly cleaner. 
22 | */ 23 | static inline void pmfs_inc_count(struct inode *inode, struct pmfs_inode *pi) 24 | { 25 | inc_nlink(inode); 26 | pmfs_update_nlink(inode, pi); 27 | } 28 | 29 | static inline void pmfs_dec_count(struct inode *inode, struct pmfs_inode *pi) 30 | { 31 | if (inode->i_nlink) { 32 | drop_nlink(inode); 33 | pmfs_update_nlink(inode, pi); 34 | } 35 | } 36 | 37 | static inline int pmfs_add_nondir(pmfs_transaction_t *trans, 38 | struct inode *dir, struct dentry *dentry, struct inode *inode) 39 | { 40 | struct pmfs_inode *pi; 41 | int err = pmfs_add_entry(trans, dentry, inode); 42 | 43 | if (!err) { 44 | d_instantiate(dentry, inode); 45 | unlock_new_inode(inode); 46 | return 0; 47 | } 48 | pi = pmfs_get_inode(inode->i_sb, inode->i_ino); 49 | pmfs_dec_count(inode, pi); 50 | unlock_new_inode(inode); 51 | iput(inode); 52 | return err; 53 | } 54 | 55 | static inline struct pmfs_direntry *pmfs_next_entry(struct pmfs_direntry *p) 56 | { 57 | return (struct pmfs_direntry *)((char *)p + le16_to_cpu(p->de_len)); 58 | } 59 | 60 | /* 61 | * Methods themselves. 
62 | */ 63 | int pmfs_check_dir_entry(const char *function, struct inode *dir, 64 | struct pmfs_direntry *de, u8 *base, 65 | unsigned long offset) 66 | { 67 | const char *error_msg = NULL; 68 | const int rlen = le16_to_cpu(de->de_len); 69 | 70 | if (unlikely(rlen < PMFS_DIR_REC_LEN(1))) 71 | error_msg = "de_len is smaller than minimal"; 72 | else if (unlikely(rlen % 4 != 0)) 73 | error_msg = "de_len % 4 != 0"; 74 | else if (unlikely(rlen < PMFS_DIR_REC_LEN(de->name_len))) 75 | error_msg = "de_len is too small for name_len"; 76 | else if (unlikely((((u8 *)de - base) + rlen > dir->i_sb->s_blocksize))) 77 | error_msg = "directory entry across blocks"; 78 | 79 | if (unlikely(error_msg != NULL)) { 80 | pmfs_dbg("bad entry in directory #%lu: %s - " 81 | "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", 82 | dir->i_ino, error_msg, offset, 83 | (unsigned long)le64_to_cpu(de->ino), rlen, 84 | de->name_len); 85 | } 86 | 87 | return error_msg == NULL ? 1 : 0; 88 | } 89 | 90 | /* 91 | * Returns 0 if not found, -1 on failure, and 1 on success 92 | */ 93 | int pmfs_search_dirblock(u8 *blk_base, struct inode *dir, struct qstr *child, 94 | unsigned long offset, 95 | struct pmfs_direntry **res_dir, 96 | struct pmfs_direntry **prev_dir) 97 | { 98 | struct pmfs_direntry *de; 99 | struct pmfs_direntry *pde = NULL; 100 | char *dlimit; 101 | int de_len; 102 | const char *name = child->name; 103 | int namelen = child->len; 104 | 105 | de = (struct pmfs_direntry *)blk_base; 106 | dlimit = blk_base + dir->i_sb->s_blocksize; 107 | while ((char *)de < dlimit) { 108 | /* this code is executed quadratically often */ 109 | /* do minimal checking `by hand' */ 110 | 111 | if ((char *)de + namelen <= dlimit && 112 | pmfs_match(namelen, name, de)) { 113 | /* found a match - just to be sure, do a full check */ 114 | if (!pmfs_check_dir_entry("pmfs_inode_by_name", 115 | dir, de, blk_base, offset)) 116 | return -1; 117 | *res_dir = de; 118 | if (prev_dir) 119 | *prev_dir = pde; 120 | return 1; 121 | } 
122 | /* prevent looping on a bad block */ 123 | de_len = le16_to_cpu(de->de_len); 124 | if (de_len <= 0) 125 | return -1; 126 | offset += de_len; 127 | pde = de; 128 | de = (struct pmfs_direntry *)((char *)de + de_len); 129 | } 130 | return 0; 131 | } 132 | 133 | static ino_t pmfs_inode_by_name(struct inode *dir, struct qstr *entry, 134 | struct pmfs_direntry **res_entry) 135 | { 136 | struct pmfs_inode *pi; 137 | ino_t i_no = 0; 138 | int namelen, nblocks, i; 139 | u8 *blk_base; 140 | const u8 *name = entry->name; 141 | struct super_block *sb = dir->i_sb; 142 | unsigned long block, start; 143 | struct pmfs_inode_info *si = PMFS_I(dir); 144 | 145 | pi = pmfs_get_inode(sb, dir->i_ino); 146 | 147 | namelen = entry->len; 148 | if (namelen > PMFS_NAME_LEN) 149 | return 0; 150 | if ((namelen <= 2) && (name[0] == '.') && 151 | (name[1] == '.' || name[1] == 0)) { 152 | /* 153 | * "." or ".." will only be in the first block 154 | */ 155 | block = start = 0; 156 | nblocks = 1; 157 | goto restart; 158 | } 159 | nblocks = dir->i_size >> dir->i_sb->s_blocksize_bits; 160 | start = si->i_dir_start_lookup; 161 | if (start >= nblocks) 162 | start = 0; 163 | block = start; 164 | restart: 165 | do { 166 | blk_base = 167 | pmfs_get_block(sb, pmfs_find_data_block(dir, block)); 168 | if (!blk_base) 169 | goto done; 170 | i = pmfs_search_dirblock(blk_base, dir, entry, 171 | block << sb->s_blocksize_bits, 172 | res_entry, NULL); 173 | if (i == 1) { 174 | si->i_dir_start_lookup = block; 175 | i_no = le64_to_cpu((*res_entry)->ino); 176 | goto done; 177 | } else { 178 | if (i < 0) 179 | goto done; 180 | } 181 | if (++block >= nblocks) 182 | block = 0; 183 | } while (block != start); 184 | /* 185 | * If the directory has grown while we were searching, then 186 | * search the last part of the directory before giving up. 
187 | */ 188 | block = nblocks; 189 | nblocks = dir->i_size >> sb->s_blocksize_bits; 190 | if (block < nblocks) { 191 | start = 0; 192 | goto restart; 193 | } 194 | done: 195 | return i_no; 196 | } 197 | 198 | static struct dentry *pmfs_lookup(struct inode *dir, struct dentry *dentry, 199 | unsigned int flags) 200 | { 201 | struct inode *inode = NULL; 202 | struct pmfs_direntry *de; 203 | ino_t ino; 204 | 205 | if (dentry->d_name.len > PMFS_NAME_LEN) 206 | return ERR_PTR(-ENAMETOOLONG); 207 | 208 | ino = pmfs_inode_by_name(dir, &dentry->d_name, &de); 209 | if (ino) { 210 | inode = pmfs_iget(dir->i_sb, ino); 211 | if (inode == ERR_PTR(-ESTALE)) { 212 | pmfs_err(dir->i_sb, __func__, 213 | "deleted inode referenced: %lu", 214 | (unsigned long)ino); 215 | return ERR_PTR(-EIO); 216 | } 217 | } 218 | 219 | return d_splice_alias(inode, dentry); 220 | } 221 | 222 | /* 223 | * By the time this is called, we already have created 224 | * the directory cache entry for the new file, but it 225 | * is so far negative - it has no inode. 226 | * 227 | * If the create succeeds, we fill in the inode information 228 | * with d_instantiate(). 
229 | */ 230 | static int pmfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, 231 | bool excl) 232 | { 233 | struct inode *inode = NULL; 234 | int err = PTR_ERR(inode); 235 | struct super_block *sb = dir->i_sb; 236 | pmfs_transaction_t *trans; 237 | timing_t create_time; 238 | 239 | PMFS_START_TIMING(create_t, create_time); 240 | /* two log entries for new inode, 1 lentry for dir inode, 1 for dir 241 | * inode's b-tree, 2 lentries for logging dir entry 242 | */ 243 | trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES * 2 + 244 | MAX_DIRENTRY_LENTRIES); 245 | if (IS_ERR(trans)) { 246 | err = PTR_ERR(trans); 247 | goto out; 248 | } 249 | 250 | inode = pmfs_new_inode(trans, dir, mode, &dentry->d_name); 251 | if (IS_ERR(inode)) 252 | goto out_err; 253 | pmfs_dbg_verbose("%s: %s, ino %lu\n", __func__, 254 | dentry->d_name.name, inode->i_ino); 255 | inode->i_op = &pmfs_file_inode_operations; 256 | inode->i_mapping->a_ops = &pmfs_aops_xip; 257 | inode->i_fop = &pmfs_xip_file_operations; 258 | err = pmfs_add_nondir(trans, dir, dentry, inode); 259 | if (err) 260 | goto out_err; 261 | pmfs_commit_transaction(sb, trans); 262 | out: 263 | PMFS_END_TIMING(create_t, create_time); 264 | return err; 265 | out_err: 266 | pmfs_abort_transaction(sb, trans); 267 | return err; 268 | } 269 | 270 | static int pmfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, 271 | dev_t rdev) 272 | { 273 | struct inode *inode = NULL; 274 | int err = PTR_ERR(inode); 275 | pmfs_transaction_t *trans; 276 | struct super_block *sb = dir->i_sb; 277 | struct pmfs_inode *pi; 278 | 279 | /* 2 log entries for new inode, 1 lentry for dir inode, 1 for dir 280 | * inode's b-tree, 2 lentries for logging dir entry 281 | */ 282 | trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES * 2 + 283 | MAX_DIRENTRY_LENTRIES); 284 | if (IS_ERR(trans)) { 285 | err = PTR_ERR(trans); 286 | goto out; 287 | } 288 | 289 | inode = pmfs_new_inode(trans, dir, mode, &dentry->d_name); 290 | if 
(IS_ERR(inode)) 291 | goto out_err; 292 | init_special_inode(inode, mode, rdev); 293 | inode->i_op = &pmfs_special_inode_operations; 294 | 295 | pi = pmfs_get_inode(sb, inode->i_ino); 296 | if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) 297 | pi->dev.rdev = cpu_to_le32(inode->i_rdev); 298 | err = pmfs_add_nondir(trans, dir, dentry, inode); 299 | if (err) 300 | goto out_err; 301 | pmfs_commit_transaction(sb, trans); 302 | out: 303 | return err; 304 | out_err: 305 | pmfs_abort_transaction(sb, trans); 306 | return err; 307 | } 308 | 309 | static int pmfs_symlink(struct inode *dir, struct dentry *dentry, 310 | const char *symname) 311 | { 312 | struct super_block *sb = dir->i_sb; 313 | int err = -ENAMETOOLONG; 314 | unsigned len = strlen(symname); 315 | struct inode *inode; 316 | pmfs_transaction_t *trans; 317 | struct pmfs_inode *pi; 318 | 319 | if (len + 1 > sb->s_blocksize) 320 | goto out; 321 | 322 | /* 2 log entries for new inode, 1 lentry for dir inode, 1 for dir 323 | * inode's b-tree, 2 lentries for logging dir entry 324 | */ 325 | trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES * 2 + 326 | MAX_DIRENTRY_LENTRIES); 327 | if (IS_ERR(trans)) { 328 | err = PTR_ERR(trans); 329 | goto out; 330 | } 331 | 332 | inode = pmfs_new_inode(trans, dir, S_IFLNK|S_IRWXUGO, &dentry->d_name); 333 | err = PTR_ERR(inode); 334 | if (IS_ERR(inode)) { 335 | pmfs_abort_transaction(sb, trans); 336 | goto out; 337 | } 338 | 339 | inode->i_op = &pmfs_symlink_inode_operations; 340 | inode->i_mapping->a_ops = &pmfs_aops_xip; 341 | 342 | pi = pmfs_get_inode(sb, inode->i_ino); 343 | err = pmfs_block_symlink(inode, symname, len); 344 | if (err) 345 | goto out_fail; 346 | 347 | inode->i_size = len; 348 | pmfs_update_isize(inode, pi); 349 | 350 | err = pmfs_add_nondir(trans, dir, dentry, inode); 351 | if (err) { 352 | pmfs_abort_transaction(sb, trans); 353 | goto out; 354 | } 355 | 356 | pmfs_commit_transaction(sb, trans); 357 | out: 358 | return err; 359 | 360 | out_fail: 361 | 
pmfs_dec_count(inode, pi); 362 | unlock_new_inode(inode); 363 | iput(inode); 364 | pmfs_abort_transaction(sb, trans); 365 | goto out; 366 | } 367 | 368 | static int pmfs_link(struct dentry *dest_dentry, struct inode *dir, 369 | struct dentry *dentry) 370 | { 371 | struct inode *inode = dest_dentry->d_inode; 372 | int err = -ENOMEM; 373 | pmfs_transaction_t *trans; 374 | struct super_block *sb = inode->i_sb; 375 | struct pmfs_inode *pi = pmfs_get_inode(sb, inode->i_ino); 376 | 377 | if (inode->i_nlink >= PMFS_LINK_MAX) 378 | return -EMLINK; 379 | 380 | trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES * 2 + 381 | MAX_DIRENTRY_LENTRIES); 382 | if (IS_ERR(trans)) { 383 | err = PTR_ERR(trans); 384 | goto out; 385 | } 386 | /* only need to log the first 48 bytes since we only modify ctime and 387 | * i_links_count in this system call */ 388 | pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA); 389 | 390 | ihold(inode); 391 | 392 | err = pmfs_add_entry(trans, dentry, inode); 393 | if (!err) { 394 | inode->i_ctime = current_time(inode); 395 | inc_nlink(inode); 396 | 397 | pmfs_memunlock_inode(sb, pi); 398 | pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); 399 | pi->i_links_count = cpu_to_le16(inode->i_nlink); 400 | pmfs_memlock_inode(sb, pi); 401 | 402 | d_instantiate(dentry, inode); 403 | pmfs_commit_transaction(sb, trans); 404 | } else { 405 | iput(inode); 406 | pmfs_abort_transaction(sb, trans); 407 | } 408 | out: 409 | return err; 410 | } 411 | 412 | static int pmfs_unlink(struct inode *dir, struct dentry *dentry) 413 | { 414 | struct inode *inode = dentry->d_inode; 415 | int retval = -ENOMEM; 416 | pmfs_transaction_t *trans; 417 | struct super_block *sb = inode->i_sb; 418 | struct pmfs_inode *pi = pmfs_get_inode(sb, inode->i_ino); 419 | timing_t unlink_time; 420 | 421 | PMFS_START_TIMING(unlink_t, unlink_time); 422 | 423 | trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES * 2 + 424 | MAX_DIRENTRY_LENTRIES); 425 | if (IS_ERR(trans)) { 426 | retval = 
PTR_ERR(trans); 427 | goto out; 428 | } 429 | pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA); 430 | 431 | pmfs_dbg_verbose("%s: %s, ino %lu\n", __func__, 432 | dentry->d_name.name, inode->i_ino); 433 | retval = pmfs_remove_entry(trans, dentry, inode); 434 | if (retval) 435 | goto end_unlink; 436 | 437 | if (inode->i_nlink == 1) 438 | pmfs_truncate_add(inode, inode->i_size); 439 | inode->i_ctime = dir->i_ctime; 440 | 441 | pmfs_memunlock_inode(sb, pi); 442 | if (inode->i_nlink) { 443 | drop_nlink(inode); 444 | pi->i_links_count = cpu_to_le16(inode->i_nlink); 445 | } 446 | pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); 447 | pmfs_memlock_inode(sb, pi); 448 | 449 | pmfs_commit_transaction(sb, trans); 450 | PMFS_END_TIMING(unlink_t, unlink_time); 451 | return 0; 452 | end_unlink: 453 | pmfs_abort_transaction(sb, trans); 454 | out: 455 | return retval; 456 | } 457 | 458 | static int pmfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 459 | { 460 | struct inode *inode; 461 | struct pmfs_inode *pi, *pidir; 462 | struct pmfs_direntry *de = NULL; 463 | struct super_block *sb = dir->i_sb; 464 | pmfs_transaction_t *trans; 465 | int err = -EMLINK; 466 | char *blk_base; 467 | 468 | if (dir->i_nlink >= PMFS_LINK_MAX) 469 | goto out; 470 | 471 | trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES * 2 + 472 | MAX_DIRENTRY_LENTRIES); 473 | if (IS_ERR(trans)) { 474 | err = PTR_ERR(trans); 475 | goto out; 476 | } 477 | 478 | inode = pmfs_new_inode(trans, dir, S_IFDIR | mode, &dentry->d_name); 479 | err = PTR_ERR(inode); 480 | if (IS_ERR(inode)) { 481 | pmfs_abort_transaction(sb, trans); 482 | goto out; 483 | } 484 | 485 | pmfs_dbg_verbose("%s: %s, ino %lu\n", __func__, 486 | dentry->d_name.name, inode->i_ino); 487 | inode->i_op = &pmfs_dir_inode_operations; 488 | inode->i_fop = &pmfs_dir_operations; 489 | inode->i_mapping->a_ops = &pmfs_aops_xip; 490 | 491 | /* since this is a new inode so we don't need to include this 492 | * pmfs_alloc_blocks in 
the transaction 493 | */ 494 | err = pmfs_alloc_blocks(NULL, inode, 0, 1, false); 495 | if (err) 496 | goto out_clear_inode; 497 | inode->i_size = sb->s_blocksize; 498 | 499 | blk_base = pmfs_get_block(sb, pmfs_find_data_block(inode, 0)); 500 | de = (struct pmfs_direntry *)blk_base; 501 | pmfs_memunlock_range(sb, blk_base, sb->s_blocksize); 502 | de->ino = cpu_to_le64(inode->i_ino); 503 | de->name_len = 1; 504 | de->de_len = cpu_to_le16(PMFS_DIR_REC_LEN(de->name_len)); 505 | strcpy(de->name, "."); 506 | /*de->file_type = S_IFDIR; */ 507 | de = pmfs_next_entry(de); 508 | de->ino = cpu_to_le64(dir->i_ino); 509 | de->de_len = cpu_to_le16(sb->s_blocksize - PMFS_DIR_REC_LEN(1)); 510 | de->name_len = 2; 511 | strcpy(de->name, ".."); 512 | /*de->file_type = S_IFDIR; */ 513 | pmfs_memlock_range(sb, blk_base, sb->s_blocksize); 514 | 515 | /* No need to journal the dir entries but we need to persist them */ 516 | pmfs_flush_buffer(blk_base, PMFS_DIR_REC_LEN(1) + 517 | PMFS_DIR_REC_LEN(2), true); 518 | 519 | set_nlink(inode, 2); 520 | 521 | err = pmfs_add_entry(trans, dentry, inode); 522 | if (err) { 523 | pmfs_dbg_verbose("failed to add dir entry\n"); 524 | goto out_clear_inode; 525 | } 526 | pi = pmfs_get_inode(sb, inode->i_ino); 527 | pmfs_memunlock_inode(sb, pi); 528 | pi->i_links_count = cpu_to_le16(inode->i_nlink); 529 | pi->i_size = cpu_to_le64(inode->i_size); 530 | pmfs_memlock_inode(sb, pi); 531 | 532 | pidir = pmfs_get_inode(sb, dir->i_ino); 533 | pmfs_inc_count(dir, pidir); 534 | d_instantiate(dentry, inode); 535 | unlock_new_inode(inode); 536 | 537 | pmfs_commit_transaction(sb, trans); 538 | 539 | out: 540 | return err; 541 | 542 | out_clear_inode: 543 | clear_nlink(inode); 544 | unlock_new_inode(inode); 545 | iput(inode); 546 | pmfs_abort_transaction(sb, trans); 547 | goto out; 548 | } 549 | 550 | /* 551 | * routine to check that the specified directory is empty (for rmdir) 552 | */ 553 | static int pmfs_empty_dir(struct inode *inode) 554 | { 555 | unsigned long 
offset; 556 | struct pmfs_direntry *de, *de1; 557 | struct super_block *sb; 558 | char *blk_base; 559 | int err = 0; 560 | 561 | sb = inode->i_sb; 562 | if (inode->i_size < PMFS_DIR_REC_LEN(1) + PMFS_DIR_REC_LEN(2)) { 563 | pmfs_dbg("bad directory (dir #%lu)-no data block", 564 | inode->i_ino); 565 | return 1; 566 | } 567 | 568 | blk_base = pmfs_get_block(sb, pmfs_find_data_block(inode, 0)); 569 | if (!blk_base) { 570 | pmfs_dbg("bad directory (dir #%lu)-no data block", 571 | inode->i_ino); 572 | return 1; 573 | } 574 | 575 | de = (struct pmfs_direntry *)blk_base; 576 | de1 = pmfs_next_entry(de); 577 | 578 | if (le64_to_cpu(de->ino) != inode->i_ino || !le64_to_cpu(de1->ino) || 579 | strcmp(".", de->name) || strcmp("..", de1->name)) { 580 | pmfs_dbg("bad directory (dir #%lu) - no `.' or `..'", 581 | inode->i_ino); 582 | return 1; 583 | } 584 | offset = le16_to_cpu(de->de_len) + le16_to_cpu(de1->de_len); 585 | de = pmfs_next_entry(de1); 586 | while (offset < inode->i_size) { 587 | if (!blk_base || (void *)de >= (void *)(blk_base + 588 | sb->s_blocksize)) { 589 | err = 0; 590 | blk_base = pmfs_get_block(sb, pmfs_find_data_block( 591 | inode, offset >> sb->s_blocksize_bits)); 592 | if (!blk_base) { 593 | pmfs_dbg("Error: reading dir #%lu offset %lu\n", 594 | inode->i_ino, offset); 595 | offset += sb->s_blocksize; 596 | continue; 597 | } 598 | de = (struct pmfs_direntry *)blk_base; 599 | } 600 | if (!pmfs_check_dir_entry("empty_dir", inode, de, blk_base, 601 | offset)) { 602 | de = (struct pmfs_direntry *)(blk_base + 603 | sb->s_blocksize); 604 | offset = (offset | (sb->s_blocksize - 1)) + 1; 605 | continue; 606 | } 607 | if (le64_to_cpu(de->ino)) 608 | return 0; 609 | offset += le16_to_cpu(de->de_len); 610 | de = pmfs_next_entry(de); 611 | } 612 | return 1; 613 | } 614 | 615 | static int pmfs_rmdir(struct inode *dir, struct dentry *dentry) 616 | { 617 | struct inode *inode = dentry->d_inode; 618 | struct pmfs_direntry *de; 619 | pmfs_transaction_t *trans; 620 | struct 
super_block *sb = inode->i_sb; 621 | struct pmfs_inode *pi = pmfs_get_inode(sb, inode->i_ino), *pidir; 622 | int err = -ENOTEMPTY; 623 | 624 | if (!inode) 625 | return -ENOENT; 626 | 627 | pmfs_dbg_verbose("%s: %s, ino %lu\n", __func__, 628 | dentry->d_name.name, inode->i_ino); 629 | if (pmfs_inode_by_name(dir, &dentry->d_name, &de) == 0) 630 | return -ENOENT; 631 | 632 | if (!pmfs_empty_dir(inode)) 633 | return err; 634 | 635 | if (inode->i_nlink != 2) 636 | pmfs_dbg("empty directory has nlink!=2 (%d)", inode->i_nlink); 637 | 638 | trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES * 2 + 639 | MAX_DIRENTRY_LENTRIES); 640 | if (IS_ERR(trans)) { 641 | err = PTR_ERR(trans); 642 | return err; 643 | } 644 | pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA); 645 | 646 | err = pmfs_remove_entry(trans, dentry, inode); 647 | if (err) 648 | goto end_rmdir; 649 | 650 | /*inode->i_version++; */ 651 | clear_nlink(inode); 652 | inode->i_ctime = dir->i_ctime; 653 | 654 | pmfs_memunlock_inode(sb, pi); 655 | pi->i_links_count = cpu_to_le16(inode->i_nlink); 656 | pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); 657 | pmfs_memlock_inode(sb, pi); 658 | 659 | /* add the inode to truncate list in case a crash happens before the 660 | * subsequent evict_inode is called. It will be deleted from the 661 | * truncate list during evict_inode. 
662 | */ 663 | pmfs_truncate_add(inode, inode->i_size); 664 | 665 | pidir = pmfs_get_inode(sb, dir->i_ino); 666 | pmfs_dec_count(dir, pidir); 667 | 668 | pmfs_commit_transaction(sb, trans); 669 | return err; 670 | end_rmdir: 671 | pmfs_abort_transaction(sb, trans); 672 | return err; 673 | } 674 | 675 | static int pmfs_rename(struct inode *old_dir, 676 | struct dentry *old_dentry, 677 | struct inode *new_dir, struct dentry *new_dentry, 678 | unsigned int flags) 679 | { 680 | struct inode *old_inode = old_dentry->d_inode; 681 | struct inode *new_inode = new_dentry->d_inode; 682 | struct pmfs_direntry *new_de = NULL, *old_de = NULL; 683 | pmfs_transaction_t *trans; 684 | struct super_block *sb = old_inode->i_sb; 685 | struct pmfs_inode *pi, *new_pidir, *old_pidir; 686 | int err = -ENOENT; 687 | 688 | pmfs_inode_by_name(new_dir, &new_dentry->d_name, &new_de); 689 | pmfs_inode_by_name(old_dir, &old_dentry->d_name, &old_de); 690 | 691 | pmfs_dbg_verbose("%s: rename %s to %s\n", __func__, 692 | old_dentry->d_name.name, new_dentry->d_name.name); 693 | trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES * 4 + 694 | MAX_DIRENTRY_LENTRIES * 2); 695 | if (IS_ERR(trans)) { 696 | return PTR_ERR(trans); 697 | } 698 | 699 | if (new_inode) { 700 | err = -ENOTEMPTY; 701 | if (S_ISDIR(old_inode->i_mode) && !pmfs_empty_dir(new_inode)) 702 | goto out; 703 | } else { 704 | if (S_ISDIR(old_inode->i_mode)) { 705 | err = -EMLINK; 706 | if (new_dir->i_nlink >= PMFS_LINK_MAX) 707 | goto out; 708 | } 709 | } 710 | 711 | new_pidir = pmfs_get_inode(sb, new_dir->i_ino); 712 | 713 | pi = pmfs_get_inode(sb, old_inode->i_ino); 714 | pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA); 715 | 716 | if (!new_de) { 717 | /* link it into the new directory. 
*/ 718 | err = pmfs_add_entry(trans, new_dentry, old_inode); 719 | if (err) 720 | goto out; 721 | } else { 722 | pmfs_add_logentry(sb, trans, &new_de->ino, sizeof(new_de->ino), 723 | LE_DATA); 724 | 725 | pmfs_memunlock_range(sb, new_de, sb->s_blocksize); 726 | new_de->ino = cpu_to_le64(old_inode->i_ino); 727 | /*new_de->file_type = old_de->file_type; */ 728 | pmfs_memlock_range(sb, new_de, sb->s_blocksize); 729 | 730 | pmfs_add_logentry(sb, trans, new_pidir, MAX_DATA_PER_LENTRY, 731 | LE_DATA); 732 | /*new_dir->i_version++; */ 733 | new_dir->i_ctime = new_dir->i_mtime = current_time(new_dir); 734 | pmfs_update_time(new_dir, new_pidir); 735 | } 736 | 737 | /* and unlink the inode from the old directory ... */ 738 | err = pmfs_remove_entry(trans, old_dentry, old_inode); 739 | if (err) 740 | goto out; 741 | 742 | if (new_inode) { 743 | pi = pmfs_get_inode(sb, new_inode->i_ino); 744 | pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA); 745 | new_inode->i_ctime = current_time(new_inode); 746 | 747 | pmfs_memunlock_inode(sb, pi); 748 | if (S_ISDIR(old_inode->i_mode)) { 749 | if (new_inode->i_nlink) 750 | drop_nlink(new_inode); 751 | } 752 | pi->i_ctime = cpu_to_le32(new_inode->i_ctime.tv_sec); 753 | if (new_inode->i_nlink) 754 | drop_nlink(new_inode); 755 | pi->i_links_count = cpu_to_le16(new_inode->i_nlink); 756 | pmfs_memlock_inode(sb, pi); 757 | 758 | if (!new_inode->i_nlink) 759 | pmfs_truncate_add(new_inode, new_inode->i_size); 760 | } else { 761 | if (S_ISDIR(old_inode->i_mode)) { 762 | pmfs_inc_count(new_dir, new_pidir); 763 | old_pidir = pmfs_get_inode(sb, old_dir->i_ino); 764 | pmfs_dec_count(old_dir, old_pidir); 765 | } 766 | } 767 | 768 | pmfs_commit_transaction(sb, trans); 769 | return 0; 770 | out: 771 | pmfs_abort_transaction(sb, trans); 772 | return err; 773 | } 774 | 775 | struct dentry *pmfs_get_parent(struct dentry *child) 776 | { 777 | struct inode *inode; 778 | struct qstr dotdot = QSTR_INIT("..", 2); 779 | struct pmfs_direntry *de = 
NULL; 780 | ino_t ino; 781 | 782 | pmfs_inode_by_name(child->d_inode, &dotdot, &de); 783 | if (!de) 784 | return ERR_PTR(-ENOENT); 785 | ino = le64_to_cpu(de->ino); 786 | 787 | if (ino) 788 | inode = pmfs_iget(child->d_inode->i_sb, ino); 789 | else 790 | return ERR_PTR(-ENOENT); 791 | 792 | return d_obtain_alias(inode); 793 | } 794 | 795 | const struct inode_operations pmfs_dir_inode_operations = { 796 | .create = pmfs_create, 797 | .lookup = pmfs_lookup, 798 | .link = pmfs_link, 799 | .unlink = pmfs_unlink, 800 | .symlink = pmfs_symlink, 801 | .mkdir = pmfs_mkdir, 802 | .rmdir = pmfs_rmdir, 803 | .mknod = pmfs_mknod, 804 | .rename = pmfs_rename, 805 | .setattr = pmfs_notify_change, 806 | .get_acl = NULL, 807 | }; 808 | 809 | const struct inode_operations pmfs_special_inode_operations = { 810 | .setattr = pmfs_notify_change, 811 | .get_acl = NULL, 812 | }; 813 | -------------------------------------------------------------------------------- /journal.c: -------------------------------------------------------------------------------- 1 | /* 2 | * PMFS journaling facility. This file contains code to log changes to pmfs 3 | * meta-data to facilitate consistent meta-data updates against arbitrary 4 | * power and system failures. 5 | * 6 | * Persistent Memory File System 7 | * Copyright (c) 2012-2013, Intel Corporation. 8 | * 9 | * This program is free software; you can redistribute it and/or modify it 10 | * under the terms and conditions of the GNU General Public License, 11 | * version 2, as published by the Free Software Foundation. 12 | * 13 | * This program is distributed in the hope it will be useful, but WITHOUT 14 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 15 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 16 | * more details. 
17 | * 18 | * You should have received a copy of the GNU General Public License along with 19 | * this program; if not, write to the Free Software Foundation, Inc., 20 | * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 21 | */ 22 | 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include "pmfs.h" 33 | #include "journal.h" 34 | 35 | static void dump_transaction(struct pmfs_sb_info *sbi, 36 | pmfs_transaction_t *trans) 37 | { 38 | int i; 39 | pmfs_logentry_t *le = trans->start_addr; 40 | 41 | for (i = 0; i < trans->num_entries; i++) { 42 | pmfs_dbg_trans("ao %llx tid %x gid %x type %x sz %x\n", 43 | le->addr_offset, le->transaction_id, le->gen_id, 44 | le->type, le->size); 45 | le++; 46 | } 47 | } 48 | 49 | static inline uint32_t next_log_entry(uint32_t jsize, uint32_t le_off) 50 | { 51 | le_off = le_off + LOGENTRY_SIZE; 52 | if (le_off >= jsize) 53 | le_off = 0; 54 | return le_off; 55 | } 56 | 57 | static inline uint32_t prev_log_entry(uint32_t jsize, uint32_t le_off) 58 | { 59 | if (le_off == 0) 60 | le_off = jsize; 61 | le_off = le_off - LOGENTRY_SIZE; 62 | return le_off; 63 | } 64 | 65 | static inline uint16_t next_gen_id(uint16_t gen_id) 66 | { 67 | gen_id++; 68 | /* check for wraparound */ 69 | if (gen_id == 0) 70 | gen_id++; 71 | return gen_id; 72 | } 73 | 74 | static inline uint16_t prev_gen_id(uint16_t gen_id) 75 | { 76 | gen_id--; 77 | /* check for wraparound */ 78 | if (gen_id == 0) 79 | gen_id--; 80 | return gen_id; 81 | } 82 | 83 | /* Undo a valid log entry */ 84 | static inline void pmfs_undo_logentry(struct super_block *sb, 85 | pmfs_logentry_t *le) 86 | { 87 | char *data; 88 | 89 | if (le->size > 0) { 90 | data = pmfs_get_block(sb, le64_to_cpu(le->addr_offset)); 91 | /* Undo changes by flushing the log entry to pmfs */ 92 | pmfs_memunlock_range(sb, data, le->size); 93 | memcpy(data, le->data, le->size); 94 | pmfs_memlock_range(sb, data, le->size); 95 | 
pmfs_flush_buffer(data, le->size, false); 96 | } 97 | } 98 | 99 | /* can be called during journal recovery or transaction abort */ 100 | /* We need to Undo in the reverse order */ 101 | static void pmfs_undo_transaction(struct super_block *sb, 102 | pmfs_transaction_t *trans) 103 | { 104 | pmfs_logentry_t *le; 105 | int i; 106 | uint16_t gen_id = trans->gen_id; 107 | 108 | le = trans->start_addr + trans->num_used; 109 | le--; 110 | for (i = trans->num_used - 1; i >= 0; i--, le--) { 111 | if (gen_id == le16_to_cpu(le->gen_id)) 112 | pmfs_undo_logentry(sb, le); 113 | } 114 | } 115 | 116 | /* can be called by either during log cleaning or during journal recovery */ 117 | static void pmfs_flush_transaction(struct super_block *sb, 118 | pmfs_transaction_t *trans) 119 | { 120 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 121 | pmfs_logentry_t *le = trans->start_addr; 122 | int i; 123 | char *data; 124 | 125 | for (i = 0; i < trans->num_used; i++, le++) { 126 | if (le->size) { 127 | data = pmfs_get_block(sb,le64_to_cpu(le->addr_offset)); 128 | if (sbi->redo_log) { 129 | pmfs_memunlock_range(sb, data, le->size); 130 | memcpy(data, le->data, le->size); 131 | pmfs_memlock_range(sb, data, le->size); 132 | } else 133 | pmfs_flush_buffer(data, le->size, false); 134 | } 135 | } 136 | } 137 | 138 | static inline void invalidate_gen_id(pmfs_logentry_t *le) 139 | { 140 | le->gen_id = 0; 141 | pmfs_flush_buffer(le, LOGENTRY_SIZE, false); 142 | } 143 | 144 | /* can be called by either during log cleaning or during journal recovery */ 145 | static void pmfs_invalidate_logentries(struct super_block *sb, 146 | pmfs_transaction_t *trans) 147 | { 148 | pmfs_logentry_t *le = trans->start_addr; 149 | int i; 150 | 151 | pmfs_memunlock_range(sb, trans->start_addr, 152 | trans->num_entries * LOGENTRY_SIZE); 153 | for (i = 0; i < trans->num_entries; i++) { 154 | invalidate_gen_id(le); 155 | if (le->type == LE_START) { 156 | PERSISTENT_MARK(); 157 | PERSISTENT_BARRIER(); 158 | } 159 | le++; 160 | } 
161 | pmfs_memlock_range(sb, trans->start_addr, 162 | trans->num_entries * LOGENTRY_SIZE); 163 | } 164 | 165 | /* can be called by either during log cleaning or during journal recovery */ 166 | static void pmfs_redo_transaction(struct super_block *sb, 167 | pmfs_transaction_t *trans, bool recover) 168 | { 169 | pmfs_logentry_t *le = trans->start_addr; 170 | int i; 171 | uint16_t gen_id = trans->gen_id; 172 | char *data; 173 | 174 | for (i = 0; i < trans->num_entries; i++) { 175 | if (gen_id == le16_to_cpu(le->gen_id) && le->size > 0) { 176 | data = pmfs_get_block(sb,le64_to_cpu(le->addr_offset)); 177 | /* flush data if we are called during recovery */ 178 | if (recover) { 179 | pmfs_memunlock_range(sb, data, le->size); 180 | memcpy(data, le->data, le->size); 181 | pmfs_memlock_range(sb, data, le->size); 182 | } 183 | pmfs_flush_buffer(data, le->size, false); 184 | } 185 | le++; 186 | } 187 | } 188 | 189 | /* recover the transaction ending at a valid log entry *le */ 190 | /* called for Undo log and traverses the journal backward */ 191 | static uint32_t pmfs_recover_transaction(struct super_block *sb, uint32_t head, 192 | uint32_t tail, pmfs_logentry_t *le) 193 | { 194 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 195 | pmfs_transaction_t trans; 196 | bool cmt_or_abrt_found = false, start_found = false; 197 | uint16_t gen_id = le16_to_cpu(le->gen_id); 198 | 199 | memset(&trans, 0, sizeof(trans)); 200 | trans.transaction_id = le32_to_cpu(le->transaction_id); 201 | trans.gen_id = gen_id; 202 | 203 | do { 204 | trans.num_entries++; 205 | trans.num_used++; 206 | 207 | if (gen_id == le16_to_cpu(le->gen_id)) { 208 | /* Handle committed/aborted transactions */ 209 | if (le->type & LE_COMMIT || le->type & LE_ABORT) 210 | cmt_or_abrt_found = true; 211 | if (le->type & LE_START) { 212 | trans.start_addr = le; 213 | start_found = true; 214 | break; 215 | } 216 | } 217 | if (tail == 0 || tail == head) 218 | break; 219 | /* prev log entry */ 220 | le--; 221 | /* Handle uncommitted 
transactions */ 222 | if ((gen_id == le16_to_cpu(le->gen_id)) 223 | && (le->type & LE_COMMIT || le->type & LE_ABORT)) { 224 | BUG_ON(trans.transaction_id == 225 | le32_to_cpu(le->transaction_id)); 226 | le++; 227 | break; 228 | } 229 | tail = prev_log_entry(sbi->jsize, tail); 230 | } while (1); 231 | 232 | if (start_found && !cmt_or_abrt_found) 233 | pmfs_undo_transaction(sb, &trans); 234 | 235 | if (gen_id == MAX_GEN_ID) { 236 | if (!start_found) 237 | trans.start_addr = le; 238 | /* make sure the changes made by pmfs_undo_transaction() are 239 | * persistent before invalidating the log entries */ 240 | if (start_found && !cmt_or_abrt_found) { 241 | PERSISTENT_MARK(); 242 | PERSISTENT_BARRIER(); 243 | } 244 | pmfs_invalidate_logentries(sb, &trans); 245 | } 246 | return tail; 247 | } 248 | 249 | /* process the transaction starting at a valid log entry *le */ 250 | /* called by the log cleaner and journal recovery */ 251 | static uint32_t pmfs_process_transaction(struct super_block *sb, uint32_t head, 252 | uint32_t tail, pmfs_logentry_t *le, bool recover, int *processed) 253 | { 254 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 255 | pmfs_transaction_t trans; 256 | uint16_t gen_id; 257 | uint32_t new_head = head; 258 | int handled = 0; 259 | 260 | *processed = 0; 261 | gen_id = le16_to_cpu(le->gen_id); 262 | if (!(le->type & LE_START)) { 263 | pmfs_dbg("start of trans %x but LE_START not set. 
gen_id %d\n", 264 | le32_to_cpu(le->transaction_id), gen_id); 265 | return next_log_entry(sbi->jsize, new_head); 266 | } 267 | memset(&trans, 0, sizeof(trans)); 268 | trans.transaction_id = le32_to_cpu(le->transaction_id); 269 | trans.start_addr = le; 270 | trans.gen_id = gen_id; 271 | do { 272 | trans.num_entries++; 273 | trans.num_used++; 274 | new_head = next_log_entry(sbi->jsize, new_head); 275 | handled++; 276 | 277 | /* Handle committed/aborted transactions */ 278 | if ((gen_id == le16_to_cpu(le->gen_id)) && (le->type & LE_COMMIT 279 | || le->type & LE_ABORT)) { 280 | head = new_head; 281 | if ((le->type & LE_COMMIT) && sbi->redo_log) 282 | pmfs_redo_transaction(sb, &trans, recover); 283 | 284 | if (gen_id == MAX_GEN_ID) { 285 | if ((le->type & LE_COMMIT) && sbi->redo_log) { 286 | PERSISTENT_MARK(); 287 | PERSISTENT_BARRIER(); 288 | } 289 | pmfs_invalidate_logentries(sb, &trans); 290 | } 291 | break; 292 | } 293 | /* next log entry */ 294 | le++; 295 | /* Handle uncommitted transactions */ 296 | if ((new_head == tail) || ((gen_id == le16_to_cpu(le->gen_id)) 297 | && (le->type & LE_START))) { 298 | /* found a new valid transaction w/o finding a commit */ 299 | if (recover) { 300 | /* if this function is called by recovery, move 301 | * ahead even if we didn't find a commit record 302 | * for this transaction */ 303 | head = new_head; 304 | if (gen_id == MAX_GEN_ID) 305 | pmfs_invalidate_logentries(sb, &trans); 306 | } 307 | pmfs_dbg_trans("no cmt tid %d sa %p nle %d tail %x" 308 | " gen %d\n", 309 | trans.transaction_id,trans.start_addr,trans.num_entries, 310 | trans.num_used, trans.gen_id); 311 | /* dump_transaction(sbi, &trans); */ 312 | break; 313 | } 314 | } while (new_head != tail); 315 | 316 | *processed = handled; 317 | return head; 318 | } 319 | 320 | static int pmfs_clean_journal(struct super_block *sb, bool unmount, 321 | int take_lock) 322 | { 323 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 324 | pmfs_journal_t *journal = pmfs_get_journal(sb); 325 | 
uint32_t head; 326 | uint32_t new_head, tail; 327 | uint16_t gen_id; 328 | volatile __le64 *ptr_tail_genid; 329 | int processed = 0; 330 | int total = 0; 331 | u64 tail_genid; 332 | pmfs_logentry_t *le; 333 | 334 | if (take_lock) 335 | mutex_lock(&sbi->journal_mutex); 336 | head = le32_to_cpu(journal->head); 337 | ptr_tail_genid = (volatile __le64 *)&journal->tail; 338 | 339 | /* atomically read both tail and gen_id of journal. Normally use of 340 | * volatile is prohibited in kernel code but since we use volatile 341 | * to write to journal's tail and gen_id atomically, we thought we 342 | * should use volatile to read them simultaneously and avoid locking 343 | * them. */ 344 | tail_genid = le64_to_cpu(*ptr_tail_genid); 345 | tail = tail_genid & 0xFFFFFFFF; 346 | gen_id = (tail_genid >> 32) & 0xFFFF; 347 | 348 | /* journal wraparound happened. so head points to prev generation id */ 349 | if (tail < head) 350 | gen_id = prev_gen_id(gen_id); 351 | pmfs_dbg_trans("starting journal cleaning %x %x\n", head, tail); 352 | while (head != tail) { 353 | le = (pmfs_logentry_t *)(sbi->journal_base_addr + head); 354 | if (gen_id == le16_to_cpu(le->gen_id)) { 355 | /* found a valid log entry, process the transaction */ 356 | new_head = pmfs_process_transaction(sb, head, tail, 357 | le, false, &processed); 358 | total += processed; 359 | /* no progress was made. 
return */
		if (new_head == head)
			break;
		head = new_head;
	} else {
		/* entry from an older generation: invalidate it when the
		 * generation counter is about to wrap so it can never be
		 * mistaken for valid again */
		if (gen_id == MAX_GEN_ID) {
			pmfs_memunlock_range(sb, le, sizeof(*le));
			invalidate_gen_id(le);
			pmfs_memlock_range(sb, le, sizeof(*le));
		}
		head = next_log_entry(sbi->jsize, head);
	}
	/* handle journal wraparound */
	if (head == 0)
		gen_id = next_gen_id(gen_id);
	}
	PERSISTENT_MARK();
	PERSISTENT_BARRIER();
	/* persist the advanced head: everything before it is reclaimed */
	pmfs_memunlock_range(sb, journal, sizeof(*journal));
	journal->head = cpu_to_le32(head);
	pmfs_memlock_range(sb, journal, sizeof(*journal));
	pmfs_flush_buffer(&journal->head, sizeof(journal->head), true);
	if (unmount) {
		PERSISTENT_MARK();
		if (journal->head != journal->tail)
			pmfs_dbg("PMFS: umount but journal not empty %x:%x\n",
			le32_to_cpu(journal->head), le32_to_cpu(journal->tail));
		PERSISTENT_BARRIER();
	}
	pmfs_dbg_trans("leaving journal cleaning %x %x\n", head, tail);
	if (take_lock)
		mutex_unlock(&sbi->journal_mutex);
	return total;
}

/* Put the log cleaner thread to sleep until wakeup_log_cleaner() (or a
 * kthread stop request) wakes it. */
static void log_cleaner_try_sleeping(struct pmfs_sb_info *sbi)
{
	DEFINE_WAIT(wait);
	prepare_to_wait(&sbi->log_cleaner_wait, &wait, TASK_INTERRUPTIBLE);
	schedule();
	finish_wait(&sbi->log_cleaner_wait, &wait);
}

/*
 * Log cleaner kthread body: sleep until woken, then run a cleaning pass.
 * On stop, do one final pass with unmount semantics (warns if the
 * journal is still non-empty). arg is the mounted super_block.
 */
static int pmfs_log_cleaner(void *arg)
{
	struct super_block *sb = (struct super_block *)arg;
	struct pmfs_sb_info *sbi = PMFS_SB(sb);

	pmfs_dbg_trans("Running log cleaner thread\n");
	for ( ; ; ) {
		log_cleaner_try_sleeping(sbi);

		if (kthread_should_stop())
			break;

		pmfs_clean_journal(sb, false, 1);
	}
	pmfs_clean_journal(sb, true, 1);
	pmfs_dbg_trans("Exiting log cleaner thread\n");
	return 0;
}

/* Start the per-mount log cleaner kthread. Returns 0 on success, -1 on
 * failure (treated as fatal at mount time by the callers). */
static int pmfs_journal_cleaner_run(struct super_block *sb)
{
	int ret = 0;
	struct pmfs_sb_info *sbi = PMFS_SB(sb);

	init_waitqueue_head(&sbi->log_cleaner_wait);

	sbi->log_cleaner_thread = kthread_run(pmfs_log_cleaner, sb,
		"pmfs_log_cleaner_0x%llx", sbi->phys_addr);
	if (IS_ERR(sbi->log_cleaner_thread)) {
		/* failure at boot is fatal */
		pmfs_err(sb, "Failed to start pmfs log cleaner thread\n");
		ret = -1;
	}
	return ret;
}

/*
 * Rebuild the in-DRAM journal state (base address, size, redo/undo mode)
 * from the persistent journal descriptor and start the cleaner thread.
 * Called on every mount; hard init also funnels through here.
 */
int pmfs_journal_soft_init(struct super_block *sb)
{
	struct pmfs_sb_info *sbi = PMFS_SB(sb);
	pmfs_journal_t *journal = pmfs_get_journal(sb);

	sbi->next_transaction_id = 0;
	sbi->journal_base_addr = pmfs_get_block(sb,le64_to_cpu(journal->base));
	sbi->jsize = le32_to_cpu(journal->size);
	mutex_init(&sbi->journal_mutex);
	sbi->redo_log = !!le16_to_cpu(journal->redo_logging);

	return pmfs_journal_soft_init == NULL ? 0 : pmfs_journal_cleaner_run(sb);
}

/*
 * Format the persistent journal (mkfs/init path): write the descriptor
 * (base, size, gen_id=1, empty head/tail, undo mode), zero the log area
 * with non-temporal stores, then perform the soft init.
 */
int pmfs_journal_hard_init(struct super_block *sb, uint64_t base,
	uint32_t size)
{
	struct pmfs_sb_info *sbi = PMFS_SB(sb);
	pmfs_journal_t *journal = pmfs_get_journal(sb);

	pmfs_memunlock_range(sb, journal, sizeof(*journal));
	journal->base = cpu_to_le64(base);
	journal->size = cpu_to_le32(size);
	journal->gen_id = cpu_to_le16(1);
	journal->head = journal->tail = 0;
	/* lets do Undo logging for now */
	journal->redo_logging = 0;
	pmfs_memlock_range(sb, journal, sizeof(*journal));

	sbi->journal_base_addr = pmfs_get_block(sb, base);
	pmfs_memunlock_range(sb, sbi->journal_base_addr, size);
	memset_nt(sbi->journal_base_addr, 0, size);
	pmfs_memlock_range(sb, sbi->journal_base_addr, size);

	return pmfs_journal_soft_init(sb);
}

/* Wake the log cleaner thread if it is currently waiting. */
static void wakeup_log_cleaner(struct pmfs_sb_info *sbi)
{
	if (!waitqueue_active(&sbi->log_cleaner_wait))
		return;
	pmfs_dbg_trans("waking up the cleaner thread\n");
	wake_up_interruptible(&sbi->log_cleaner_wait);
}
482 | 483 | int pmfs_journal_uninit(struct super_block *sb) 484 | { 485 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 486 | 487 | if (sbi->log_cleaner_thread) 488 | kthread_stop(sbi->log_cleaner_thread); 489 | return 0; 490 | } 491 | 492 | inline pmfs_transaction_t *pmfs_current_transaction(void) 493 | { 494 | return (pmfs_transaction_t *)current->journal_info; 495 | } 496 | 497 | static int pmfs_free_logentries(struct super_block *sb, int max_log_entries) 498 | { 499 | int freed_entries = 0; 500 | 501 | freed_entries = pmfs_clean_journal(sb, false, 0); 502 | return LOGENTRY_SIZE * freed_entries; 503 | } 504 | 505 | pmfs_transaction_t *pmfs_new_transaction(struct super_block *sb, 506 | int max_log_entries) 507 | { 508 | pmfs_journal_t *journal = pmfs_get_journal(sb); 509 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 510 | pmfs_transaction_t *trans; 511 | uint32_t head, tail, req_size, avail_size, freed_size; 512 | uint64_t base; 513 | int retry = 0; 514 | timing_t log_time; 515 | #if 0 516 | trans = pmfs_current_transaction(); 517 | 518 | if (trans) { 519 | BUG_ON(trans->t_journal != journal); 520 | return trans; 521 | } 522 | #endif 523 | /* If it is an undo log, need one more log-entry for commit record */ 524 | PMFS_START_TIMING(new_trans_t, log_time); 525 | 526 | if (!sbi->redo_log) 527 | max_log_entries++; 528 | 529 | trans = pmfs_alloc_transaction(); 530 | if (!trans) 531 | return ERR_PTR(-ENOMEM); 532 | memset(trans, 0, sizeof(*trans)); 533 | 534 | trans->num_used = 0; 535 | trans->num_entries = max_log_entries; 536 | trans->t_journal = journal; 537 | req_size = max_log_entries << LESIZE_SHIFT; 538 | 539 | mutex_lock(&sbi->journal_mutex); 540 | 541 | tail = le32_to_cpu(journal->tail); 542 | head = le32_to_cpu(journal->head); 543 | trans->transaction_id = sbi->next_transaction_id++; 544 | again: 545 | trans->gen_id = le16_to_cpu(journal->gen_id); 546 | avail_size = (tail >= head) ? 
547 | (sbi->jsize - (tail - head)) : (head - tail); 548 | avail_size = avail_size - LOGENTRY_SIZE; 549 | 550 | if (avail_size < req_size) { 551 | /* run the log cleaner function to free some log entries */ 552 | freed_size = 0; 553 | for (retry = 0; retry < 3; retry++) { 554 | freed_size += pmfs_free_logentries(sb, 555 | max_log_entries); 556 | if ((avail_size + freed_size) >= req_size) 557 | break; 558 | } 559 | 560 | if ((avail_size + freed_size) < req_size) 561 | goto journal_full; 562 | } 563 | base = le64_to_cpu(journal->base) + tail; 564 | tail = tail + req_size; 565 | /* journal wraparound because of this transaction allocation. 566 | * start the transaction from the beginning of the journal so 567 | * that we don't have any wraparound within a transaction */ 568 | pmfs_memunlock_range(sb, journal, sizeof(*journal)); 569 | if (tail >= sbi->jsize) { 570 | u64 *ptr; 571 | tail = 0; 572 | ptr = (u64 *)&journal->tail; 573 | /* writing 8-bytes atomically setting tail to 0 */ 574 | set_64bit(ptr, (__force u64)cpu_to_le64((u64)next_gen_id( 575 | le16_to_cpu(journal->gen_id)) << 32)); 576 | pmfs_memlock_range(sb, journal, sizeof(*journal)); 577 | pmfs_dbg_trans("journal wrapped. 
tail %x gid %d cur tid %d\n", 578 | le32_to_cpu(journal->tail),le16_to_cpu(journal->gen_id), 579 | sbi->next_transaction_id - 1); 580 | goto again; 581 | } else { 582 | journal->tail = cpu_to_le32(tail); 583 | pmfs_memlock_range(sb, journal, sizeof(*journal)); 584 | } 585 | pmfs_flush_buffer(&journal->tail, sizeof(u64), false); 586 | mutex_unlock(&sbi->journal_mutex); 587 | 588 | avail_size = avail_size - req_size; 589 | /* wake up the log cleaner if required */ 590 | if ((sbi->jsize - avail_size) > (sbi->jsize >> 3)) 591 | wakeup_log_cleaner(sbi); 592 | 593 | pmfs_dbg_trans("new transaction tid %d nle %d avl sz %x sa %llx\n", 594 | trans->transaction_id, max_log_entries, avail_size, base); 595 | trans->start_addr = pmfs_get_block(sb, base); 596 | 597 | trans->parent = (pmfs_transaction_t *)current->journal_info; 598 | current->journal_info = trans; 599 | PMFS_END_TIMING(new_trans_t, log_time); 600 | return trans; 601 | journal_full: 602 | mutex_unlock(&sbi->journal_mutex); 603 | pmfs_err(sb, "Journal full. 
base %llx sz %x head:tail %x:%x ncl %x\n", 604 | le64_to_cpu(journal->base), le32_to_cpu(journal->size), 605 | le32_to_cpu(journal->head), le32_to_cpu(journal->tail), 606 | max_log_entries); 607 | pmfs_err(sb, "avail size %u, freed size %u, request size %u\n", 608 | avail_size, freed_size, req_size); 609 | pmfs_free_transaction(trans); 610 | PMFS_END_TIMING(new_trans_t, log_time); 611 | return ERR_PTR(-EAGAIN); 612 | } 613 | 614 | static inline void pmfs_commit_logentry(struct super_block *sb, 615 | pmfs_transaction_t *trans, pmfs_logentry_t *le) 616 | { 617 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 618 | if (sbi->redo_log) { 619 | /* Redo Log */ 620 | PERSISTENT_MARK(); 621 | PERSISTENT_BARRIER(); 622 | /* Atomically write the commit type */ 623 | le->type |= LE_COMMIT; 624 | barrier(); 625 | /* Atomically make the log entry valid */ 626 | le->gen_id = cpu_to_le16(trans->gen_id); 627 | pmfs_flush_buffer(le, LOGENTRY_SIZE, false); 628 | PERSISTENT_MARK(); 629 | PERSISTENT_BARRIER(); 630 | /* Update the FS in place */ 631 | pmfs_flush_transaction(sb, trans); 632 | } else { 633 | /* Undo Log */ 634 | /* Update the FS in place: currently already done. so 635 | * only need to clflush */ 636 | pmfs_flush_transaction(sb, trans); 637 | PERSISTENT_MARK(); 638 | PERSISTENT_BARRIER(); 639 | /* Atomically write the commit type */ 640 | le->type |= LE_COMMIT; 641 | barrier(); 642 | /* Atomically make the log entry valid */ 643 | le->gen_id = cpu_to_le16(trans->gen_id); 644 | pmfs_flush_buffer(le, LOGENTRY_SIZE, true); 645 | } 646 | } 647 | 648 | int pmfs_add_logentry(struct super_block *sb, 649 | pmfs_transaction_t *trans, void *addr, uint16_t size, u8 type) 650 | { 651 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 652 | pmfs_logentry_t *le; 653 | int num_les = 0, i; 654 | uint64_t le_start = size ? 
pmfs_get_addr_off(sbi, addr) : 0; 655 | uint8_t le_size; 656 | timing_t add_log_time; 657 | 658 | if (trans == NULL) 659 | return -EINVAL; 660 | 661 | PMFS_START_TIMING(add_log_t, add_log_time); 662 | le = trans->start_addr + trans->num_used; 663 | 664 | if (size == 0) { 665 | /* At least one log entry required for commit/abort log entry */ 666 | if ((type & LE_COMMIT) || (type & LE_ABORT)) 667 | num_les = 1; 668 | } else 669 | num_les = (size + sizeof(le->data) - 1)/sizeof(le->data); 670 | 671 | pmfs_dbg_trans("add le id %d size %x, num_les %d tail %x le %p\n", 672 | trans->transaction_id, size, trans->num_entries, 673 | trans->num_used, le); 674 | 675 | if ((trans->num_used + num_les) > trans->num_entries) { 676 | pmfs_err(sb, "Log Entry full. tid %x ne %x tail %x size %x\n", 677 | trans->transaction_id, trans->num_entries, 678 | trans->num_used, size); 679 | dump_transaction(sbi, trans); 680 | dump_stack(); 681 | return -ENOMEM; 682 | } 683 | 684 | pmfs_memunlock_range(sb, le, sizeof(*le) * num_les); 685 | for (i = 0; i < num_les; i++) { 686 | le->addr_offset = cpu_to_le64(le_start); 687 | le->transaction_id = cpu_to_le32(trans->transaction_id); 688 | le_size = (i == (num_les - 1)) ? 
size : sizeof(le->data); 689 | le->size = le_size; 690 | size -= le_size; 691 | if (le_size) 692 | memcpy(le->data, addr, le_size); 693 | le->type = type; 694 | 695 | if (i == 0 && trans->num_used == 0) 696 | le->type |= LE_START; 697 | trans->num_used++; 698 | 699 | /* handle special log entry */ 700 | if (i == (num_les - 1) && (type & LE_COMMIT)) { 701 | pmfs_commit_logentry(sb, trans, le); 702 | pmfs_memlock_range(sb, le, sizeof(*le) * num_les); 703 | PMFS_END_TIMING(add_log_t, add_log_time); 704 | return 0; 705 | } 706 | /* put a compile time barrier so that compiler doesn't reorder 707 | * the writes to the log entry */ 708 | barrier(); 709 | 710 | /* Atomically make the log entry valid */ 711 | le->gen_id = cpu_to_le16(trans->gen_id); 712 | pmfs_flush_buffer(le, LOGENTRY_SIZE, false); 713 | 714 | addr += le_size; 715 | le_start += le_size; 716 | le++; 717 | } 718 | pmfs_memlock_range(sb, le, sizeof(*le) * num_les); 719 | if (!sbi->redo_log) { 720 | PERSISTENT_MARK(); 721 | PERSISTENT_BARRIER(); 722 | } 723 | PMFS_END_TIMING(add_log_t, add_log_time); 724 | return 0; 725 | } 726 | 727 | int pmfs_commit_transaction(struct super_block *sb, 728 | pmfs_transaction_t *trans) 729 | { 730 | timing_t commit_time; 731 | 732 | if (trans == NULL) 733 | return 0; 734 | /* Add the commit log-entry */ 735 | pmfs_add_logentry(sb, trans, NULL, 0, LE_COMMIT); 736 | 737 | PMFS_START_TIMING(commit_trans_t, commit_time); 738 | pmfs_dbg_trans("completing transaction for id %d\n", 739 | trans->transaction_id); 740 | 741 | current->journal_info = trans->parent; 742 | pmfs_free_transaction(trans); 743 | PMFS_END_TIMING(commit_trans_t, commit_time); 744 | return 0; 745 | } 746 | 747 | int pmfs_abort_transaction(struct super_block *sb, pmfs_transaction_t *trans) 748 | { 749 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 750 | 751 | if (trans == NULL) 752 | return 0; 753 | pmfs_dbg_trans("abort trans for tid %x sa %p numle %d tail %x gen %d\n", 754 | trans->transaction_id, trans->start_addr, 
trans->num_entries, 755 | trans->num_used, trans->gen_id); 756 | dump_transaction(sbi, trans); 757 | /*dump_stack();*/ 758 | 759 | if (!sbi->redo_log) { 760 | /* Undo Log */ 761 | pmfs_undo_transaction(sb, trans); 762 | PERSISTENT_MARK(); 763 | PERSISTENT_BARRIER(); 764 | } 765 | /* add a abort log entry */ 766 | pmfs_add_logentry(sb, trans, NULL, 0, LE_ABORT); 767 | current->journal_info = trans->parent; 768 | pmfs_free_transaction(trans); 769 | return 0; 770 | } 771 | 772 | static void invalidate_remaining_journal(struct super_block *sb, 773 | void *journal_vaddr, uint32_t jtail, uint32_t jsize) 774 | { 775 | pmfs_logentry_t *le = (pmfs_logentry_t *)(journal_vaddr + jtail); 776 | void *start = le; 777 | 778 | pmfs_memunlock_range(sb, start, jsize - jtail); 779 | while (jtail < jsize) { 780 | invalidate_gen_id(le); 781 | le++; 782 | jtail += LOGENTRY_SIZE; 783 | } 784 | pmfs_memlock_range(sb, start, jsize - jtail); 785 | } 786 | 787 | /* we need to increase the gen_id to invalidate all the journal log 788 | * entries. This is because after the recovery, we may still have some 789 | * valid log entries beyond the tail (before power failure, they became 790 | * persistent before the journal tail could become persistent. 791 | * should gen_id and head be updated atomically? not necessarily? 
we 792 | * can update gen_id before journal head because gen_id and head are in 793 | * the same cacheline */ 794 | static void pmfs_forward_journal(struct super_block *sb, struct pmfs_sb_info 795 | *sbi, pmfs_journal_t *journal) 796 | { 797 | uint16_t gen_id = le16_to_cpu(journal->gen_id); 798 | /* handle gen_id wrap around */ 799 | if (gen_id == MAX_GEN_ID) { 800 | invalidate_remaining_journal(sb, sbi->journal_base_addr, 801 | le32_to_cpu(journal->tail), sbi->jsize); 802 | } 803 | PERSISTENT_MARK(); 804 | gen_id = next_gen_id(gen_id); 805 | /* make all changes persistent before advancing gen_id and head */ 806 | PERSISTENT_BARRIER(); 807 | pmfs_memunlock_range(sb, journal, sizeof(*journal)); 808 | journal->gen_id = cpu_to_le16(gen_id); 809 | barrier(); 810 | journal->head = journal->tail; 811 | pmfs_memlock_range(sb, journal, sizeof(*journal)); 812 | pmfs_flush_buffer(journal, sizeof(*journal), false); 813 | } 814 | 815 | static int pmfs_recover_undo_journal(struct super_block *sb) 816 | { 817 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 818 | pmfs_journal_t *journal = pmfs_get_journal(sb); 819 | uint32_t tail = le32_to_cpu(journal->tail); 820 | uint32_t head = le32_to_cpu(journal->head); 821 | uint16_t gen_id = le16_to_cpu(journal->gen_id); 822 | pmfs_logentry_t *le; 823 | 824 | while (head != tail) { 825 | /* handle journal wraparound */ 826 | if (tail == 0) 827 | gen_id = prev_gen_id(gen_id); 828 | tail = prev_log_entry(sbi->jsize, tail); 829 | 830 | le = (pmfs_logentry_t *)(sbi->journal_base_addr + tail); 831 | if (gen_id == le16_to_cpu(le->gen_id)) { 832 | tail = pmfs_recover_transaction(sb, head, tail, le); 833 | } else { 834 | if (gen_id == MAX_GEN_ID) { 835 | pmfs_memunlock_range(sb, le, sizeof(*le)); 836 | invalidate_gen_id(le); 837 | pmfs_memlock_range(sb, le, sizeof(*le)); 838 | } 839 | } 840 | } 841 | pmfs_forward_journal(sb, sbi, journal); 842 | PERSISTENT_MARK(); 843 | PERSISTENT_BARRIER(); 844 | return 0; 845 | } 846 | 847 | static int 
pmfs_recover_redo_journal(struct super_block *sb) 848 | { 849 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 850 | pmfs_journal_t *journal = pmfs_get_journal(sb); 851 | uint32_t tail = le32_to_cpu(journal->tail); 852 | uint32_t head = le32_to_cpu(journal->head); 853 | uint16_t gen_id = le16_to_cpu(journal->gen_id); 854 | int processed = 0; 855 | pmfs_logentry_t *le; 856 | 857 | /* journal wrapped around. so head points to previous generation id */ 858 | if (tail < head) 859 | gen_id = prev_gen_id(gen_id); 860 | 861 | while (head != tail) { 862 | le = (pmfs_logentry_t *)(sbi->journal_base_addr + head); 863 | if (gen_id == le16_to_cpu(le->gen_id)) { 864 | head = pmfs_process_transaction(sb, head, tail, 865 | le, true, &processed); 866 | } else { 867 | if (gen_id == MAX_GEN_ID) { 868 | pmfs_memunlock_range(sb, le, sizeof(*le)); 869 | invalidate_gen_id(le); 870 | pmfs_memlock_range(sb, le, sizeof(*le)); 871 | } 872 | head = next_log_entry(sbi->jsize, head); 873 | } 874 | /* handle journal wraparound */ 875 | if (head == 0) 876 | gen_id = next_gen_id(gen_id); 877 | } 878 | pmfs_forward_journal(sb, sbi, journal); 879 | PERSISTENT_MARK(); 880 | PERSISTENT_BARRIER(); 881 | return 0; 882 | } 883 | 884 | int pmfs_recover_journal(struct super_block *sb) 885 | { 886 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 887 | pmfs_journal_t *journal = pmfs_get_journal(sb); 888 | uint32_t tail = le32_to_cpu(journal->tail); 889 | uint32_t head = le32_to_cpu(journal->head); 890 | uint16_t gen_id = le16_to_cpu(journal->gen_id); 891 | 892 | /* is the journal empty? true if unmounted properly. */ 893 | if (head == tail) 894 | return 0; 895 | pmfs_dbg("PMFS: journal recovery. 
head:tail %x:%x gen_id %d\n", 896 | head, tail, gen_id); 897 | if (sbi->redo_log) 898 | pmfs_recover_redo_journal(sb); 899 | else 900 | pmfs_recover_undo_journal(sb); 901 | return 0; 902 | } 903 | 904 | -------------------------------------------------------------------------------- /super.c: -------------------------------------------------------------------------------- 1 | /* 2 | * BRIEF DESCRIPTION 3 | * 4 | * Super block operations. 5 | * 6 | * Copyright 2012-2013 Intel Corporation 7 | * Copyright 2009-2011 Marco Stornelli 8 | * Copyright 2003 Sony Corporation 9 | * Copyright 2003 Matsushita Electric Industrial Co., Ltd. 10 | * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam 11 | * This file is licensed under the terms of the GNU General Public 12 | * License version 2. This program is licensed "as is" without any 13 | * warranty of any kind, whether express or implied. 14 | */ 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include "pmfs.h" 37 | 38 | int measure_timing = 0; 39 | int support_clwb = 0; 40 | int support_pcommit = 0; 41 | 42 | module_param(measure_timing, int, S_IRUGO); 43 | MODULE_PARM_DESC(measure_timing, "Timing measurement"); 44 | 45 | static struct super_operations pmfs_sops; 46 | static const struct export_operations pmfs_export_ops; 47 | static struct kmem_cache *pmfs_inode_cachep; 48 | static struct kmem_cache *pmfs_blocknode_cachep; 49 | static struct kmem_cache *pmfs_transaction_cachep; 50 | /* FIXME: should the following variable be one per PMFS instance? 
*/
unsigned int pmfs_dbgmask = 0;

#ifdef CONFIG_PMFS_TEST
static void *first_pmfs_super;

/* Test hook: expose the superblock address of the first PMFS mount. */
struct pmfs_super_block *get_pmfs_super(void)
{
	return (struct pmfs_super_block *)first_pmfs_super;
}
EXPORT_SYMBOL(get_pmfs_super);
#endif

/* Print an error message and enforce the errors= mount policy:
 * panic or remount read-only depending on the mount options. */
void pmfs_error_mng(struct super_block *sb, const char *fmt, ...)
{
	va_list args;

	printk("pmfs error: ");
	va_start(args, fmt);
	vprintk(fmt, args);
	va_end(args);

	if (test_opt(sb, ERRORS_PANIC))
		panic("pmfs: panic from previous error\n");
	if (test_opt(sb, ERRORS_RO)) {
		printk(KERN_CRIT "pmfs err: remounting filesystem read-only");
		sb->s_flags |= MS_RDONLY;
	}
}

/* Record the (already validated, power-of-two) block size in the VFS
 * super_block. */
static void pmfs_set_blocksize(struct super_block *sb, unsigned long size)
{
	int bits;

	/*
	 * We've already validated the user input and the value here must be
	 * between PMFS_MAX_BLOCK_SIZE and PMFS_MIN_BLOCK_SIZE
	 * and it must be a power of 2.
	 */
	bits = fls(size) - 1;
	sb->s_blocksize_bits = bits;
	sb->s_blocksize = (1 << bits);
}

/* Non-zero when the hugeioremap mount option is in effect. */
static inline int pmfs_has_huge_ioremap(struct super_block *sb)
{
	struct pmfs_sb_info *sbi = (struct pmfs_sb_info *)sb->s_fs_info;

	return sbi->s_mount_opt & PMFS_MOUNT_HUGEIOREMAP;
}

/*
 * Map the backing DAX device and record its virtual address, physical
 * address and size in sbi. Returns 0 on success or a negative errno.
 * NOTE(review): the dax_device reference from fs_dax_get_by_host() does
 * not appear to be released here or on the error paths — confirm against
 * the teardown code elsewhere in this file.
 */
static int pmfs_get_block_info(struct super_block *sb,
	struct pmfs_sb_info *sbi)
{
	struct dax_device *dax_dev;
	void *virt_addr = NULL;
	pfn_t __pfn_t;
	long size;
	int ret;

	ret = bdev_dax_supported(sb, PAGE_SIZE);
	if (ret) {
		pmfs_err(sb, "device does not support DAX: %d\n", ret);
		return ret;
	}

	sbi->s_bdev = sb->s_bdev;
	dax_dev = fs_dax_get_by_host(sb->s_bdev->bd_disk->disk_name);
	if (!dax_dev) {
		pmfs_err(sb, "Couldn't retrieve DAX device\n");
		return -EINVAL;
	}

	/* ask for the whole device in one mapping */
	size = dax_direct_access(dax_dev, 0, LONG_MAX / PAGE_SIZE,
			&virt_addr, &__pfn_t) * PAGE_SIZE;
	if (size <= 0) {
		pmfs_err(sb, "direct_access failed\n");
		return -EINVAL;
	}

	sbi->virt_addr = virt_addr;
	sbi->phys_addr = pfn_t_to_pfn(__pfn_t) << PAGE_SHIFT;
	sbi->initsize = size;

	return 0;
}

/* Largest file size supported by the 3-level (9 bits per level) block
 * tree with `bits`-sized blocks, clamped to MAX_LFS_FILESIZE. */
static loff_t pmfs_max_size(int bits)
{
	loff_t res;

	res = (1ULL << (3 * 9 + bits)) - 1;

	if (res > MAX_LFS_FILESIZE)
		res = MAX_LFS_FILESIZE;

	pmfs_dbg_verbose("max file size %llu bytes\n", res);
	return res;
}

/* Mount option tokens; matched by the tokens table below. */
enum {
	Opt_bpi, Opt_init, Opt_jsize,
	Opt_num_inodes, Opt_mode, Opt_uid,
	Opt_gid, Opt_blocksize, Opt_wprotect, Opt_wprotectold,
	Opt_err_cont, Opt_err_panic, Opt_err_ro,
	Opt_hugemmap, Opt_nohugeioremap, Opt_dbgmask, Opt_bs, Opt_err
};

static const match_table_t tokens = {
	{ Opt_bpi,		"bpi=%u" },
	{ Opt_init,		"init" },
	{ Opt_jsize,		"jsize=%s" },
	{ Opt_num_inodes,	"num_inodes=%u" },
	{ Opt_mode,		"mode=%o" },
	{ Opt_uid,		"uid=%u" },
	{ Opt_gid,		"gid=%u" },
	{ Opt_wprotect,		"wprotect" },
	{ Opt_wprotectold,	"wprotectold" },
	{ Opt_err_cont,		"errors=continue" },
	{ Opt_err_panic,	"errors=panic" },
	{ Opt_err_ro,		"errors=remount-ro" },
	{ Opt_hugemmap,		"hugemmap" },
	{ Opt_nohugeioremap,	"nohugeioremap" },
	{ Opt_dbgmask,		"dbgmask=%u" },
	{ Opt_bs,		"backing_dev=%s" },
	{ Opt_err,		NULL },
};

/*
 * Parse the comma-separated mount option string into sbi. Options that
 * are not valid on remount are rejected when remount is true. Returns 0
 * on success, -EINVAL on a bad option or value.
 */
static int pmfs_parse_options(char *options, struct pmfs_sb_info *sbi,
	bool remount)
{
	char *p, *rest;
	substring_t args[MAX_OPT_ARGS];
	int option;

	if (!options)
		return 0;

	while ((p = strsep(&options, ",")) != NULL) {
		int token;
		if (!*p)
			continue;

		token = match_token(p, tokens, args);
		switch (token) {
		case Opt_bpi:
			if (remount)
				goto bad_opt;
			if (match_int(&args[0], &option))
				goto bad_val;
			sbi->bpi = option;
			break;
		case Opt_uid:
			if (remount)
				goto bad_opt;
			if (match_int(&args[0], &option))
				goto bad_val;
			sbi->uid = make_kuid(current_user_ns(), option);
			break;
		case Opt_gid:
			if (match_int(&args[0], &option))
				goto bad_val;
			sbi->gid = make_kgid(current_user_ns(), option);
			break;
		case Opt_mode:
			if (match_octal(&args[0], &option))
				goto bad_val;
			sbi->mode = option & 01777U;
			break;
		case Opt_init:
			if (remount)
				goto bad_opt;
			set_opt(sbi->s_mount_opt, FORMAT);
			break;
		case Opt_jsize:
			if (remount)
				goto bad_opt;
			/* memparse() will accept a K/M/G without a digit */
			if (!isdigit(*args[0].from))
				goto bad_val;
			sbi->jsize = memparse(args[0].from, &rest);
			/* make sure journal size is integer power of 2 */
			if (sbi->jsize & (sbi->jsize - 1) ||
				sbi->jsize < PMFS_MINIMUM_JOURNAL_SIZE) {
				pmfs_dbg("Invalid jsize: "
					"must be whole power of 2 & >= 64KB\n");
				goto bad_val;
			}
			break;
		case Opt_num_inodes:
			if (remount)
				goto bad_opt;
			if (match_int(&args[0], &option))
				goto bad_val;
			sbi->num_inodes = option;
			break;
		case Opt_err_panic:
			clear_opt(sbi->s_mount_opt, ERRORS_CONT);
			clear_opt(sbi->s_mount_opt, ERRORS_RO);
			set_opt(sbi->s_mount_opt, ERRORS_PANIC);
			break;
		case Opt_err_ro:
			clear_opt(sbi->s_mount_opt, ERRORS_CONT);
			clear_opt(sbi->s_mount_opt, ERRORS_PANIC);
			set_opt(sbi->s_mount_opt, ERRORS_RO);
			break;
		case Opt_err_cont:
			clear_opt(sbi->s_mount_opt, ERRORS_RO);
			clear_opt(sbi->s_mount_opt, ERRORS_PANIC);
			set_opt(sbi->s_mount_opt, ERRORS_CONT);
			break;
		case Opt_wprotect:
			if (remount)
				goto bad_opt;
			set_opt(sbi->s_mount_opt, PROTECT);
			pmfs_info
				("PMFS: Enabling new Write Protection (CR0.WP)\n");
			break;
		case Opt_wprotectold:
			if (remount)
				goto bad_opt;
			set_opt(sbi->s_mount_opt, PROTECT_OLD);
			pmfs_info
				("PMFS: Enabling old Write Protection (PAGE RW Bit)\n");
			break;
		case Opt_hugemmap:
			if (remount)
				goto bad_opt;
			set_opt(sbi->s_mount_opt, HUGEMMAP);
			pmfs_info("PMFS: Enabling huge mappings for mmap\n");
			break;
		case Opt_nohugeioremap:
			if (remount)
				goto bad_opt;
			clear_opt(sbi->s_mount_opt, HUGEIOREMAP);
			pmfs_info("PMFS: Disabling huge ioremap\n");
			break;
		case Opt_dbgmask:
			if (match_int(&args[0], &option))
				goto bad_val;
			pmfs_dbgmask = option;
			break;
		default: {
			/* note: Opt_bs / Opt_blocksize have no case and fall
			 * through here, so those options are rejected */
			goto bad_opt;
		}
		}
	}

	return 0;

bad_val:
	printk(KERN_INFO "Bad value '%s' for mount option '%s'\n", args[0].from,
		p);
	return -EINVAL;
bad_opt:
printk(KERN_INFO "Bad mount option: \"%s\"\n", p);
	return -EINVAL;
}

/* True when `size` can hold superblock + root directory + inode table +
 * journal; used before formatting. */
static bool pmfs_check_size (struct super_block *sb, unsigned long size)
{
	struct pmfs_sb_info *sbi = PMFS_SB(sb);
	unsigned long minimum_size, num_blocks;

	/* space required for super block and root directory */
	minimum_size = 2 << sb->s_blocksize_bits;

	/* space required for inode table */
	if (sbi->num_inodes > 0)
		num_blocks = (sbi->num_inodes >>
			(sb->s_blocksize_bits - PMFS_INODE_BITS)) + 1;
	else
		num_blocks = 1;
	minimum_size += (num_blocks << sb->s_blocksize_bits);
	/* space required for journal */
	minimum_size += sbi->jsize;

	if (size < minimum_size)
		return false;

	return true;
}


/*
 * Format path (mount -o init): lay out the superblock, journal
 * descriptor and inode table inside the reserved superblock area, place
 * the journal data after the two superblock copies, initialize the free
 * block map, and create the root directory with "." and ".." entries.
 * Returns the root pmfs inode or an ERR_PTR.
 */
static struct pmfs_inode *pmfs_init(struct super_block *sb,
	unsigned long size)
{
	unsigned long blocksize;
	u64 journal_meta_start, journal_data_start, inode_table_start;
	struct pmfs_inode *root_i;
	struct pmfs_super_block *super;
	struct pmfs_sb_info *sbi = PMFS_SB(sb);
	struct pmfs_direntry *de;
	unsigned long blocknr;

	pmfs_info("creating an empty pmfs of size %lu\n", size);
	sbi->block_start = (unsigned long)0;
	sbi->block_end = ((unsigned long)(size) >> PAGE_SHIFT);
	sbi->num_free_blocks = ((unsigned long)(size) >> PAGE_SHIFT);

	if (!sbi->virt_addr) {
		printk(KERN_ERR "ioremap of the pmfs image failed(1)\n");
		return ERR_PTR(-EINVAL);
	}
#ifdef CONFIG_PMFS_TEST
	if (!first_pmfs_super)
		first_pmfs_super = sbi->virt_addr;
#endif

	pmfs_dbg_verbose("pmfs: Default block size set to 4K\n");
	blocksize = sbi->blocksize = PMFS_DEF_BLOCK_SIZE_4K;

	pmfs_set_blocksize(sb, blocksize);
	blocksize = sb->s_blocksize;

	/* keep sbi->blocksize in sync with what the VFS accepted */
	if (sbi->blocksize && sbi->blocksize != blocksize)
		sbi->blocksize = blocksize;

	if (!pmfs_check_size(sb, size)) {
		pmfs_dbg("Specified PMFS size too small 0x%lx. Either increase"
			" PMFS size, or reduce num. of inodes (minimum 32)"
			" or journal size (minimum 64KB)\n", size);
		return ERR_PTR(-EINVAL);
	}

	/* cacheline-align the journal descriptor and inode table inside
	 * the superblock area */
	journal_meta_start = sizeof(struct pmfs_super_block);
	journal_meta_start = (journal_meta_start + CACHELINE_SIZE - 1) &
		~(CACHELINE_SIZE - 1);
	inode_table_start = journal_meta_start + sizeof(pmfs_journal_t);
	inode_table_start = (inode_table_start + CACHELINE_SIZE - 1) &
		~(CACHELINE_SIZE - 1);

	if ((inode_table_start + sizeof(struct pmfs_inode)) > PMFS_SB_SIZE) {
		pmfs_dbg("PMFS super block defined too small. defined 0x%x, "
			"required 0x%llx\n", PMFS_SB_SIZE,
			inode_table_start + sizeof(struct pmfs_inode));
		return ERR_PTR(-EINVAL);
	}

	/* journal data lives after the two superblock copies, block-aligned */
	journal_data_start = PMFS_SB_SIZE * 2;
	journal_data_start = (journal_data_start + blocksize - 1) &
		~(blocksize - 1);

	pmfs_dbg_verbose("journal meta start %llx data start 0x%llx, "
		"journal size 0x%x, inode_table 0x%llx\n", journal_meta_start,
		journal_data_start, sbi->jsize, inode_table_start);
	pmfs_dbg_verbose("max file name len %d\n", (unsigned int)PMFS_NAME_LEN);

	super = pmfs_get_super(sb);
	pmfs_memunlock_range(sb, super, journal_data_start);

	/* clear out super-block and inode table */
	memset_nt(super, 0, journal_data_start);
	super->s_size = cpu_to_le64(size);
	super->s_blocksize = cpu_to_le32(blocksize);
	super->s_magic = cpu_to_le16(PMFS_SUPER_MAGIC);
	super->s_journal_offset = cpu_to_le64(journal_meta_start);
	super->s_inode_table_offset = cpu_to_le64(inode_table_start);

	pmfs_init_blockmap(sb, journal_data_start + sbi->jsize);
	pmfs_memlock_range(sb, super, journal_data_start);

	if (pmfs_journal_hard_init(sb, journal_data_start, sbi->jsize) < 0) {
		printk(KERN_ERR "Journal hard initialization failed\n");
		return ERR_PTR(-EINVAL);
	}

	if (pmfs_init_inode_table(sb) < 0)
		return ERR_PTR(-EINVAL);

	pmfs_memunlock_range(sb, super, PMFS_SB_SIZE*2);
	pmfs_sync_super(super);
	pmfs_memlock_range(sb, super, PMFS_SB_SIZE*2);

	pmfs_flush_buffer(super, PMFS_SB_SIZE, false);
	pmfs_flush_buffer((char *)super + PMFS_SB_SIZE, sizeof(*super), false);

	/* allocate the root directory's data block */
	pmfs_new_block(sb, &blocknr, PMFS_BLOCK_TYPE_4K, 1);

	root_i = pmfs_get_inode(sb, PMFS_ROOT_INO);

	pmfs_memunlock_inode(sb, root_i);
	root_i->i_mode = cpu_to_le16(sbi->mode | S_IFDIR);
	root_i->i_uid = cpu_to_le32(from_kuid(&init_user_ns, sbi->uid));
	root_i->i_gid = cpu_to_le32(from_kgid(&init_user_ns, sbi->gid));
	root_i->i_links_count = cpu_to_le16(2);
	root_i->i_blk_type = PMFS_BLOCK_TYPE_4K;
	root_i->i_flags = 0;
	root_i->i_blocks = cpu_to_le64(1);
	root_i->i_size = cpu_to_le64(sb->s_blocksize);
	root_i->i_atime = root_i->i_mtime = root_i->i_ctime =
		cpu_to_le32(get_seconds());
	root_i->root = cpu_to_le64(pmfs_get_block_off(sb, blocknr,
		PMFS_BLOCK_TYPE_4K));
	root_i->height = 0;
	/* pmfs_sync_inode(root_i); */
	pmfs_memlock_inode(sb, root_i);
	pmfs_flush_buffer(root_i, sizeof(*root_i), false);
	de = (struct pmfs_direntry *)
		pmfs_get_block(sb, pmfs_get_block_off(sb, blocknr, PMFS_BLOCK_TYPE_4K));

	/* write "." and ".."; ".." consumes the rest of the block */
	pmfs_memunlock_range(sb, de, sb->s_blocksize);
	de->ino = cpu_to_le64(PMFS_ROOT_INO);
	de->name_len = 1;
	de->de_len = cpu_to_le16(PMFS_DIR_REC_LEN(de->name_len));
	strcpy(de->name, ".");
	de = (struct pmfs_direntry *)((char *)de + le16_to_cpu(de->de_len));
	de->ino = cpu_to_le64(PMFS_ROOT_INO);
	de->de_len = cpu_to_le16(sb->s_blocksize - PMFS_DIR_REC_LEN(1));
	de->name_len = 2;
	strcpy(de->name, "..");
	pmfs_memlock_range(sb, de, sb->s_blocksize);
	pmfs_flush_buffer(de, PMFS_DIR_REC_LEN(2), false);
	PERSISTENT_MARK();
	PERSISTENT_BARRIER();
	return root_i;
}

/* Defaults applied before option parsing: huge ioremap on, errors=
 * continue, default journal size. */
static inline void set_default_opts(struct pmfs_sb_info *sbi)
{
	/* set_opt(sbi->s_mount_opt, PROTECT); */
	set_opt(sbi->s_mount_opt, HUGEIOREMAP);
	set_opt(sbi->s_mount_opt, ERRORS_CONT);
	sbi->jsize = PMFS_DEFAULT_JOURNAL_SIZE;
}

/* Sanity-check the root inode; currently only warns if it is not a
 * directory. */
static void pmfs_root_check(struct super_block *sb, struct pmfs_inode *root_pi)
{
	/*
	 * if (root_pi->i_d.d_next) {
	 *	pmfs_warn("root->next not NULL, trying to fix\n");
	 *	goto fail1;
	 * }
	 */
	if (!S_ISDIR(le16_to_cpu(root_pi->i_mode)))
		pmfs_warn("root is not a directory!\n");
#if 0
	if (pmfs_calc_checksum((u8 *)root_pi, PMFS_INODE_SIZE)) {
		pmfs_dbg("checksum error in root inode, trying to fix\n");
		goto fail3;
	}
#endif
}

/*
 * Validate the primary superblock's magic and checksum; on failure try
 * to repair it from the redundant copy stored PMFS_SB_SIZE bytes after
 * it. Returns 1 when a valid superblock is available, 0 otherwise.
 * sb may be NULL (e.g. probing before the VFS super exists), in which
 * case the write-protection toggles are skipped.
 */
int pmfs_check_integrity(struct super_block *sb,
	struct pmfs_super_block *super)
{
	struct pmfs_super_block *super_redund;

	super_redund =
		(struct pmfs_super_block *)((char *)super + PMFS_SB_SIZE);

	/* Do sanity checks on the superblock */
	if (le16_to_cpu(super->s_magic) != PMFS_SUPER_MAGIC) {
		if (le16_to_cpu(super_redund->s_magic) != PMFS_SUPER_MAGIC) {
			printk(KERN_ERR "Can't find a valid pmfs partition\n");
			goto out;
		} else {
			pmfs_warn
				("Error in super block: try to repair it with "
				"the redundant copy");
			/* Try to auto-recover the super block */
			if (sb)
				pmfs_memunlock_super(sb, super);
			memcpy(super, super_redund,
				sizeof(struct pmfs_super_block));
			if (sb)
				pmfs_memlock_super(sb, super);
			pmfs_flush_buffer(super, sizeof(*super), false);
			pmfs_flush_buffer((char *)super + PMFS_SB_SIZE,
				sizeof(*super), false);

		}
	}

	/* Read the superblock */
	if (pmfs_calc_checksum((u8 *)super, PMFS_SB_STATIC_SIZE(super))) {
		if (pmfs_calc_checksum((u8 *)super_redund,
				PMFS_SB_STATIC_SIZE(super_redund))) {
			printk(KERN_ERR "checksum error in super block\n");
			goto out;
		} else {
			pmfs_warn
				("Error in super block: try to repair it with "
				"the redundant copy");
			/* Try to auto-recover the super block */
			if (sb)
				pmfs_memunlock_super(sb, super);
			memcpy(super, super_redund,
				sizeof(struct pmfs_super_block));
			if (sb)
				pmfs_memlock_super(sb, super);
			pmfs_flush_buffer(super, sizeof(*super), false);
			pmfs_flush_buffer((char *)super + PMFS_SB_SIZE,
				sizeof(*super), false);
		}
	}

	return 1;
out:
	return 0;
}

/*
 * Replay the persistent truncate list after a crash: for each linked
 * inode, either complete the interrupted truncate (inode still linked)
 * or let iput() free the unreferenced inode, then clear the list.
 */
static void pmfs_recover_truncate_list(struct super_block *sb)
{
	struct pmfs_inode_truncate_item *head = pmfs_get_truncate_list_head(sb);
	u64 ino_next = le64_to_cpu(head->i_next_truncate);
	struct pmfs_inode *pi;
	struct pmfs_inode_truncate_item *li;
	struct inode *inode;

	if (ino_next == 0)
		return;

	while (ino_next != 0) {
		pi = pmfs_get_inode(sb, ino_next);
		/* the truncate item is stored right after the inode */
		li = (struct pmfs_inode_truncate_item *)(pi + 1);
		inode = pmfs_iget(sb, ino_next);
		if (IS_ERR(inode))
			break;
		pmfs_dbg("Recover ino %llx nlink %d sz %llx:%llx\n", ino_next,
			inode->i_nlink, pi->i_size, li->i_truncatesize);
		if (inode->i_nlink) {
			/* set allocation hint */
			pmfs_set_blocksize_hint(sb, pi,
				le64_to_cpu(li->i_truncatesize));
			pmfs_setsize(inode, le64_to_cpu(li->i_truncatesize));
			pmfs_update_isize(inode, pi);
		} else {
			/* free the inode */
			pmfs_dbg("deleting unreferenced inode %lx\n",
				inode->i_ino);
		}
		iput(inode);
		pmfs_flush_buffer(pi, CACHELINE_SIZE, false);
		ino_next = le64_to_cpu(li->i_next_truncate);
	}
PERSISTENT_MARK(); 588 | PERSISTENT_BARRIER(); 589 | /* reset the truncate_list */ 590 | pmfs_memunlock_range(sb, head, sizeof(*head)); 591 | head->i_next_truncate = 0; 592 | pmfs_memlock_range(sb, head, sizeof(*head)); 593 | pmfs_flush_buffer(head, sizeof(*head), false); 594 | PERSISTENT_MARK(); 595 | PERSISTENT_BARRIER(); 596 | } 597 | 598 | static int pmfs_fill_super(struct super_block *sb, void *data, int silent) 599 | { 600 | struct pmfs_super_block *super; 601 | struct pmfs_inode *root_pi; 602 | struct pmfs_sb_info *sbi = NULL; 603 | struct inode *root_i = NULL; 604 | unsigned long blocksize; 605 | u32 random = 0; 606 | int retval = -EINVAL; 607 | 608 | BUILD_BUG_ON(sizeof(struct pmfs_super_block) > PMFS_SB_SIZE); 609 | BUILD_BUG_ON(sizeof(struct pmfs_inode) > PMFS_INODE_SIZE); 610 | 611 | if (arch_has_pcommit()) { 612 | pmfs_info("arch has PCOMMIT support\n"); 613 | support_pcommit = 1; 614 | } else { 615 | pmfs_info("arch does not have PCOMMIT support\n"); 616 | } 617 | 618 | if (arch_has_clwb()) { 619 | pmfs_info("arch has CLWB support\n"); 620 | support_clwb = 1; 621 | } else { 622 | pmfs_info("arch does not have CLWB support\n"); 623 | } 624 | 625 | sbi = kzalloc(sizeof(struct pmfs_sb_info), GFP_KERNEL); 626 | if (!sbi) 627 | return -ENOMEM; 628 | sb->s_fs_info = sbi; 629 | 630 | set_default_opts(sbi); 631 | 632 | if (pmfs_get_block_info(sb, sbi)) 633 | goto out; 634 | 635 | get_random_bytes(&random, sizeof(u32)); 636 | atomic_set(&sbi->next_generation, random); 637 | 638 | /* Init with default values */ 639 | INIT_LIST_HEAD(&sbi->block_inuse_head); 640 | sbi->mode = (S_IRUGO | S_IXUGO | S_IWUSR); 641 | sbi->uid = current_fsuid(); 642 | sbi->gid = current_fsgid(); 643 | set_opt(sbi->s_mount_opt, XIP); 644 | clear_opt(sbi->s_mount_opt, PROTECT); 645 | set_opt(sbi->s_mount_opt, HUGEIOREMAP); 646 | 647 | INIT_LIST_HEAD(&sbi->s_truncate); 648 | mutex_init(&sbi->s_truncate_lock); 649 | mutex_init(&sbi->inode_table_mutex); 650 | mutex_init(&sbi->s_lock); 651 | 
652 | if (pmfs_parse_options(data, sbi, 0)) 653 | goto out; 654 | 655 | set_opt(sbi->s_mount_opt, MOUNTING); 656 | 657 | /* Init a new pmfs instance */ 658 | if (sbi->s_mount_opt & PMFS_MOUNT_FORMAT) { 659 | root_pi = pmfs_init(sb, sbi->initsize); 660 | if (IS_ERR(root_pi)) 661 | goto out; 662 | super = pmfs_get_super(sb); 663 | goto setup_sb; 664 | } 665 | pmfs_dbg_verbose("checking physical address 0x%016llx for pmfs image\n", 666 | (u64)sbi->phys_addr); 667 | 668 | super = pmfs_get_super(sb); 669 | 670 | if (pmfs_journal_soft_init(sb)) { 671 | retval = -EINVAL; 672 | printk(KERN_ERR "Journal initialization failed\n"); 673 | goto out; 674 | } 675 | if (pmfs_recover_journal(sb)) { 676 | retval = -EINVAL; 677 | printk(KERN_ERR "Journal recovery failed\n"); 678 | goto out; 679 | } 680 | 681 | if (pmfs_check_integrity(sb, super) == 0) { 682 | pmfs_dbg("Memory contains invalid pmfs %x:%x\n", 683 | le16_to_cpu(super->s_magic), PMFS_SUPER_MAGIC); 684 | goto out; 685 | } 686 | 687 | blocksize = le32_to_cpu(super->s_blocksize); 688 | pmfs_set_blocksize(sb, blocksize); 689 | 690 | pmfs_dbg_verbose("blocksize %lu\n", blocksize); 691 | 692 | /* Read the root inode */ 693 | root_pi = pmfs_get_inode(sb, PMFS_ROOT_INO); 694 | 695 | /* Check that the root inode is in a sane state */ 696 | pmfs_root_check(sb, root_pi); 697 | 698 | #ifdef CONFIG_PMFS_TEST 699 | if (!first_pmfs_super) 700 | first_pmfs_super = sbi->virt_addr; 701 | #endif 702 | 703 | /* Set it all up.. 
*/ 704 | setup_sb: 705 | sb->s_magic = le16_to_cpu(super->s_magic); 706 | sb->s_op = &pmfs_sops; 707 | sb->s_maxbytes = pmfs_max_size(sb->s_blocksize_bits); 708 | sb->s_time_gran = 1; 709 | sb->s_export_op = &pmfs_export_ops; 710 | sb->s_xattr = NULL; 711 | sb->s_flags |= MS_NOSEC; 712 | root_i = pmfs_iget(sb, PMFS_ROOT_INO); 713 | if (IS_ERR(root_i)) { 714 | retval = PTR_ERR(root_i); 715 | goto out; 716 | } 717 | 718 | sb->s_root = d_make_root(root_i); 719 | if (!sb->s_root) { 720 | printk(KERN_ERR "get pmfs root inode failed\n"); 721 | retval = -ENOMEM; 722 | goto out; 723 | } 724 | 725 | pmfs_recover_truncate_list(sb); 726 | /* If the FS was not formatted on this mount, scan the meta-data after 727 | * truncate list has been processed */ 728 | if ((sbi->s_mount_opt & PMFS_MOUNT_FORMAT) == 0) 729 | pmfs_setup_blocknode_map(sb); 730 | 731 | if (!(sb->s_flags & MS_RDONLY)) { 732 | u64 mnt_write_time; 733 | /* update mount time and write time atomically. */ 734 | mnt_write_time = (get_seconds() & 0xFFFFFFFF); 735 | mnt_write_time = mnt_write_time | (mnt_write_time << 32); 736 | 737 | pmfs_memunlock_range(sb, &super->s_mtime, 8); 738 | pmfs_memcpy_atomic(&super->s_mtime, &mnt_write_time, 8); 739 | pmfs_memlock_range(sb, &super->s_mtime, 8); 740 | 741 | pmfs_flush_buffer(&super->s_mtime, 8, false); 742 | PERSISTENT_MARK(); 743 | PERSISTENT_BARRIER(); 744 | } 745 | 746 | clear_opt(sbi->s_mount_opt, MOUNTING); 747 | retval = 0; 748 | return retval; 749 | out: 750 | kfree(sbi); 751 | return retval; 752 | } 753 | 754 | int pmfs_statfs(struct dentry *d, struct kstatfs *buf) 755 | { 756 | struct super_block *sb = d->d_sb; 757 | unsigned long count = 0; 758 | struct pmfs_sb_info *sbi = (struct pmfs_sb_info *)sb->s_fs_info; 759 | 760 | buf->f_type = PMFS_SUPER_MAGIC; 761 | buf->f_bsize = sb->s_blocksize; 762 | 763 | count = sbi->block_end; 764 | buf->f_blocks = sbi->block_end; 765 | buf->f_bfree = buf->f_bavail = pmfs_count_free_blocks(sb); 766 | buf->f_files = 
(sbi->s_inodes_count); 767 | buf->f_ffree = (sbi->s_free_inodes_count); 768 | buf->f_namelen = PMFS_NAME_LEN; 769 | pmfs_dbg_verbose("pmfs_stats: total 4k free blocks 0x%llx\n", 770 | buf->f_bfree); 771 | pmfs_dbg_verbose("total inodes 0x%x, free inodes 0x%x, " 772 | "blocknodes 0x%lx\n", (sbi->s_inodes_count), 773 | (sbi->s_free_inodes_count), (sbi->num_blocknode_allocated)); 774 | return 0; 775 | } 776 | 777 | static int pmfs_show_options(struct seq_file *seq, struct dentry *root) 778 | { 779 | struct pmfs_sb_info *sbi = PMFS_SB(root->d_sb); 780 | 781 | seq_printf(seq, ",physaddr=0x%016llx", (u64)sbi->phys_addr); 782 | if (sbi->initsize) 783 | seq_printf(seq, ",init=%luk", sbi->initsize >> 10); 784 | if (sbi->blocksize) 785 | seq_printf(seq, ",bs=%lu", sbi->blocksize); 786 | if (sbi->bpi) 787 | seq_printf(seq, ",bpi=%lu", sbi->bpi); 788 | if (sbi->num_inodes) 789 | seq_printf(seq, ",N=%lu", sbi->num_inodes); 790 | if (sbi->mode != (S_IRWXUGO | S_ISVTX)) 791 | seq_printf(seq, ",mode=%03o", sbi->mode); 792 | if (uid_valid(sbi->uid)) 793 | seq_printf(seq, ",uid=%u", from_kuid(&init_user_ns, sbi->uid)); 794 | if (gid_valid(sbi->gid)) 795 | seq_printf(seq, ",gid=%u", from_kgid(&init_user_ns, sbi->gid)); 796 | if (test_opt(root->d_sb, ERRORS_RO)) 797 | seq_puts(seq, ",errors=remount-ro"); 798 | if (test_opt(root->d_sb, ERRORS_PANIC)) 799 | seq_puts(seq, ",errors=panic"); 800 | /* memory protection disabled by default */ 801 | if (test_opt(root->d_sb, PROTECT)) 802 | seq_puts(seq, ",wprotect"); 803 | if (test_opt(root->d_sb, HUGEMMAP)) 804 | seq_puts(seq, ",hugemmap"); 805 | if (test_opt(root->d_sb, HUGEIOREMAP)) 806 | seq_puts(seq, ",hugeioremap"); 807 | /* xip not enabled by default */ 808 | if (test_opt(root->d_sb, XIP)) 809 | seq_puts(seq, ",xip"); 810 | 811 | return 0; 812 | } 813 | 814 | int pmfs_remount(struct super_block *sb, int *mntflags, char *data) 815 | { 816 | unsigned long old_sb_flags; 817 | unsigned long old_mount_opt; 818 | struct pmfs_super_block *ps; 
819 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 820 | int ret = -EINVAL; 821 | 822 | /* Store the old options */ 823 | mutex_lock(&sbi->s_lock); 824 | old_sb_flags = sb->s_flags; 825 | old_mount_opt = sbi->s_mount_opt; 826 | 827 | if (pmfs_parse_options(data, sbi, 1)) 828 | goto restore_opt; 829 | 830 | sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 831 | ((sbi->s_mount_opt & PMFS_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); 832 | 833 | if ((*mntflags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { 834 | u64 mnt_write_time; 835 | ps = pmfs_get_super(sb); 836 | /* update mount time and write time atomically. */ 837 | mnt_write_time = (get_seconds() & 0xFFFFFFFF); 838 | mnt_write_time = mnt_write_time | (mnt_write_time << 32); 839 | 840 | pmfs_memunlock_range(sb, &ps->s_mtime, 8); 841 | pmfs_memcpy_atomic(&ps->s_mtime, &mnt_write_time, 8); 842 | pmfs_memlock_range(sb, &ps->s_mtime, 8); 843 | 844 | pmfs_flush_buffer(&ps->s_mtime, 8, false); 845 | PERSISTENT_MARK(); 846 | PERSISTENT_BARRIER(); 847 | } 848 | 849 | mutex_unlock(&sbi->s_lock); 850 | ret = 0; 851 | return ret; 852 | 853 | restore_opt: 854 | sb->s_flags = old_sb_flags; 855 | sbi->s_mount_opt = old_mount_opt; 856 | mutex_unlock(&sbi->s_lock); 857 | return ret; 858 | } 859 | 860 | static void pmfs_put_super(struct super_block *sb) 861 | { 862 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 863 | struct pmfs_blocknode *i; 864 | struct list_head *head = &(sbi->block_inuse_head); 865 | 866 | #ifdef CONFIG_PMFS_TEST 867 | if (first_pmfs_super == sbi->virt_addr) 868 | first_pmfs_super = NULL; 869 | #endif 870 | 871 | /* It's unmount time, so unmap the pmfs memory */ 872 | if (sbi->virt_addr) { 873 | pmfs_save_blocknode_mappings(sb); 874 | pmfs_journal_uninit(sb); 875 | sbi->virt_addr = NULL; 876 | } 877 | 878 | /* Free all the pmfs_blocknodes */ 879 | while (!list_empty(head)) { 880 | i = list_first_entry(head, struct pmfs_blocknode, link); 881 | list_del(&i->link); 882 | pmfs_free_blocknode(sb, i); 883 | } 884 | sb->s_fs_info = NULL; 
885 | pmfs_dbgmask = 0; 886 | kfree(sbi); 887 | } 888 | 889 | inline void pmfs_free_transaction(pmfs_transaction_t *trans) 890 | { 891 | kmem_cache_free(pmfs_transaction_cachep, trans); 892 | } 893 | 894 | void __pmfs_free_blocknode(struct pmfs_blocknode *bnode) 895 | { 896 | kmem_cache_free(pmfs_blocknode_cachep, bnode); 897 | } 898 | 899 | void pmfs_free_blocknode(struct super_block *sb, struct pmfs_blocknode *bnode) 900 | { 901 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 902 | sbi->num_blocknode_allocated--; 903 | __pmfs_free_blocknode(bnode); 904 | } 905 | 906 | inline pmfs_transaction_t *pmfs_alloc_transaction(void) 907 | { 908 | return (pmfs_transaction_t *) 909 | kmem_cache_alloc(pmfs_transaction_cachep, GFP_NOFS); 910 | } 911 | 912 | struct pmfs_blocknode *pmfs_alloc_blocknode(struct super_block *sb) 913 | { 914 | struct pmfs_blocknode *p; 915 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 916 | p = (struct pmfs_blocknode *) 917 | kmem_cache_alloc(pmfs_blocknode_cachep, GFP_NOFS); 918 | if (p) { 919 | sbi->num_blocknode_allocated++; 920 | } 921 | return p; 922 | } 923 | 924 | static struct inode *pmfs_alloc_inode(struct super_block *sb) 925 | { 926 | struct pmfs_inode_info *vi; 927 | 928 | vi = kmem_cache_alloc(pmfs_inode_cachep, GFP_NOFS); 929 | if (!vi) 930 | return NULL; 931 | 932 | // vi->vfs_inode.i_version = 1; 933 | return &vi->vfs_inode; 934 | } 935 | 936 | static void pmfs_i_callback(struct rcu_head *head) 937 | { 938 | struct inode *inode = container_of(head, struct inode, i_rcu); 939 | 940 | kmem_cache_free(pmfs_inode_cachep, PMFS_I(inode)); 941 | } 942 | 943 | static void pmfs_destroy_inode(struct inode *inode) 944 | { 945 | call_rcu(&inode->i_rcu, pmfs_i_callback); 946 | } 947 | 948 | static void init_once(void *foo) 949 | { 950 | struct pmfs_inode_info *vi = foo; 951 | 952 | vi->i_dir_start_lookup = 0; 953 | INIT_LIST_HEAD(&vi->i_truncated); 954 | inode_init_once(&vi->vfs_inode); 955 | } 956 | 957 | 958 | static int __init init_blocknode_cache(void) 
959 | { 960 | pmfs_blocknode_cachep = kmem_cache_create("pmfs_blocknode_cache", 961 | sizeof(struct pmfs_blocknode), 962 | 0, (SLAB_RECLAIM_ACCOUNT | 963 | SLAB_MEM_SPREAD), NULL); 964 | if (pmfs_blocknode_cachep == NULL) 965 | return -ENOMEM; 966 | return 0; 967 | } 968 | 969 | 970 | static int __init init_inodecache(void) 971 | { 972 | pmfs_inode_cachep = kmem_cache_create("pmfs_inode_cache", 973 | sizeof(struct pmfs_inode_info), 974 | 0, (SLAB_RECLAIM_ACCOUNT | 975 | SLAB_MEM_SPREAD), init_once); 976 | if (pmfs_inode_cachep == NULL) 977 | return -ENOMEM; 978 | return 0; 979 | } 980 | 981 | static int __init init_transaction_cache(void) 982 | { 983 | pmfs_transaction_cachep = kmem_cache_create("pmfs_journal_transaction", 984 | sizeof(pmfs_transaction_t), 0, (SLAB_RECLAIM_ACCOUNT | 985 | SLAB_MEM_SPREAD), NULL); 986 | if (pmfs_transaction_cachep == NULL) { 987 | pmfs_dbg("PMFS: failed to init transaction cache\n"); 988 | return -ENOMEM; 989 | } 990 | return 0; 991 | } 992 | 993 | static void destroy_transaction_cache(void) 994 | { 995 | if (pmfs_transaction_cachep) 996 | kmem_cache_destroy(pmfs_transaction_cachep); 997 | pmfs_transaction_cachep = NULL; 998 | } 999 | 1000 | static void destroy_inodecache(void) 1001 | { 1002 | /* 1003 | * Make sure all delayed rcu free inodes are flushed before 1004 | * we destroy cache. 1005 | */ 1006 | rcu_barrier(); 1007 | kmem_cache_destroy(pmfs_inode_cachep); 1008 | } 1009 | 1010 | static void destroy_blocknode_cache(void) 1011 | { 1012 | kmem_cache_destroy(pmfs_blocknode_cachep); 1013 | } 1014 | 1015 | /* 1016 | * the super block writes are all done "on the fly", so the 1017 | * super block is never in a "dirty" state, so there's no need 1018 | * for write_super. 
1019 | */ 1020 | static struct super_operations pmfs_sops = { 1021 | .alloc_inode = pmfs_alloc_inode, 1022 | .destroy_inode = pmfs_destroy_inode, 1023 | .write_inode = pmfs_write_inode, 1024 | .dirty_inode = pmfs_dirty_inode, 1025 | .evict_inode = pmfs_evict_inode, 1026 | .put_super = pmfs_put_super, 1027 | .statfs = pmfs_statfs, 1028 | .remount_fs = pmfs_remount, 1029 | .show_options = pmfs_show_options, 1030 | }; 1031 | 1032 | static struct dentry *pmfs_mount(struct file_system_type *fs_type, 1033 | int flags, const char *dev_name, void *data) 1034 | { 1035 | return mount_bdev(fs_type, flags, dev_name, data, pmfs_fill_super); 1036 | } 1037 | 1038 | static struct file_system_type pmfs_fs_type = { 1039 | .owner = THIS_MODULE, 1040 | .name = "pmfs", 1041 | .mount = pmfs_mount, 1042 | .kill_sb = kill_block_super, 1043 | }; 1044 | 1045 | static struct inode *pmfs_nfs_get_inode(struct super_block *sb, 1046 | u64 ino, u32 generation) 1047 | { 1048 | struct pmfs_sb_info *sbi = PMFS_SB(sb); 1049 | struct inode *inode; 1050 | 1051 | if (ino < PMFS_ROOT_INO) 1052 | return ERR_PTR(-ESTALE); 1053 | 1054 | if ((ino >> PMFS_INODE_BITS) > (sbi->s_inodes_count)) 1055 | return ERR_PTR(-ESTALE); 1056 | 1057 | inode = pmfs_iget(sb, ino); 1058 | if (IS_ERR(inode)) 1059 | return ERR_CAST(inode); 1060 | 1061 | if (generation && inode->i_generation != generation) { 1062 | /* we didn't find the right inode.. 
*/ 1063 | iput(inode); 1064 | return ERR_PTR(-ESTALE); 1065 | } 1066 | 1067 | return inode; 1068 | } 1069 | 1070 | static struct dentry *pmfs_fh_to_dentry(struct super_block *sb, 1071 | struct fid *fid, int fh_len, 1072 | int fh_type) 1073 | { 1074 | return generic_fh_to_dentry(sb, fid, fh_len, fh_type, 1075 | pmfs_nfs_get_inode); 1076 | } 1077 | 1078 | static struct dentry *pmfs_fh_to_parent(struct super_block *sb, 1079 | struct fid *fid, int fh_len, 1080 | int fh_type) 1081 | { 1082 | return generic_fh_to_parent(sb, fid, fh_len, fh_type, 1083 | pmfs_nfs_get_inode); 1084 | } 1085 | 1086 | static const struct export_operations pmfs_export_ops = { 1087 | .fh_to_dentry = pmfs_fh_to_dentry, 1088 | .fh_to_parent = pmfs_fh_to_parent, 1089 | .get_parent = pmfs_get_parent, 1090 | }; 1091 | 1092 | static int __init init_pmfs_fs(void) 1093 | { 1094 | int rc = 0; 1095 | 1096 | rc = init_blocknode_cache(); 1097 | if (rc) 1098 | return rc; 1099 | 1100 | rc = init_transaction_cache(); 1101 | if (rc) 1102 | goto out1; 1103 | 1104 | rc = init_inodecache(); 1105 | if (rc) 1106 | goto out2; 1107 | 1108 | rc = register_filesystem(&pmfs_fs_type); 1109 | if (rc) 1110 | goto out3; 1111 | 1112 | return 0; 1113 | 1114 | out3: 1115 | destroy_inodecache(); 1116 | out2: 1117 | destroy_transaction_cache(); 1118 | out1: 1119 | destroy_blocknode_cache(); 1120 | return rc; 1121 | } 1122 | 1123 | static void __exit exit_pmfs_fs(void) 1124 | { 1125 | unregister_filesystem(&pmfs_fs_type); 1126 | destroy_inodecache(); 1127 | destroy_blocknode_cache(); 1128 | destroy_transaction_cache(); 1129 | } 1130 | 1131 | MODULE_AUTHOR("Intel Corporation "); 1132 | MODULE_DESCRIPTION("Persistent Memory File System"); 1133 | MODULE_LICENSE("GPL"); 1134 | 1135 | module_init(init_pmfs_fs) 1136 | module_exit(exit_pmfs_fs) 1137 | --------------------------------------------------------------------------------