summaryrefslogtreecommitdiff
path: root/Documentation/filesystems
diff options
context:
space:
mode:
authorAndré Fabian Silva Delgado <emulatorman@parabola.nu>2016-10-20 00:10:27 -0300
committerAndré Fabian Silva Delgado <emulatorman@parabola.nu>2016-10-20 00:10:27 -0300
commitd0b2f91bede3bd5e3d24dd6803e56eee959c1797 (patch)
tree7fee4ab0509879c373c4f2cbd5b8a5be5b4041ee /Documentation/filesystems
parente914f8eb445e8f74b00303c19c2ffceaedd16a05 (diff)
Linux-libre 4.8.2-gnupck-4.8.2-gnu
Diffstat (limited to 'Documentation/filesystems')
-rw-r--r--Documentation/filesystems/Locking36
-rw-r--r--Documentation/filesystems/dax.txt6
-rw-r--r--Documentation/filesystems/f2fs.txt7
-rw-r--r--Documentation/filesystems/nilfs2.txt3
-rw-r--r--Documentation/filesystems/ocfs2-online-filecheck.txt10
-rw-r--r--Documentation/filesystems/orangefs.txt50
-rw-r--r--Documentation/filesystems/overlayfs.txt8
-rw-r--r--Documentation/filesystems/porting7
-rw-r--r--Documentation/filesystems/proc.txt11
-rw-r--r--Documentation/filesystems/tmpfs.txt2
-rw-r--r--Documentation/filesystems/vfs.txt64
11 files changed, 146 insertions, 58 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 75eea7ce3..d30fb2cb5 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -12,14 +12,17 @@ prototypes:
int (*d_revalidate)(struct dentry *, unsigned int);
int (*d_weak_revalidate)(struct dentry *, unsigned int);
int (*d_hash)(const struct dentry *, struct qstr *);
- int (*d_compare)(const struct dentry *, const struct dentry *,
+ int (*d_compare)(const struct dentry *,
unsigned int, const char *, const struct qstr *);
int (*d_delete)(struct dentry *);
+ int (*d_init)(struct dentry *);
void (*d_release)(struct dentry *);
void (*d_iput)(struct dentry *, struct inode *);
char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen);
struct vfsmount *(*d_automount)(struct path *path);
int (*d_manage)(struct dentry *, bool);
+ struct dentry *(*d_real)(struct dentry *, const struct inode *,
+ unsigned int);
locking rules:
rename_lock ->d_lock may block rcu-walk
@@ -28,12 +31,14 @@ d_weak_revalidate:no no yes no
d_hash no no no maybe
d_compare: yes no no maybe
d_delete: no yes no no
+d_init: no no yes no
d_release: no no yes no
d_prune: no yes no no
d_iput: no no yes no
d_dname: no no no no
d_automount: no no yes no
d_manage: no no yes (ref-walk) maybe
+d_real no no yes no
--------------------------- inode_operations ---------------------------
prototypes:
@@ -66,7 +71,6 @@ prototypes:
struct file *, unsigned open_flag,
umode_t create_mode, int *opened);
int (*tmpfile) (struct inode *, struct dentry *, umode_t);
- int (*dentry_open)(struct dentry *, struct file *, const struct cred *);
locking rules:
all may block
@@ -95,7 +99,6 @@ fiemap: no
update_time: no
atomic_open: yes
tmpfile: no
-dentry_open: no
Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on
victim.
@@ -179,7 +182,6 @@ unlocks and drops the reference.
prototypes:
int (*writepage)(struct page *page, struct writeback_control *wbc);
int (*readpage)(struct file *, struct page *);
- int (*sync_page)(struct page *);
int (*writepages)(struct address_space *, struct writeback_control *);
int (*set_page_dirty)(struct page *page);
int (*readpages)(struct file *filp, struct address_space *mapping,
@@ -195,7 +197,9 @@ prototypes:
int (*releasepage) (struct page *, int);
void (*freepage)(struct page *);
int (*direct_IO)(struct kiocb *, struct iov_iter *iter);
+ bool (*isolate_page) (struct page *, isolate_mode_t);
int (*migratepage)(struct address_space *, struct page *, struct page *);
+ void (*putback_page) (struct page *);
int (*launder_page)(struct page *);
int (*is_partially_uptodate)(struct page *, unsigned long, unsigned long);
int (*error_remove_page)(struct address_space *, struct page *);
@@ -208,7 +212,6 @@ locking rules:
PageLocked(page) i_mutex
writepage: yes, unlocks (see below)
readpage: yes, unlocks
-sync_page: maybe
writepages:
set_page_dirty no
readpages:
@@ -219,15 +222,17 @@ invalidatepage: yes
releasepage: yes
freepage: yes
direct_IO:
+isolate_page: yes
migratepage: yes (both)
+putback_page: yes
launder_page: yes
is_partially_uptodate: yes
error_remove_page: yes
swap_activate: no
swap_deactivate: no
- ->write_begin(), ->write_end(), ->sync_page() and ->readpage()
-may be called from the request handler (/dev/loop).
+ ->write_begin(), ->write_end() and ->readpage() may be called from
+the request handler (/dev/loop).
->readpage() unlocks the page, either synchronously or via I/O
completion.
@@ -283,11 +288,6 @@ will leave the page itself marked clean but it will be tagged as dirty in the
radix tree. This incoherency can lead to all sorts of hard-to-debug problems
in the filesystem like having dirty inodes at umount and losing written data.
- ->sync_page() locking rules are not well-defined - usually it is called
-with lock on page, but that is not guaranteed. Considering the currently
-existing instances of this method ->sync_page() itself doesn't look
-well-defined...
-
->writepages() is used for periodic writeback and for syscall-initiated
sync operations. The address_space should start I/O against at least
*nr_to_write pages. *nr_to_write must be decremented for each page which is
@@ -395,7 +395,7 @@ prototypes:
int (*release) (struct gendisk *, fmode_t);
int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
- int (*direct_access) (struct block_device *, sector_t, void __pmem **,
+ int (*direct_access) (struct block_device *, sector_t, void **,
unsigned long *);
int (*media_changed) (struct gendisk *);
void (*unlock_native_capacity) (struct gendisk *);
@@ -544,13 +544,13 @@ subsequent truncate), and then return with VM_FAULT_LOCKED, and the page
locked. The VM will unlock the page.
->map_pages() is called when VM asks to map easy accessible pages.
-Filesystem should find and map pages associated with offsets from "pgoff"
-till "max_pgoff". ->map_pages() is called with page table locked and must
+Filesystem should find and map pages associated with offsets from "start_pgoff"
+till "end_pgoff". ->map_pages() is called with page table locked and must
not block. If it's not possible to reach a page without blocking,
filesystem should skip it. Filesystem should use do_set_pte() to setup
-page table entry. Pointer to entry associated with offset "pgoff" is
-passed in "pte" field in vm_fault structure. Pointers to entries for other
-offsets should be calculated relative to "pte".
+page table entry. Pointer to entry associated with the page is passed in
+"pte" field in fault_env structure. Pointers to entries for other offsets
+should be calculated relative to "pte".
->page_mkwrite() is called when a previously read-only pte is
about to become writeable. The filesystem again must ensure that there are
diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt
index ce4587d25..0c16a2252 100644
--- a/Documentation/filesystems/dax.txt
+++ b/Documentation/filesystems/dax.txt
@@ -49,6 +49,7 @@ These block devices may be used for inspiration:
- axonram: Axon DDR2 device driver
- brd: RAM backed block device driver
- dcssblk: s390 dcss block device driver
+- pmem: NVDIMM persistent memory driver
Implementation Tips for Filesystem Writers
@@ -75,8 +76,9 @@ calls to get_block() (for example by a page-fault racing with a read()
or a write()) work correctly.
These filesystems may be used for inspiration:
-- ext2: the second extended filesystem, see Documentation/filesystems/ext2.txt
-- ext4: the fourth extended filesystem, see Documentation/filesystems/ext4.txt
+- ext2: see Documentation/filesystems/ext2.txt
+- ext4: see Documentation/filesystems/ext4.txt
+- xfs: see Documentation/filesystems/xfs.txt
Handling Media Errors
diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt
index e1c9f0849..ecd808088 100644
--- a/Documentation/filesystems/f2fs.txt
+++ b/Documentation/filesystems/f2fs.txt
@@ -109,7 +109,9 @@ background_gc=%s Turn on/off cleaning operations, namely garbage
disable_roll_forward Disable the roll-forward recovery routine
norecovery Disable the roll-forward recovery routine, mounted read-
only (i.e., -o ro,disable_roll_forward)
-discard Issue discard/TRIM commands when a segment is cleaned.
+discard/nodiscard Enable/disable real-time discard in f2fs, if discard is
+ enabled, f2fs will issue discard/TRIM commands when a
+ segment is cleaned.
no_heap Disable heap-style segment allocation which finds free
segments for data from the beginning of main area, while
for node from the end of main area.
@@ -151,6 +153,9 @@ noinline_data Disable the inline data feature, inline data feature is
enabled by default.
data_flush Enable data flushing before checkpoint in order to
persist data of regular and symlink.
+mode=%s Control block allocation mode which supports "adaptive"
+ and "lfs". In "lfs" mode, there should be no random
+ writes towards main area.
================================================================================
DEBUGFS ENTRIES
diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt
index 5b21ef76f..c0727dc36 100644
--- a/Documentation/filesystems/nilfs2.txt
+++ b/Documentation/filesystems/nilfs2.txt
@@ -267,7 +267,8 @@ among NILFS2 files can be depicted as follows:
`-- file (ino=yy)
( regular file, directory, or symlink )
-For detail on the format of each file, please see include/linux/nilfs2_fs.h.
+For detail on the format of each file, please see nilfs2_ondisk.h
+located at include/uapi/linux directory.
There are no patents or other intellectual property that we protect
with regard to the design of NILFS2. It is allowed to replicate the
diff --git a/Documentation/filesystems/ocfs2-online-filecheck.txt b/Documentation/filesystems/ocfs2-online-filecheck.txt
index 1ab078604..139fab175 100644
--- a/Documentation/filesystems/ocfs2-online-filecheck.txt
+++ b/Documentation/filesystems/ocfs2-online-filecheck.txt
@@ -5,12 +5,12 @@ This document will describe OCFS2 online file check feature.
Introduction
============
-OCFS2 is often used in high-availaibility systems. However, OCFS2 usually
+OCFS2 is often used in high-availability systems. However, OCFS2 usually
converts the filesystem to read-only when encounters an error. This may not be
necessary, since turning the filesystem read-only would affect other running
processes as well, decreasing availability.
Then, a mount option (errors=continue) is introduced, which would return the
--EIO errno to the calling process and terminate furhter processing so that the
+-EIO errno to the calling process and terminate further processing so that the
filesystem is not corrupted further. The filesystem is not converted to
read-only, and the problematic file's inode number is reported in the kernel
log. The user can try to check/fix this file via online filecheck feature.
@@ -44,7 +44,7 @@ There is a sysfs directory for each OCFS2 file system mounting:
/sys/fs/ocfs2/<devname>/filecheck
-Here, <devname> indicates the name of OCFS2 volumn device which has been already
+Here, <devname> indicates the name of OCFS2 volume device which has been already
mounted. The file above would accept inode numbers. This could be used to
communicate with kernel space, tell which file(inode number) will be checked or
fixed. Currently, three operations are supported, which includes checking
@@ -76,14 +76,14 @@ The output is like this:
This time, the <ERROR> column indicates whether this fix is successful or not.
3. The record cache is used to store the history of check/fix results. It's
-defalut size is 10, and can be adjust between the range of 10 ~ 100. You can
+default size is 10, and can be adjust between the range of 10 ~ 100. You can
adjust the size like this:
# echo "<size>" > /sys/fs/ocfs2/<devname>/filecheck/set
Fixing stuff
============
-On receivng the inode, the filesystem would read the inode and the
+On receiving the inode, the filesystem would read the inode and the
file metadata. In case of errors, the filesystem would fix the errors
and report the problems it fixed in the kernel log. As a precautionary measure,
the inode must first be checked for errors before performing a final fix.
diff --git a/Documentation/filesystems/orangefs.txt b/Documentation/filesystems/orangefs.txt
index e1a0056a3..1dfdec790 100644
--- a/Documentation/filesystems/orangefs.txt
+++ b/Documentation/filesystems/orangefs.txt
@@ -281,7 +281,7 @@ on the wait queue and one attempt is made to recycle them. Obviously,
if the client-core stays dead too long, the arbitrary userspace processes
trying to use Orangefs will be negatively affected. Waiting ops
that can't be serviced will be removed from the request list and
-have their states set to "given up". In-progress ops that can't
+have their states set to "given up". In-progress ops that can't
be serviced will be removed from the in_progress hash table and
have their states set to "given up".
@@ -338,7 +338,7 @@ particular response.
PVFS2_VFS_OP_STATFS
fill a pvfs2_statfs_response_t with useless info <g>. It is hard for
us to know, in a timely fashion, these statistics about our
- distributed network filesystem.
+ distributed network filesystem.
PVFS2_VFS_OP_FS_MOUNT
fill a pvfs2_fs_mount_response_t which is just like a PVFS_object_kref
@@ -386,7 +386,7 @@ responses:
io_array[1].iov_base = address of global variable "pdev_magic" (int32_t)
io_array[1].iov_len = sizeof(int32_t)
-
+
io_array[2].iov_base = address of parameter "tag" (PVFS_id_gen_t)
io_array[2].iov_len = sizeof(int64_t)
@@ -402,5 +402,47 @@ Readdir responses initialize the fifth element io_array like this:
io_array[4].iov_len = contents of member trailer_size (PVFS_size)
from out_downcall member of global variable
vfs_request
-
+
+Orangefs exploits the dcache in order to avoid sending redundant
+requests to userspace. We keep object inode attributes up-to-date with
+orangefs_inode_getattr. Orangefs_inode_getattr uses two arguments to
+help it decide whether or not to update an inode: "new" and "bypass".
+Orangefs keeps private data in an object's inode that includes a short
+timeout value, getattr_time, which allows any iteration of
+orangefs_inode_getattr to know how long it has been since the inode was
+updated. When the object is not new (new == 0) and the bypass flag is not
+set (bypass == 0) orangefs_inode_getattr returns without updating the inode
+if getattr_time has not timed out. Getattr_time is updated each time the
+inode is updated.
+
+Creation of a new object (file, dir, sym-link) includes the evaluation of
+its pathname, resulting in a negative directory entry for the object.
+A new inode is allocated and associated with the dentry, turning it from
+a negative dentry into a "productive full member of society". Orangefs
+obtains the new inode from Linux with new_inode() and associates
+the inode with the dentry by sending the pair back to Linux with
+d_instantiate().
+
+The evaluation of a pathname for an object resolves to its corresponding
+dentry. If there is no corresponding dentry, one is created for it in
+the dcache. Whenever a dentry is modified or verified Orangefs stores a
+short timeout value in the dentry's d_time, and the dentry will be trusted
+for that amount of time. Orangefs is a network filesystem, and objects
+can potentially change out-of-band with any particular Orangefs kernel module
+instance, so trusting a dentry is risky. The alternative to trusting
+dentries is to always obtain the needed information from userspace - at
+least a trip to the client-core, maybe to the servers. Obtaining information
+from a dentry is cheap, obtaining it from userspace is relatively expensive,
+hence the motivation to use the dentry when possible.
+
+The timeout values d_time and getattr_time are jiffy based, and the
+code is designed to avoid the jiffy-wrap problem:
+
+"In general, if the clock may have wrapped around more than once, there
+is no way to tell how much time has elapsed. However, if the times t1
+and t2 are known to be fairly close, we can reliably compute the
+difference in a way that takes into account the possibility that the
+clock may have wrapped between times."
+
+ from course notes by instructor Andy Wang
diff --git a/Documentation/filesystems/overlayfs.txt b/Documentation/filesystems/overlayfs.txt
index d6259c786..bcbf9710e 100644
--- a/Documentation/filesystems/overlayfs.txt
+++ b/Documentation/filesystems/overlayfs.txt
@@ -183,12 +183,10 @@ The copy_up operation essentially creates a new, identical file and
moves it over to the old name. The new file may be on a different
filesystem, so both st_dev and st_ino of the file may change.
-Any open files referring to this inode will access the old data and
-metadata. Similarly any file locks obtained before copy_up will not
-apply to the copied up file.
+Any open files referring to this inode will access the old data.
-On a file opened with O_RDONLY fchmod(2), fchown(2), futimesat(2) and
-fsetxattr(2) will fail with EROFS.
+Any file locks (and leases) obtained before copy_up will not apply
+to the copied up file.
If a file with multiple hard links is copied up, then this will
"break" the link. Changes will not be propagated to other names
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index a5fb89cac..b1bd05ea6 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -585,3 +585,10 @@ in your dentry operations instead.
in the instances. Rationale: !@#!@# security_d_instantiate() needs to be
called before we attach dentry to inode and !@#!@##!@$!$#!@#$!@$!@$ smack
->d_instantiate() uses not just ->getxattr() but ->setxattr() as well.
+--
+[mandatory]
+ ->d_compare() doesn't get parent as a separate argument anymore. If you
+ used it for finding the struct super_block involved, dentry->d_sb will
+ work just as well; if it's something more complicated, use dentry->d_parent.
+ Just be careful not to assume that fetching it more than once will yield
+ the same value - in RCU mode it could change under you.
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index e8d00759b..68080ad6a 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -436,6 +436,7 @@ Private_Dirty: 0 kB
Referenced: 892 kB
Anonymous: 0 kB
AnonHugePages: 0 kB
+ShmemPmdMapped: 0 kB
Shared_Hugetlb: 0 kB
Private_Hugetlb: 0 kB
Swap: 0 kB
@@ -464,6 +465,8 @@ accessed.
a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE
and a page is modified, the file page is replaced by a private anonymous copy.
"AnonHugePages" shows the ammount of memory backed by transparent hugepage.
+"ShmemPmdMapped" shows the ammount of shared (shmem/tmpfs) memory backed by
+huge pages.
"Shared_Hugetlb" and "Private_Hugetlb" show the ammounts of memory backed by
hugetlbfs page which is *not* counted in "RSS" or "PSS" field for historical
reasons. And these are not included in {Shared,Private}_{Clean,Dirty} field.
@@ -725,7 +728,7 @@ IRQ, you can set it by doing:
> echo 1 > /proc/irq/10/smp_affinity
This means that only the first CPU will handle the IRQ, but you can also echo
-5 which means that only the first and fourth CPU can handle the IRQ.
+5 which means that only the first and third CPU can handle the IRQ.
The contents of each smp_affinity file is the same by default:
@@ -868,6 +871,9 @@ VmallocTotal: 112216 kB
VmallocUsed: 428 kB
VmallocChunk: 111088 kB
AnonHugePages: 49152 kB
+ShmemHugePages: 0 kB
+ShmemPmdMapped: 0 kB
+
MemTotal: Total usable ram (i.e. physical ram minus a few reserved
bits and the kernel binary code)
@@ -912,6 +918,9 @@ MemAvailable: An estimate of how much memory is available for starting new
AnonHugePages: Non-file backed huge pages mapped into userspace page tables
Mapped: files which have been mmaped, such as libraries
Shmem: Total memory used by shared memory (shmem) and tmpfs
+ShmemHugePages: Memory used by shared memory (shmem) and tmpfs allocated
+ with huge pages
+ShmemPmdMapped: Shared memory mapped into userspace with huge pages
Slab: in-kernel data structures cache
SReclaimable: Part of Slab, that might be reclaimed, such as caches
SUnreclaim: Part of Slab, that cannot be reclaimed on memory pressure
diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt
index d9c11d25b..a85355cf8 100644
--- a/Documentation/filesystems/tmpfs.txt
+++ b/Documentation/filesystems/tmpfs.txt
@@ -98,7 +98,7 @@ A memory policy with a valid NodeList will be saved, as specified, for
use at file creation time. When a task allocates a file in the file
system, the mount option memory policy will be applied with a NodeList,
if any, modified by the calling task's cpuset constraints
-[See Documentation/cgroups/cpusets.txt] and any optional flags, listed
+[See Documentation/cgroup-v1/cpusets.txt] and any optional flags, listed
below. If the resulting NodeLists is the empty set, the effective memory
policy for the file will revert to "default" policy.
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index c61a223ef..9ace359d6 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -364,7 +364,6 @@ struct inode_operations {
int (*atomic_open)(struct inode *, struct dentry *, struct file *,
unsigned open_flag, umode_t create_mode, int *opened);
int (*tmpfile) (struct inode *, struct dentry *, umode_t);
- int (*dentry_open)(struct dentry *, struct file *, const struct cred *);
};
Again, all methods are called without any locks being held, unless
@@ -534,9 +533,7 @@ __sync_single_inode) to check if ->writepages has been successful in
writing out the whole address_space.
The Writeback tag is used by filemap*wait* and sync_page* functions,
-via filemap_fdatawait_range, to wait for all writeback to
-complete. While waiting ->sync_page (if defined) will be called on
-each page that is found to require writeback.
+via filemap_fdatawait_range, to wait for all writeback to complete.
An address_space handler may attach extra information to a page,
typically using the 'private' field in the 'struct page'. If such
@@ -554,8 +551,8 @@ address_space has finer control of write sizes.
The read process essentially only requires 'readpage'. The write
process is more complicated and uses write_begin/write_end or
-set_page_dirty to write data into the address_space, and writepage,
-sync_page, and writepages to writeback data to storage.
+set_page_dirty to write data into the address_space, and writepage
+and writepages to writeback data to storage.
Adding and removing pages to/from an address_space is protected by the
inode's i_mutex.
@@ -592,9 +589,14 @@ struct address_space_operations {
int (*releasepage) (struct page *, int);
void (*freepage)(struct page *);
ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter);
+ /* isolate a page for migration */
+ bool (*isolate_page) (struct page *, isolate_mode_t);
/* migrate the contents of a page to the specified target */
int (*migratepage) (struct page *, struct page *);
+ /* put migration-failed page back to right list */
+ void (*putback_page) (struct page *);
int (*launder_page) (struct page *);
+
int (*is_partially_uptodate) (struct page *, unsigned long,
unsigned long);
void (*is_dirty_writeback) (struct page *, bool *, bool *);
@@ -696,13 +698,6 @@ struct address_space_operations {
but instead uses bmap to find out where the blocks in the file
are and uses those addresses directly.
- dentry_open: *WARNING: probably going away soon, do not use!* This is an
- alternative to f_op->open(), the difference is that this method may open
- a file not necessarily originating from the same filesystem as the one
- i_op->open() was called on. It may be useful for stacking filesystems
- which want to allow native I/O directly on underlying files.
-
-
invalidatepage: If a page has PagePrivate set, then invalidatepage
will be called when part or all of the page is to be removed
from the address space. This generally corresponds to either a
@@ -747,6 +742,10 @@ struct address_space_operations {
and transfer data directly between the storage and the
application's address space.
+ isolate_page: Called by the VM when isolating a movable non-lru page.
+ If page is successfully isolated, VM marks the page as PG_isolated
+ via __SetPageIsolated.
+
migrate_page: This is used to compact the physical memory usage.
If the VM wants to relocate a page (maybe off a memory card
that is signalling imminent failure) it will pass a new page
@@ -754,6 +753,8 @@ struct address_space_operations {
transfer any private data across and update any references
that it has to the page.
+ putback_page: Called by the VM when isolated page's migration fails.
+
launder_page: Called before freeing a page - it writes back the dirty page. To
prevent redirtying the page, it is kept locked during the whole
operation.
@@ -930,14 +931,17 @@ struct dentry_operations {
int (*d_revalidate)(struct dentry *, unsigned int);
int (*d_weak_revalidate)(struct dentry *, unsigned int);
int (*d_hash)(const struct dentry *, struct qstr *);
- int (*d_compare)(const struct dentry *, const struct dentry *,
+ int (*d_compare)(const struct dentry *,
unsigned int, const char *, const struct qstr *);
int (*d_delete)(const struct dentry *);
+ int (*d_init)(struct dentry *);
void (*d_release)(struct dentry *);
void (*d_iput)(struct dentry *, struct inode *);
char *(*d_dname)(struct dentry *, char *, int);
struct vfsmount *(*d_automount)(struct path *);
int (*d_manage)(struct dentry *, bool);
+ struct dentry *(*d_real)(struct dentry *, const struct inode *,
+ unsigned int);
};
d_revalidate: called when the VFS needs to revalidate a dentry. This
@@ -1003,6 +1007,8 @@ struct dentry_operations {
always cache a reachable dentry. d_delete must be constant and
idempotent.
+ d_init: called when a dentry is allocated
+
d_release: called when a dentry is really deallocated
d_iput: called when a dentry loses its inode (just prior to its
@@ -1022,6 +1028,14 @@ struct dentry_operations {
at the end of the buffer, and returns a pointer to the first char.
dynamic_dname() helper function is provided to take care of this.
+ Example :
+
+ static char *pipefs_dname(struct dentry *dent, char *buffer, int buflen)
+ {
+ return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
+ dentry->d_inode->i_ino);
+ }
+
d_automount: called when an automount dentry is to be traversed (optional).
This should create a new VFS mount record and return the record to the
caller. The caller is supplied with a path parameter giving the
@@ -1060,13 +1074,23 @@ struct dentry_operations {
This function is only used if DCACHE_MANAGE_TRANSIT is set on the
dentry being transited from.
-Example :
+ d_real: overlay/union type filesystems implement this method to return one of
+ the underlying dentries hidden by the overlay. It is used in three
+ different modes:
-static char *pipefs_dname(struct dentry *dent, char *buffer, int buflen)
-{
- return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
- dentry->d_inode->i_ino);
-}
+ Called from open it may need to copy-up the file depending on the
+ supplied open flags. This mode is selected with a non-zero flags
+ argument. In this mode the d_real method can return an error.
+
+ Called from file_dentry() it returns the real dentry matching the inode
+ argument. The real dentry may be from a lower layer already copied up,
+ but still referenced from the file. This mode is selected with a
+ non-NULL inode argument. This will always succeed.
+
+ With NULL inode and zero flags the topmost real underlying dentry is
+ returned. This will always succeed.
+
+ This method is never called with both non-NULL inode and non-zero flags.
Each dentry has a pointer to its parent dentry, as well as a hash list
of child dentries. Child dentries are basically like files in a