diff options
Diffstat (limited to 'Documentation/filesystems')
28 files changed, 865 insertions, 481 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 619af9bfd..d30fb2cb5 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -12,14 +12,17 @@ prototypes: int (*d_revalidate)(struct dentry *, unsigned int); int (*d_weak_revalidate)(struct dentry *, unsigned int); int (*d_hash)(const struct dentry *, struct qstr *); - int (*d_compare)(const struct dentry *, const struct dentry *, + int (*d_compare)(const struct dentry *, unsigned int, const char *, const struct qstr *); int (*d_delete)(struct dentry *); + int (*d_init)(struct dentry *); void (*d_release)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen); struct vfsmount *(*d_automount)(struct path *path); int (*d_manage)(struct dentry *, bool); + struct dentry *(*d_real)(struct dentry *, const struct inode *, + unsigned int); locking rules: rename_lock ->d_lock may block rcu-walk @@ -28,12 +31,14 @@ d_weak_revalidate:no no yes no d_hash no no no maybe d_compare: yes no no maybe d_delete: no yes no no +d_init: no no yes no d_release: no no yes no d_prune: no yes no no d_iput: no no yes no d_dname: no no no no d_automount: no no yes no d_manage: no no yes (ref-walk) maybe +d_real no no yes no --------------------------- inode_operations --------------------------- prototypes: @@ -66,7 +71,6 @@ prototypes: struct file *, unsigned open_flag, umode_t create_mode, int *opened); int (*tmpfile) (struct inode *, struct dentry *, umode_t); - int (*dentry_open)(struct dentry *, struct file *, const struct cred *); locking rules: all may block @@ -95,7 +99,6 @@ fiemap: no update_time: no atomic_open: yes tmpfile: no -dentry_open: no Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on victim. @@ -179,7 +182,6 @@ unlocks and drops the reference. prototypes: int (*writepage)(struct page *page, struct writeback_control *wbc); int (*readpage)(struct file *, struct page *); - int (*sync_page)(struct page *); int (*writepages)(struct address_space *, struct writeback_control *); int (*set_page_dirty)(struct page *page); int (*readpages)(struct file *filp, struct address_space *mapping, @@ -194,8 +196,10 @@ prototypes: void (*invalidatepage) (struct page *, unsigned int, unsigned int); int (*releasepage) (struct page *, int); void (*freepage)(struct page *); - int (*direct_IO)(struct kiocb *, struct iov_iter *iter, loff_t offset); + int (*direct_IO)(struct kiocb *, struct iov_iter *iter); + bool (*isolate_page) (struct page *, isolate_mode_t); int (*migratepage)(struct address_space *, struct page *, struct page *); + void (*putback_page) (struct page *); int (*launder_page)(struct page *); int (*is_partially_uptodate)(struct page *, unsigned long, unsigned long); int (*error_remove_page)(struct address_space *, struct page *); @@ -208,7 +212,6 @@ locking rules: PageLocked(page) i_mutex writepage: yes, unlocks (see below) readpage: yes, unlocks -sync_page: maybe writepages: set_page_dirty no readpages: @@ -219,15 +222,17 @@ invalidatepage: yes releasepage: yes freepage: yes direct_IO: +isolate_page: yes migratepage: yes (both) +putback_page: yes launder_page: yes is_partially_uptodate: yes error_remove_page: yes swap_activate: no swap_deactivate: no - ->write_begin(), ->write_end(), ->sync_page() and ->readpage() -may be called from the request handler (/dev/loop). + ->write_begin(), ->write_end() and ->readpage() may be called from +the request handler (/dev/loop). ->readpage() unlocks the page, either synchronously or via I/O completion. @@ -283,11 +288,6 @@ will leave the page itself marked clean but it will be tagged as dirty in the radix tree. This incoherency can lead to all sorts of hard-to-debug problems in the filesystem like having dirty inodes at umount and losing written data. - ->sync_page() locking rules are not well-defined - usually it is called -with lock on page, but that is not guaranteed. Considering the currently -existing instances of this method ->sync_page() itself doesn't look -well-defined... - ->writepages() is used for periodic writeback and for syscall-initiated sync operations. The address_space should start I/O against at least *nr_to_write pages. *nr_to_write must be decremented for each page which is @@ -395,7 +395,7 @@ prototypes: int (*release) (struct gendisk *, fmode_t); int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); - int (*direct_access) (struct block_device *, sector_t, void __pmem **, + int (*direct_access) (struct block_device *, sector_t, void **, unsigned long *); int (*media_changed) (struct gendisk *); void (*unlock_native_capacity) (struct gendisk *); @@ -544,13 +544,13 @@ subsequent truncate), and then return with VM_FAULT_LOCKED, and the page locked. The VM will unlock the page. ->map_pages() is called when VM asks to map easy accessible pages. -Filesystem should find and map pages associated with offsets from "pgoff" -till "max_pgoff". ->map_pages() is called with page table locked and must +Filesystem should find and map pages associated with offsets from "start_pgoff" +till "end_pgoff". ->map_pages() is called with page table locked and must not block. If it's not possible to reach a page without blocking, filesystem should skip it. Filesystem should use do_set_pte() to setup -page table entry. Pointer to entry associated with offset "pgoff" is -passed in "pte" field in vm_fault structure. Pointers to entries for other -offsets should be calculated relative to "pte". +page table entry. Pointer to entry associated with the page is passed in +"pte" field in fault_env structure. Pointers to entries for other offsets +should be calculated relative to "pte". ->page_mkwrite() is called when a previously read-only pte is about to become writeable. The filesystem again must ensure that there are diff --git a/Documentation/filesystems/aufs/README b/Documentation/filesystems/aufs/README index ed1bafe08..36df67408 100644 --- a/Documentation/filesystems/aufs/README +++ b/Documentation/filesystems/aufs/README @@ -372,6 +372,7 @@ James B made a donation (2014/7 and 2015/7). Stefano Di Biase made a donation (2014/8). Daniel Epellei made a donation (2015/1). OmegaPhil made a donation (2016/1). +Tomasz Szewczyk made a donation (2016/4). Thank you very much. Donations are always, including future donations, very important and diff --git a/Documentation/filesystems/btrfs.txt b/Documentation/filesystems/btrfs.txt index c772b47e7..f9dad22d9 100644 --- a/Documentation/filesystems/btrfs.txt +++ b/Documentation/filesystems/btrfs.txt @@ -1,20 +1,10 @@ - BTRFS ===== -Btrfs is a copy on write filesystem for Linux aimed at -implementing advanced features while focusing on fault tolerance, -repair and easy administration. Initially developed by Oracle, Btrfs -is licensed under the GPL and open for contribution from anyone. - -Linux has a wealth of filesystems to choose from, but we are facing a -number of challenges with scaling to the large storage subsystems that -are becoming common in today's data centers. Filesystems need to scale -in their ability to address and manage large storage, and also in -their ability to detect, repair and tolerate errors in the data stored -on disk. Btrfs is under heavy development, and is not suitable for -any uses other than benchmarking and review. The Btrfs disk format is -not yet finalized. +Btrfs is a copy on write filesystem for Linux aimed at implementing advanced +features while focusing on fault tolerance, repair and easy administration. +Jointly developed by several companies, licensed under the GPL and open for +contribution from anyone. The main Btrfs features include: @@ -28,243 +18,14 @@ The main Btrfs features include: * Checksums on data and metadata (multiple algorithms available) * Compression * Integrated multiple device support, with several raid algorithms - * Online filesystem check (not yet implemented) - * Very fast offline filesystem check - * Efficient incremental backup and FS mirroring (not yet implemented) + * Offline filesystem check + * Efficient incremental backup and FS mirroring * Online filesystem defragmentation +For more information please refer to the wiki -Mount Options -============= - -When mounting a btrfs filesystem, the following option are accepted. -Options with (*) are default options and will not show in the mount options. - - alloc_start=<bytes> - Debugging option to force all block allocations above a certain - byte threshold on each block device. The value is specified in - bytes, optionally with a K, M, or G suffix, case insensitive. - Default is 1MB. - - noautodefrag(*) - autodefrag - Disable/enable auto defragmentation. - Auto defragmentation detects small random writes into files and queue - them up for the defrag process. Works best for small files; - Not well suited for large database workloads. - - check_int - check_int_data - check_int_print_mask=<value> - These debugging options control the behavior of the integrity checking - module (the BTRFS_FS_CHECK_INTEGRITY config option required). - - check_int enables the integrity checker module, which examines all - block write requests to ensure on-disk consistency, at a large - memory and CPU cost. - - check_int_data includes extent data in the integrity checks, and - implies the check_int option. - - check_int_print_mask takes a bitmask of BTRFSIC_PRINT_MASK_* values - as defined in fs/btrfs/check-integrity.c, to control the integrity - checker module behavior. - - See comments at the top of fs/btrfs/check-integrity.c for more info. - - commit=<seconds> - Set the interval of periodic commit, 30 seconds by default. Higher - values defer data being synced to permanent storage with obvious - consequences when the system crashes. The upper bound is not forced, - but a warning is printed if it's more than 300 seconds (5 minutes). - - compress - compress=<type> - compress-force - compress-force=<type> - Control BTRFS file data compression. Type may be specified as "zlib" - "lzo" or "no" (for no compression, used for remounting). If no type - is specified, zlib is used. If compress-force is specified, - all files will be compressed, whether or not they compress well. - If compression is enabled, nodatacow and nodatasum are disabled. - - degraded - Allow mounts to continue with missing devices. A read-write mount may - fail with too many devices missing, for example if a stripe member - is completely missing. - - device=<devicepath> - Specify a device during mount so that ioctls on the control device - can be avoided. Especially useful when trying to mount a multi-device - setup as root. May be specified multiple times for multiple devices. - - nodiscard(*) - discard - Disable/enable discard mount option. - Discard issues frequent commands to let the block device reclaim space - freed by the filesystem. - This is useful for SSD devices, thinly provisioned - LUNs and virtual machine images, but may have a significant - performance impact. (The fstrim command is also available to - initiate batch trims from userspace). - - noenospc_debug(*) - enospc_debug - Disable/enable debugging option to be more verbose in some ENOSPC conditions. - - fatal_errors=<action> - Action to take when encountering a fatal error: - "bug" - BUG() on a fatal error. This is the default. - "panic" - panic() on a fatal error. - - noflushoncommit(*) - flushoncommit - The 'flushoncommit' mount option forces any data dirtied by a write in a - prior transaction to commit as part of the current commit. This makes - the committed state a fully consistent view of the file system from the - application's perspective (i.e., it includes all completed file system - operations). This was previously the behavior only when a snapshot is - created. - - inode_cache - Enable free inode number caching. Defaults to off due to an overflow - problem when the free space crcs don't fit inside a single page. - - max_inline=<bytes> - Specify the maximum amount of space, in bytes, that can be inlined in - a metadata B-tree leaf. The value is specified in bytes, optionally - with a K, M, or G suffix, case insensitive. In practice, this value - is limited by the root sector size, with some space unavailable due - to leaf headers. For a 4k sector size, max inline data is ~3900 bytes. - - metadata_ratio=<value> - Specify that 1 metadata chunk should be allocated after every <value> - data chunks. Off by default. - - acl(*) - noacl - Enable/disable support for Posix Access Control Lists (ACLs). See the - acl(5) manual page for more information about ACLs. - - barrier(*) - nobarrier - Enable/disable the use of block layer write barriers. Write barriers - ensure that certain IOs make it through the device cache and are on - persistent storage. If disabled on a device with a volatile - (non-battery-backed) write-back cache, nobarrier option will lead to - filesystem corruption on a system crash or power loss. - - datacow(*) - nodatacow - Enable/disable data copy-on-write for newly created files. - Nodatacow implies nodatasum, and disables all compression. - - datasum(*) - nodatasum - Enable/disable data checksumming for newly created files. - Datasum implies datacow. - - treelog(*) - notreelog - Enable/disable the tree logging used for fsync and O_SYNC writes. - - recovery - Enable autorecovery attempts if a bad tree root is found at mount time. - Currently this scans a list of several previous tree roots and tries to - use the first readable. - - rescan_uuid_tree - Force check and rebuild procedure of the UUID tree. This should not - normally be needed. - - skip_balance - Skip automatic resume of interrupted balance operation after mount. - May be resumed with "btrfs balance resume." - - space_cache (*) - Enable the on-disk freespace cache. - nospace_cache - Disable freespace cache loading without clearing the cache. - clear_cache - Force clearing and rebuilding of the disk space cache if something - has gone wrong. - - ssd - nossd - ssd_spread - Options to control ssd allocation schemes. By default, BTRFS will - enable or disable ssd allocation heuristics depending on whether a - rotational or non-rotational disk is in use. The ssd and nossd options - can override this autodetection. - - The ssd_spread mount option attempts to allocate into big chunks - of unused space, and may perform better on low-end ssds. ssd_spread - implies ssd, enabling all other ssd heuristics as well. - - subvol=<path> - Mount subvolume at <path> rather than the root subvolume. <path> is - relative to the top level subvolume. - - subvolid=<ID> - Mount subvolume specified by an ID number rather than the root subvolume. - This allows mounting of subvolumes which are not in the root of the mounted - filesystem. - You can use "btrfs subvolume list" to see subvolume ID numbers. - - subvolrootid=<objectid> (deprecated) - Mount subvolume specified by <objectid> rather than the root subvolume. - This allows mounting of subvolumes which are not in the root of the mounted - filesystem. - You can use "btrfs subvolume show " to see the object ID for a subvolume. - - thread_pool=<number> - The number of worker threads to allocate. The default number is equal - to the number of CPUs + 2, or 8, whichever is smaller. - - user_subvol_rm_allowed - Allow subvolumes to be deleted by a non-root user. Use with caution. - -MAILING LIST -============ - -There is a Btrfs mailing list hosted on vger.kernel.org. You can -find details on how to subscribe here: - -http://vger.kernel.org/vger-lists.html#linux-btrfs - -Mailing list archives are available from gmane: - -http://dir.gmane.org/gmane.comp.file-systems.btrfs - - - -IRC -=== - -Discussion of Btrfs also occurs on the #btrfs channel of the Freenode -IRC network. - - - - UTILITIES - ========= - -Userspace tools for creating and manipulating Btrfs file systems are -available from the git repository at the following location: - - http://git.kernel.org/?p=linux/kernel/git/mason/btrfs-progs.git - git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-progs.git - -These include the following tools: - -* mkfs.btrfs: create a filesystem - -* btrfs: a single tool to manage the filesystems, refer to the manpage for more details - -* 'btrfsck' or 'btrfs check': do a consistency check of the filesystem - -Other tools for specific tasks: - -* btrfs-convert: in-place conversion from ext2/3/4 filesystems + https://btrfs.wiki.kernel.org -* btrfs-image: dump filesystem metadata for debugging +that maintains information about administration tasks, frequently asked +questions, use cases, mount options, comprehensible changelogs, features, +manual pages, source code repositories, contacts etc. diff --git a/Documentation/filesystems/cifs/README b/Documentation/filesystems/cifs/README index 2d5622f60..a54788405 100644 --- a/Documentation/filesystems/cifs/README +++ b/Documentation/filesystems/cifs/README @@ -272,7 +272,7 @@ A partial list of the supported mount options follows: same domain (e.g. running winbind or nss_ldap) and the server supports the Unix Extensions then the uid and gid can be retrieved from the server (and uid - and gid would not have to be specifed on the mount. + and gid would not have to be specified on the mount. For servers which do not support the CIFS Unix extensions, the default uid (and gid) returned on lookup of existing files will be the uid (gid) of the person diff --git a/Documentation/filesystems/configfs/configfs.txt b/Documentation/filesystems/configfs/configfs.txt index e5fe521ee..8ec9136aa 100644 --- a/Documentation/filesystems/configfs/configfs.txt +++ b/Documentation/filesystems/configfs/configfs.txt @@ -250,7 +250,8 @@ child item. struct config_item cg_item; struct list_head cg_children; struct configfs_subsystem *cg_subsys; - struct config_group **default_groups; + struct list_head default_groups; + struct list_head group_entry; }; void config_group_init(struct config_group *group); @@ -420,15 +421,15 @@ These automatic subgroups, or default groups, do not preclude other children of the parent group. If ct_group_ops->make_group() exists, other child groups can be created on the parent group directly. -A configfs subsystem specifies default groups by filling in the -NULL-terminated array default_groups on the config_group structure. -Each group in that array is populated in the configfs tree at the same +A configfs subsystem specifies default groups by adding them using the +configfs_add_default_group() function to the parent config_group +structure. Each added group is populated in the configfs tree at the same time as the parent group. Similarly, they are removed at the same time as the parent. No extra notification is provided. When a ->drop_item() method call notifies the subsystem the parent group is going away, it also means every default group child associated with that parent group. -As a consequence of this, default_groups cannot be removed directly via +As a consequence of this, default groups cannot be removed directly via rmdir(2). They also are not considered when rmdir(2) on the parent group is checking for children. diff --git a/Documentation/filesystems/cramfs.txt b/Documentation/filesystems/cramfs.txt index 31f53f0ab..4006298f6 100644 --- a/Documentation/filesystems/cramfs.txt +++ b/Documentation/filesystems/cramfs.txt @@ -38,7 +38,7 @@ the update lasts only as long as the inode is cached in memory, after which the timestamp reverts to 1970, i.e. moves backwards in time. Currently, cramfs must be written and read with architectures of the -same endianness, and can be read only by kernels with PAGE_CACHE_SIZE +same endianness, and can be read only by kernels with PAGE_SIZE == 4096. At least the latter of these is a bug, but it hasn't been decided what the best fix is. For the moment if you have larger pages you can just change the #define in mkcramfs.c, so long as you don't diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt index 7bde64014..0c16a2252 100644 --- a/Documentation/filesystems/dax.txt +++ b/Documentation/filesystems/dax.txt @@ -49,6 +49,7 @@ These block devices may be used for inspiration: - axonram: Axon DDR2 device driver - brd: RAM backed block device driver - dcssblk: s390 dcss block device driver +- pmem: NVDIMM persistent memory driver Implementation Tips for Filesystem Writers @@ -75,8 +76,41 @@ calls to get_block() (for example by a page-fault racing with a read() or a write()) work correctly. These filesystems may be used for inspiration: -- ext2: the second extended filesystem, see Documentation/filesystems/ext2.txt -- ext4: the fourth extended filesystem, see Documentation/filesystems/ext4.txt +- ext2: see Documentation/filesystems/ext2.txt +- ext4: see Documentation/filesystems/ext4.txt +- xfs: see Documentation/filesystems/xfs.txt + + +Handling Media Errors +--------------------- + +The libnvdimm subsystem stores a record of known media error locations for +each pmem block device (in gendisk->badblocks). If we fault at such location, +or one with a latent error not yet discovered, the application can expect +to receive a SIGBUS. Libnvdimm also allows clearing of these errors by simply +writing the affected sectors (through the pmem driver, and if the underlying +NVDIMM supports the clear_poison DSM defined by ACPI). + +Since DAX IO normally doesn't go through the driver/bio path, applications or +sysadmins have an option to restore the lost data from a prior backup/inbuilt +redundancy in the following ways: + +1. Delete the affected file, and restore from a backup (sysadmin route): + This will free the file system blocks that were being used by the file, + and the next time they're allocated, they will be zeroed first, which + happens through the driver, and will clear bad sectors. + +2. Truncate or hole-punch the part of the file that has a bad-block (at least + an entire aligned sector has to be hole-punched, but not necessarily an + entire filesystem block). + +These are the two basic paths that allow DAX filesystems to continue operating +in the presence of media errors. More robust error recovery mechanisms can be +built on top of this in the future, for example, involving redundancy/mirroring +provided at the block layer through DM, or additionally, at the filesystem +level. These would have to rely on the above two tenets, that error clearing +can happen either by sending an IO through the driver, or zeroing (also through +the driver). Shortcomings diff --git a/Documentation/filesystems/devpts.txt b/Documentation/filesystems/devpts.txt index 68dffd87f..9f94fe276 100644 --- a/Documentation/filesystems/devpts.txt +++ b/Documentation/filesystems/devpts.txt @@ -1,132 +1,26 @@ - -To support containers, we now allow multiple instances of devpts filesystem, -such that indices of ptys allocated in one instance are independent of indices -allocated in other instances of devpts. - -To preserve backward compatibility, this support for multiple instances is -enabled only if: - - - CONFIG_DEVPTS_MULTIPLE_INSTANCES=y, and - - '-o newinstance' mount option is specified while mounting devpts - -IOW, devpts now supports both single-instance and multi-instance semantics. - -If CONFIG_DEVPTS_MULTIPLE_INSTANCES=n, there is no change in behavior and -this referred to as the "legacy" mode. In this mode, the new mount options -(-o newinstance and -o ptmxmode) will be ignored with a 'bogus option' message -on console. - -If CONFIG_DEVPTS_MULTIPLE_INSTANCES=y and devpts is mounted without the -'newinstance' option (as in current start-up scripts) the new mount binds -to the initial kernel mount of devpts. This mode is referred to as the -'single-instance' mode and the current, single-instance semantics are -preserved, i.e PTYs are common across the system. - -The only difference between this single-instance mode and the legacy mode -is the presence of new, '/dev/pts/ptmx' node with permissions 0000, which -can safely be ignored. - -If CONFIG_DEVPTS_MULTIPLE_INSTANCES=y and 'newinstance' option is specified, -the mount is considered to be in the multi-instance mode and a new instance -of the devpts fs is created. Any ptys created in this instance are independent -of ptys in other instances of devpts. Like in the single-instance mode, the -/dev/pts/ptmx node is present. To effectively use the multi-instance mode, -open of /dev/ptmx must be a redirected to '/dev/pts/ptmx' using a symlink or -bind-mount. - -Eg: A container startup script could do the following: - - $ chmod 0666 /dev/pts/ptmx - $ rm /dev/ptmx - $ ln -s pts/ptmx /dev/ptmx - $ ns_exec -cm /bin/bash - - # We are now in new container - - $ umount /dev/pts - $ mount -t devpts -o newinstance lxcpts /dev/pts - $ sshd -p 1234 - -where 'ns_exec -cm /bin/bash' calls clone() with CLONE_NEWNS flag and execs -/bin/bash in the child process. A pty created by the sshd is not visible in -the original mount of /dev/pts. - -User-space changes ------------------- - -In multi-instance mode (i.e '-o newinstance' mount option is specified at least -once), following user-space issues should be noted. - -1. If -o newinstance mount option is never used, /dev/pts/ptmx can be ignored - and no change is needed to system-startup scripts. - -2. To effectively use multi-instance mode (i.e -o newinstance is specified) - administrators or startup scripts should "redirect" open of /dev/ptmx to - /dev/pts/ptmx using either a bind mount or symlink. - - $ mount -t devpts -o newinstance devpts /dev/pts - - followed by either - - $ rm /dev/ptmx - $ ln -s pts/ptmx /dev/ptmx - $ chmod 666 /dev/pts/ptmx - or - $ mount -o bind /dev/pts/ptmx /dev/ptmx - -3. The '/dev/ptmx -> pts/ptmx' symlink is the preferred method since it - enables better error-reporting and treats both single-instance and - multi-instance mounts similarly. - - But this method requires that system-startup scripts set the mode of - /dev/pts/ptmx correctly (default mode is 0000). The scripts can set the - mode by, either - - - adding ptmxmode mount option to devpts entry in /etc/fstab, or - - using 'chmod 0666 /dev/pts/ptmx' - -4. If multi-instance mode mount is needed for containers, but the system - startup scripts have not yet been updated, container-startup scripts - should bind mount /dev/ptmx to /dev/pts/ptmx to avoid breaking single- - instance mounts. - - Or, in general, container-startup scripts should use: - - mount -t devpts -o newinstance -o ptmxmode=0666 devpts /dev/pts - if [ ! -L /dev/ptmx ]; then - mount -o bind /dev/pts/ptmx /dev/ptmx - fi - - When all devpts mounts are multi-instance, /dev/ptmx can permanently be - a symlink to pts/ptmx and the bind mount can be ignored. - -5. A multi-instance mount that is not accompanied by the /dev/ptmx to - /dev/pts/ptmx redirection would result in an unusable/unreachable pty. - - mount -t devpts -o newinstance lxcpts /dev/pts - - immediately followed by: - - open("/dev/ptmx") - - would create a pty, say /dev/pts/7, in the initial kernel mount. - But /dev/pts/7 would be invisible in the new mount. - -6. The permissions for /dev/pts/ptmx node should be specified when mounting - /dev/pts, using the '-o ptmxmode=%o' mount option (default is 0000). - - mount -t devpts -o newinstance -o ptmxmode=0644 devpts /dev/pts - - The permissions can be later be changed as usual with 'chmod'. - - chmod 666 /dev/pts/ptmx - -7. A mount of devpts without the 'newinstance' option results in binding to - initial kernel mount. This behavior while preserving legacy semantics, - does not provide strict isolation in a container environment. i.e by - mounting devpts without the 'newinstance' option, a container could - get visibility into the 'host' or root container's devpts. - - To workaround this and have strict isolation, all mounts of devpts, - including the mount in the root container, should use the newinstance - option. +Each mount of the devpts filesystem is now distinct such that ptys +and their indicies allocated in one mount are independent from ptys +and their indicies in all other mounts. + +All mounts of the devpts filesystem now create a /dev/pts/ptmx node +with permissions 0000. + +To retain backwards compatibility the a ptmx device node (aka any node +created with "mknod name c 5 2") when opened will look for an instance +of devpts under the name "pts" in the same directory as the ptmx device +node. + +As an option instead of placing a /dev/ptmx device node at /dev/ptmx +it is possible to place a symlink to /dev/pts/ptmx at /dev/ptmx or +to bind mount /dev/ptx/ptmx to /dev/ptmx. If you opt for using +the devpts filesystem in this manner devpts should be mounted with +the ptmxmode=0666, or chmod 0666 /dev/pts/ptmx should be called. + +Total count of pty pairs in all instances is limited by sysctls: +kernel.pty.max = 4096 - global limit +kernel.pty.reserve = 1024 - reserved for filesystems mounted from the initial mount namespace +kernel.pty.nr - current count of ptys + +Per-instance limit could be set by adding mount option "max=<count>". +This feature was added in kernel 3.4 together with sysctl kernel.pty.reserve. +In kernels older than 3.4 sysctl kernel.pty.max works as per-instance limit. diff --git a/Documentation/filesystems/directory-locking b/Documentation/filesystems/directory-locking index 09bbf9a54..c314badbc 100644 --- a/Documentation/filesystems/directory-locking +++ b/Documentation/filesystems/directory-locking @@ -1,30 +1,37 @@ Locking scheme used for directory operations is based on two -kinds of locks - per-inode (->i_mutex) and per-filesystem +kinds of locks - per-inode (->i_rwsem) and per-filesystem (->s_vfs_rename_mutex). - When taking the i_mutex on multiple non-directory objects, we + When taking the i_rwsem on multiple non-directory objects, we always acquire the locks in order by increasing address. We'll call that "inode pointer" order in the following. For our purposes all operations fall in 5 classes: 1) read access. Locking rules: caller locks directory we are accessing. +The lock is taken shared. -2) object creation. Locking rules: same as above. +2) object creation. Locking rules: same as above, but the lock is taken +exclusive. 3) object removal. Locking rules: caller locks parent, finds victim, -locks victim and calls the method. +locks victim and calls the method. Locks are exclusive. 4) rename() that is _not_ cross-directory. Locking rules: caller locks -the parent and finds source and target. If target already exists, lock -it. If source is a non-directory, lock it. If that means we need to -lock both, lock them in inode pointer order. +the parent and finds source and target. In case of exchange (with +RENAME_EXCHANGE in rename2() flags argument) lock both. In any case, +if the target already exists, lock it. If the source is a non-directory, +lock it. If we need to lock both, lock them in inode pointer order. +Then call the method. All locks are exclusive. +NB: we might get away with locking the the source (and target in exchange +case) shared. 5) link creation. Locking rules: * lock parent * check that source is not a directory * lock source * call the method. +All locks are exclusive. 6) cross-directory rename. The trickiest in the whole bunch. Locking rules: @@ -35,11 +42,12 @@ rules: fail with -ENOTEMPTY * if new parent is equal to or is a descendent of source fail with -ELOOP - * If target exists, lock it. If source is a non-directory, lock - it. In case that means we need to lock both source and target, - do so in inode pointer order. + * If it's an exchange, lock both the source and the target. + * If the target exists, lock it. If the source is a non-directory, + lock it. If we need to lock both, do so in inode pointer order. * call the method. - +All ->i_rwsem are taken exclusive. Again, we might get away with locking +the the source (and target in exchange case) shared. The rules above obviously guarantee that all directories that are going to be read, modified or removed by method will be locked by caller. @@ -73,7 +81,7 @@ objects - A < B iff A is an ancestor of B. attempt to acquire some lock and already holds at least one lock. Let's consider the set of contended locks. First of all, filesystem lock is not contended, since any process blocked on it is not holding any locks. -Thus all processes are blocked on ->i_mutex. +Thus all processes are blocked on ->i_rwsem. By (3), any process holding a non-directory lock can only be waiting on another non-directory lock with a larger address. Therefore diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index e1c9f0849..ecd808088 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -109,7 +109,9 @@ background_gc=%s Turn on/off cleaning operations, namely garbage disable_roll_forward Disable the roll-forward recovery routine norecovery Disable the roll-forward recovery routine, mounted read- only (i.e., -o ro,disable_roll_forward) -discard Issue discard/TRIM commands when a segment is cleaned. +discard/nodiscard Enable/disable real-time discard in f2fs, if discard is + enabled, f2fs will issue discard/TRIM commands when a + segment is cleaned. no_heap Disable heap-style segment allocation which finds free segments for data from the beginning of main area, while for node from the end of main area. @@ -151,6 +153,9 @@ noinline_data Disable the inline data feature, inline data feature is enabled by default. data_flush Enable data flushing before checkpoint in order to persist data of regular and symlink. +mode=%s Control block allocation mode which supports "adaptive" + and "lfs". In "lfs" mode, there should be no random + writes towards main area. ================================================================================ DEBUGFS ENTRIES diff --git a/Documentation/filesystems/nfs/fault_injection.txt b/Documentation/filesystems/nfs/fault_injection.txt index 426d16608..f3a5b0a8a 100644 --- a/Documentation/filesystems/nfs/fault_injection.txt +++ b/Documentation/filesystems/nfs/fault_injection.txt @@ -49,13 +49,13 @@ forget_locks: forget_delegations: A delegation is used to assure the client that a file, or part of a file, has not changed since the delegation was awarded. Clearing this list will - force the client to reaquire its delegation before accessing the file + force the client to reacquire its delegation before accessing the file again. recall_delegations: Delegations can be recalled by the server when another client attempts to access a file. This test will notify the client that its delegation has - been revoked, forcing the client to reaquire the delegation before using + been revoked, forcing the client to reacquire the delegation before using the file again. diff --git a/Documentation/filesystems/nfs/nfs-rdma.txt b/Documentation/filesystems/nfs/nfs-rdma.txt index 906b6c233..1e6564545 100644 --- a/Documentation/filesystems/nfs/nfs-rdma.txt +++ b/Documentation/filesystems/nfs/nfs-rdma.txt @@ -218,7 +218,7 @@ NFS/RDMA Setup /vol0 192.168.0.0/255.255.255.0(fsid=0,rw,async,insecure,no_root_squash) The IP address(es) is(are) the client's IPoIB address for an InfiniBand - HCA or the cleint's iWARP address(es) for an RNIC. + HCA or the client's iWARP address(es) for an RNIC. NOTE: The "insecure" option must be used because the NFS/RDMA client does not use a reserved port. diff --git a/Documentation/filesystems/nfs/nfsroot.txt b/Documentation/filesystems/nfs/nfsroot.txt index bb5ab6de5..0b2883b17 100644 --- a/Documentation/filesystems/nfs/nfsroot.txt +++ b/Documentation/filesystems/nfs/nfsroot.txt @@ -166,7 +166,7 @@ ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>: Value gets exported by /proc/net/pnp which is often linked on embedded systems by /etc/resolv.conf. - <dns1-ip> IP address of secound nameserver. + <dns1-ip> IP address of second nameserver. Same as above. diff --git a/Documentation/filesystems/nfs/pnfs-scsi-server.txt b/Documentation/filesystems/nfs/pnfs-scsi-server.txt new file mode 100644 index 000000000..5bef7268b --- /dev/null +++ b/Documentation/filesystems/nfs/pnfs-scsi-server.txt @@ -0,0 +1,23 @@ + +pNFS SCSI layout server user guide +================================== + +This document describes support for pNFS SCSI layouts in the Linux NFS server. +With pNFS SCSI layouts, the NFS server acts as Metadata Server (MDS) for pNFS, +which in addition to handling all the metadata access to the NFS export, +also hands out layouts to the clients so that they can directly access the +underlying SCSI LUNs that are shared with the client. + +To use pNFS SCSI layouts with with the Linux NFS server, the exported file +system needs to support the pNFS SCSI layouts (currently just XFS), and the +file system must sit on a SCSI LUN that is accessible to the clients in +addition to the MDS. As of now the file system needs to sit directly on the +exported LUN, striping or concatenation of LUNs on the MDS and clients +is not supported yet. + +On a server built with CONFIG_NFSD_SCSI, the pNFS SCSI volume support is +automatically enabled if the file system is exported using the "pnfs" +option and the underlying SCSI device support persistent reservations. +On the client make sure the kernel has the CONFIG_PNFS_BLOCK option +enabled, and the file system is mounted using the NFSv4.1 protocol +version (mount -o vers=4.1). diff --git a/Documentation/filesystems/nfs/pnfs.txt b/Documentation/filesystems/nfs/pnfs.txt index 44a9f2493..8de578a98 100644 --- a/Documentation/filesystems/nfs/pnfs.txt +++ b/Documentation/filesystems/nfs/pnfs.txt @@ -64,8 +64,8 @@ table which are called by the nfs-client pnfs-core to implement the different layout types. Files-layout-driver code is in: fs/nfs/filelayout/.. directory -Objects-layout-deriver code is in: fs/nfs/objlayout/.. directory -Blocks-layout-deriver code is in: fs/nfs/blocklayout/.. directory +Objects-layout-driver code is in: fs/nfs/objlayout/.. directory +Blocks-layout-driver code is in: fs/nfs/blocklayout/.. directory Flexfiles-layout-driver code is in: fs/nfs/flexfilelayout/.. directory objects-layout setup @@ -91,7 +91,7 @@ The API to the login script is as follows: Usage: $0 -u <URI> -o <OSDNAME> -s <SYSTEMID> Options: -u target uri e.g. iscsi://<ip>:<port> - (allways exists) + (always exists) (More protocols can be defined in the future. The client does not interpret this string it is passed unchanged as received from the Server) diff --git a/Documentation/filesystems/nfs/rpc-server-gss.txt b/Documentation/filesystems/nfs/rpc-server-gss.txt index 716f4be8e..310bbbaf9 100644 --- a/Documentation/filesystems/nfs/rpc-server-gss.txt +++ b/Documentation/filesystems/nfs/rpc-server-gss.txt @@ -57,7 +57,7 @@ the Kerberos tickets, that needs to be sent through the GSS layer in order to perform context establishment. B) It does not properly handle creds where the user is member of more -than a few housand groups (the current hard limit in the kernel is 65K +than a few thousand groups (the current hard limit in the kernel is 65K groups) due to limitation on the size of the buffer that can be send back to the kernel (4KiB). diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt index 41c3d332a..c0727dc36 100644 --- a/Documentation/filesystems/nilfs2.txt +++ b/Documentation/filesystems/nilfs2.txt @@ -267,4 +267,10 @@ among NILFS2 files can be depicted as follows: `-- file (ino=yy) ( regular file, directory, or symlink ) -For detail on the format of each file, please see include/linux/nilfs2_fs.h. +For detail on the format of each file, please see nilfs2_ondisk.h +located at include/uapi/linux directory. + +There are no patents or other intellectual property that we protect +with regard to the design of NILFS2. It is allowed to replicate the +design in hopes that other operating systems could share (mount, read, +write, etc.) data stored in this format. diff --git a/Documentation/filesystems/ocfs2-online-filecheck.txt b/Documentation/filesystems/ocfs2-online-filecheck.txt new file mode 100644 index 000000000..139fab175 --- /dev/null +++ b/Documentation/filesystems/ocfs2-online-filecheck.txt @@ -0,0 +1,94 @@ + OCFS2 online file check + ----------------------- + +This document will describe OCFS2 online file check feature. + +Introduction +============ +OCFS2 is often used in high-availability systems. However, OCFS2 usually +converts the filesystem to read-only when encounters an error. This may not be +necessary, since turning the filesystem read-only would affect other running +processes as well, decreasing availability. +Then, a mount option (errors=continue) is introduced, which would return the +-EIO errno to the calling process and terminate further processing so that the +filesystem is not corrupted further. The filesystem is not converted to +read-only, and the problematic file's inode number is reported in the kernel +log. The user can try to check/fix this file via online filecheck feature. + +Scope +===== +This effort is to check/fix small issues which may hinder day-to-day operations +of a cluster filesystem by turning the filesystem read-only. The scope of +checking/fixing is at the file level, initially for regular files and eventually +to all files (including system files) of the filesystem. + +In case of directory to file links is incorrect, the directory inode is +reported as erroneous. + +This feature is not suited for extravagant checks which involve dependency of +other components of the filesystem, such as but not limited to, checking if the +bits for file blocks in the allocation has been set. In case of such an error, +the offline fsck should/would be recommended. + +Finally, such an operation/feature should not be automated lest the filesystem +may end up with more damage than before the repair attempt. So, this has to +be performed using user interaction and consent. + +User interface +============== +When there are errors in the OCFS2 filesystem, they are usually accompanied +by the inode number which caused the error. This inode number would be the +input to check/fix the file. + +There is a sysfs directory for each OCFS2 file system mounting: + + /sys/fs/ocfs2/<devname>/filecheck + +Here, <devname> indicates the name of OCFS2 volume device which has been already +mounted. The file above would accept inode numbers. This could be used to +communicate with kernel space, tell which file(inode number) will be checked or +fixed. Currently, three operations are supported, which includes checking +inode, fixing inode and setting the size of result record history. + +1. If you want to know what error exactly happened to <inode> before fixing, do + + # echo "<inode>" > /sys/fs/ocfs2/<devname>/filecheck/check + # cat /sys/fs/ocfs2/<devname>/filecheck/check + +The output is like this: + INO DONE ERROR +39502 1 GENERATION + +<INO> lists the inode numbers. +<DONE> indicates whether the operation has been finished. +<ERROR> says what kind of errors was found. For the detailed error numbers, +please refer to the file linux/fs/ocfs2/filecheck.h. + +2. If you determine to fix this inode, do + + # echo "<inode>" > /sys/fs/ocfs2/<devname>/filecheck/fix + # cat /sys/fs/ocfs2/<devname>/filecheck/fix + +The output is like this: + INO DONE ERROR +39502 1 SUCCESS + +This time, the <ERROR> column indicates whether this fix is successful or not. + +3. The record cache is used to store the history of check/fix results. It's +default size is 10, and can be adjust between the range of 10 ~ 100. You can +adjust the size like this: + + # echo "<size>" > /sys/fs/ocfs2/<devname>/filecheck/set + +Fixing stuff +============ +On receiving the inode, the filesystem would read the inode and the +file metadata. In case of errors, the filesystem would fix the errors +and report the problems it fixed in the kernel log. As a precautionary measure, +the inode must first be checked for errors before performing a final fix. + +The inode and the result history will be maintained temporarily in a +small linked list buffer which would contain the last (N) inodes +fixed/checked, the detailed errors which were fixed/checked are printed in the +kernel log. diff --git a/Documentation/filesystems/orangefs.txt b/Documentation/filesystems/orangefs.txt new file mode 100644 index 000000000..1dfdec790 --- /dev/null +++ b/Documentation/filesystems/orangefs.txt @@ -0,0 +1,448 @@ +ORANGEFS +======== + +OrangeFS is an LGPL userspace scale-out parallel storage system. It is ideal +for large storage problems faced by HPC, BigData, Streaming Video, +Genomics, Bioinformatics. + +Orangefs, originally called PVFS, was first developed in 1993 by +Walt Ligon and Eric Blumer as a parallel file system for Parallel +Virtual Machine (PVM) as part of a NASA grant to study the I/O patterns +of parallel programs. + +Orangefs features include: + + * Distributes file data among multiple file servers + * Supports simultaneous access by multiple clients + * Stores file data and metadata on servers using local file system + and access methods + * Userspace implementation is easy to install and maintain + * Direct MPI support + * Stateless + + +MAILING LIST +============ + +http://beowulf-underground.org/mailman/listinfo/pvfs2-users + + +DOCUMENTATION +============= + +http://www.orangefs.org/documentation/ + + +USERSPACE FILESYSTEM SOURCE +=========================== + +http://www.orangefs.org/download + +Orangefs versions prior to 2.9.3 would not be compatible with the +upstream version of the kernel client. + + +BUILDING THE USERSPACE FILESYSTEM ON A SINGLE SERVER +==================================================== + +When Orangefs is upstream, "--with-kernel" shouldn't be needed, but +until then the path to where the kernel with the Orangefs kernel client +patch was built is needed to ensure that pvfs2-client-core (the bridge +between kernel space and user space) will build properly. You can omit +--prefix if you don't care that things are sprinkled around in +/usr/local. + +./configure --prefix=/opt/ofs --with-kernel=/path/to/orangefs/kernel + +make + +make install + +Create an orangefs config file: +/opt/ofs/bin/pvfs2-genconfig /etc/pvfs2.conf + + for "Enter hostnames", use the hostname, don't let it default to + localhost. + +create a pvfs2tab file in /etc: +cat /etc/pvfs2tab +tcp://myhostname:3334/orangefs /mymountpoint pvfs2 defaults,noauto 0 0 + +create the mount point you specified in the tab file if needed: +mkdir /mymountpoint + +bootstrap the server: +/opt/ofs/sbin/pvfs2-server /etc/pvfs2.conf -f + +start the server: +/opt/osf/sbin/pvfs2-server /etc/pvfs2.conf + +Now the server is running. At this point you might like to +prove things are working with: + +/opt/osf/bin/pvfs2-ls /mymountpoint + +You might not want to enforce selinux, it doesn't seem to matter by +linux 3.11... + +If stuff seems to be working, turn on the client core: +/opt/osf/sbin/pvfs2-client -p /opt/osf/sbin/pvfs2-client-core + +Mount your filesystem. +mount -t pvfs2 tcp://myhostname:3334/orangefs /mymountpoint + + +OPTIONS +======= + +The following mount options are accepted: + + acl + Allow the use of Access Control Lists on files and directories. + + intr + Some operations between the kernel client and the user space + filesystem can be interruptible, such as changes in debug levels + and the setting of tunable parameters. + + local_lock + Enable posix locking from the perspective of "this" kernel. The + default file_operations lock action is to return ENOSYS. Posix + locking kicks in if the filesystem is mounted with -o local_lock. + Distributed locking is being worked on for the future. + + +DEBUGGING +========= + +If you want the debug (GOSSIP) statements in a particular +source file (inode.c for example) go to syslog: + + echo inode > /sys/kernel/debug/orangefs/kernel-debug + +No debugging (the default): + + echo none > /sys/kernel/debug/orangefs/kernel-debug + +Debugging from several source files: + + echo inode,dir > /sys/kernel/debug/orangefs/kernel-debug + +All debugging: + + echo all > /sys/kernel/debug/orangefs/kernel-debug + +Get a list of all debugging keywords: + + cat /sys/kernel/debug/orangefs/debug-help + + +PROTOCOL BETWEEN KERNEL MODULE AND USERSPACE +============================================ + +Orangefs is a user space filesystem and an associated kernel module. +We'll just refer to the user space part of Orangefs as "userspace" +from here on out. Orangefs descends from PVFS, and userspace code +still uses PVFS for function and variable names. Userspace typedefs +many of the important structures. Function and variable names in +the kernel module have been transitioned to "orangefs", and The Linux +Coding Style avoids typedefs, so kernel module structures that +correspond to userspace structures are not typedefed. + +The kernel module implements a pseudo device that userspace +can read from and write to. Userspace can also manipulate the +kernel module through the pseudo device with ioctl. + +THE BUFMAP: + +At startup userspace allocates two page-size-aligned (posix_memalign) +mlocked memory buffers, one is used for IO and one is used for readdir +operations. The IO buffer is 41943040 bytes and the readdir buffer is +4194304 bytes. Each buffer contains logical chunks, or partitions, and +a pointer to each buffer is added to its own PVFS_dev_map_desc structure +which also describes its total size, as well as the size and number of +the partitions. + +A pointer to the IO buffer's PVFS_dev_map_desc structure is sent to a +mapping routine in the kernel module with an ioctl. The structure is +copied from user space to kernel space with copy_from_user and is used +to initialize the kernel module's "bufmap" (struct orangefs_bufmap), which +then contains: + + * refcnt - a reference counter + * desc_size - PVFS2_BUFMAP_DEFAULT_DESC_SIZE (4194304) - the IO buffer's + partition size, which represents the filesystem's block size and + is used for s_blocksize in super blocks. + * desc_count - PVFS2_BUFMAP_DEFAULT_DESC_COUNT (10) - the number of + partitions in the IO buffer. + * desc_shift - log2(desc_size), used for s_blocksize_bits in super blocks. + * total_size - the total size of the IO buffer. + * page_count - the number of 4096 byte pages in the IO buffer. + * page_array - a pointer to page_count * (sizeof(struct page*)) bytes + of kcalloced memory. This memory is used as an array of pointers + to each of the pages in the IO buffer through a call to get_user_pages. + * desc_array - a pointer to desc_count * (sizeof(struct orangefs_bufmap_desc)) + bytes of kcalloced memory. This memory is further intialized: + + user_desc is the kernel's copy of the IO buffer's ORANGEFS_dev_map_desc + structure. user_desc->ptr points to the IO buffer. + + pages_per_desc = bufmap->desc_size / PAGE_SIZE + offset = 0 + + bufmap->desc_array[0].page_array = &bufmap->page_array[offset] + bufmap->desc_array[0].array_count = pages_per_desc = 1024 + bufmap->desc_array[0].uaddr = (user_desc->ptr) + (0 * 1024 * 4096) + offset += 1024 + . + . + . + bufmap->desc_array[9].page_array = &bufmap->page_array[offset] + bufmap->desc_array[9].array_count = pages_per_desc = 1024 + bufmap->desc_array[9].uaddr = (user_desc->ptr) + + (9 * 1024 * 4096) + offset += 1024 + + * buffer_index_array - a desc_count sized array of ints, used to + indicate which of the IO buffer's partitions are available to use. + * buffer_index_lock - a spinlock to protect buffer_index_array during update. + * readdir_index_array - a five (ORANGEFS_READDIR_DEFAULT_DESC_COUNT) element + int array used to indicate which of the readdir buffer's partitions are + available to use. + * readdir_index_lock - a spinlock to protect readdir_index_array during + update. + +OPERATIONS: + +The kernel module builds an "op" (struct orangefs_kernel_op_s) when it +needs to communicate with userspace. Part of the op contains the "upcall" +which expresses the request to userspace. Part of the op eventually +contains the "downcall" which expresses the results of the request. + +The slab allocator is used to keep a cache of op structures handy. + +At init time the kernel module defines and initializes a request list +and an in_progress hash table to keep track of all the ops that are +in flight at any given time. + +Ops are stateful: + + * unknown - op was just initialized + * waiting - op is on request_list (upward bound) + * inprogr - op is in progress (waiting for downcall) + * serviced - op has matching downcall; ok + * purged - op has to start a timer since client-core + exited uncleanly before servicing op + * given up - submitter has given up waiting for it + +When some arbitrary userspace program needs to perform a +filesystem operation on Orangefs (readdir, I/O, create, whatever) +an op structure is initialized and tagged with a distinguishing ID +number. The upcall part of the op is filled out, and the op is +passed to the "service_operation" function. + +Service_operation changes the op's state to "waiting", puts +it on the request list, and signals the Orangefs file_operations.poll +function through a wait queue. Userspace is polling the pseudo-device +and thus becomes aware of the upcall request that needs to be read. + +When the Orangefs file_operations.read function is triggered, the +request list is searched for an op that seems ready-to-process. +The op is removed from the request list. The tag from the op and +the filled-out upcall struct are copy_to_user'ed back to userspace. + +If any of these (and some additional protocol) copy_to_users fail, +the op's state is set to "waiting" and the op is added back to +the request list. Otherwise, the op's state is changed to "in progress", +and the op is hashed on its tag and put onto the end of a list in the +in_progress hash table at the index the tag hashed to. + +When userspace has assembled the response to the upcall, it +writes the response, which includes the distinguishing tag, back to +the pseudo device in a series of io_vecs. This triggers the Orangefs +file_operations.write_iter function to find the op with the associated +tag and remove it from the in_progress hash table. As long as the op's +state is not "canceled" or "given up", its state is set to "serviced". +The file_operations.write_iter function returns to the waiting vfs, +and back to service_operation through wait_for_matching_downcall. + +Service operation returns to its caller with the op's downcall +part (the response to the upcall) filled out. + +The "client-core" is the bridge between the kernel module and +userspace. The client-core is a daemon. The client-core has an +associated watchdog daemon. If the client-core is ever signaled +to die, the watchdog daemon restarts the client-core. Even though +the client-core is restarted "right away", there is a period of +time during such an event that the client-core is dead. A dead client-core +can't be triggered by the Orangefs file_operations.poll function. +Ops that pass through service_operation during a "dead spell" can timeout +on the wait queue and one attempt is made to recycle them. Obviously, +if the client-core stays dead too long, the arbitrary userspace processes +trying to use Orangefs will be negatively affected. Waiting ops +that can't be serviced will be removed from the request list and +have their states set to "given up". In-progress ops that can't +be serviced will be removed from the in_progress hash table and +have their states set to "given up". + +Readdir and I/O ops are atypical with respect to their payloads. + + - readdir ops use the smaller of the two pre-allocated pre-partitioned + memory buffers. The readdir buffer is only available to userspace. + The kernel module obtains an index to a free partition before launching + a readdir op. Userspace deposits the results into the indexed partition + and then writes them to back to the pvfs device. + + - io (read and write) ops use the larger of the two pre-allocated + pre-partitioned memory buffers. The IO buffer is accessible from + both userspace and the kernel module. The kernel module obtains an + index to a free partition before launching an io op. The kernel module + deposits write data into the indexed partition, to be consumed + directly by userspace. Userspace deposits the results of read + requests into the indexed partition, to be consumed directly + by the kernel module. + +Responses to kernel requests are all packaged in pvfs2_downcall_t +structs. Besides a few other members, pvfs2_downcall_t contains a +union of structs, each of which is associated with a particular +response type. + +The several members outside of the union are: + - int32_t type - type of operation. + - int32_t status - return code for the operation. + - int64_t trailer_size - 0 unless readdir operation. + - char *trailer_buf - initialized to NULL, used during readdir operations. + +The appropriate member inside the union is filled out for any +particular response. + + PVFS2_VFS_OP_FILE_IO + fill a pvfs2_io_response_t + + PVFS2_VFS_OP_LOOKUP + fill a PVFS_object_kref + + PVFS2_VFS_OP_CREATE + fill a PVFS_object_kref + + PVFS2_VFS_OP_SYMLINK + fill a PVFS_object_kref + + PVFS2_VFS_OP_GETATTR + fill in a PVFS_sys_attr_s (tons of stuff the kernel doesn't need) + fill in a string with the link target when the object is a symlink. + + PVFS2_VFS_OP_MKDIR + fill a PVFS_object_kref + + PVFS2_VFS_OP_STATFS + fill a pvfs2_statfs_response_t with useless info <g>. It is hard for + us to know, in a timely fashion, these statistics about our + distributed network filesystem. + + PVFS2_VFS_OP_FS_MOUNT + fill a pvfs2_fs_mount_response_t which is just like a PVFS_object_kref + except its members are in a different order and "__pad1" is replaced + with "id". + + PVFS2_VFS_OP_GETXATTR + fill a pvfs2_getxattr_response_t + + PVFS2_VFS_OP_LISTXATTR + fill a pvfs2_listxattr_response_t + + PVFS2_VFS_OP_PARAM + fill a pvfs2_param_response_t + + PVFS2_VFS_OP_PERF_COUNT + fill a pvfs2_perf_count_response_t + + PVFS2_VFS_OP_FSKEY + file a pvfs2_fs_key_response_t + + PVFS2_VFS_OP_READDIR + jamb everything needed to represent a pvfs2_readdir_response_t into + the readdir buffer descriptor specified in the upcall. + +Userspace uses writev() on /dev/pvfs2-req to pass responses to the requests +made by the kernel side. + +A buffer_list containing: + - a pointer to the prepared response to the request from the + kernel (struct pvfs2_downcall_t). + - and also, in the case of a readdir request, a pointer to a + buffer containing descriptors for the objects in the target + directory. +... is sent to the function (PINT_dev_write_list) which performs +the writev. + +PINT_dev_write_list has a local iovec array: struct iovec io_array[10]; + +The first four elements of io_array are initialized like this for all +responses: + + io_array[0].iov_base = address of local variable "proto_ver" (int32_t) + io_array[0].iov_len = sizeof(int32_t) + + io_array[1].iov_base = address of global variable "pdev_magic" (int32_t) + io_array[1].iov_len = sizeof(int32_t) + + io_array[2].iov_base = address of parameter "tag" (PVFS_id_gen_t) + io_array[2].iov_len = sizeof(int64_t) + + io_array[3].iov_base = address of out_downcall member (pvfs2_downcall_t) + of global variable vfs_request (vfs_request_t) + io_array[3].iov_len = sizeof(pvfs2_downcall_t) + +Readdir responses initialize the fifth element io_array like this: + + io_array[4].iov_base = contents of member trailer_buf (char *) + from out_downcall member of global variable + vfs_request + io_array[4].iov_len = contents of member trailer_size (PVFS_size) + from out_downcall member of global variable + vfs_request + +Orangefs exploits the dcache in order to avoid sending redundant +requests to userspace. We keep object inode attributes up-to-date with +orangefs_inode_getattr. Orangefs_inode_getattr uses two arguments to +help it decide whether or not to update an inode: "new" and "bypass". +Orangefs keeps private data in an object's inode that includes a short +timeout value, getattr_time, which allows any iteration of +orangefs_inode_getattr to know how long it has been since the inode was +updated. When the object is not new (new == 0) and the bypass flag is not +set (bypass == 0) orangefs_inode_getattr returns without updating the inode +if getattr_time has not timed out. Getattr_time is updated each time the +inode is updated. + +Creation of a new object (file, dir, sym-link) includes the evaluation of +its pathname, resulting in a negative directory entry for the object. +A new inode is allocated and associated with the dentry, turning it from +a negative dentry into a "productive full member of society". Orangefs +obtains the new inode from Linux with new_inode() and associates +the inode with the dentry by sending the pair back to Linux with +d_instantiate(). + +The evaluation of a pathname for an object resolves to its corresponding +dentry. If there is no corresponding dentry, one is created for it in +the dcache. Whenever a dentry is modified or verified Orangefs stores a +short timeout value in the dentry's d_time, and the dentry will be trusted +for that amount of time. Orangefs is a network filesystem, and objects +can potentially change out-of-band with any particular Orangefs kernel module +instance, so trusting a dentry is risky. The alternative to trusting +dentries is to always obtain the needed information from userspace - at +least a trip to the client-core, maybe to the servers. Obtaining information +from a dentry is cheap, obtaining it from userspace is relatively expensive, +hence the motivation to use the dentry when possible. + +The timeout values d_time and getattr_time are jiffy based, and the +code is designed to avoid the jiffy-wrap problem: + +"In general, if the clock may have wrapped around more than once, there +is no way to tell how much time has elapsed. However, if the times t1 +and t2 are known to be fairly close, we can reliably compute the +difference in a way that takes into account the possibility that the +clock may have wrapped between times." + + from course notes by instructor Andy Wang + diff --git a/Documentation/filesystems/overlayfs.txt b/Documentation/filesystems/overlayfs.txt index 28091457b..bcbf9710e 100644 --- a/Documentation/filesystems/overlayfs.txt +++ b/Documentation/filesystems/overlayfs.txt @@ -183,26 +183,15 @@ The copy_up operation essentially creates a new, identical file and moves it over to the old name. The new file may be on a different filesystem, so both st_dev and st_ino of the file may change. -Any open files referring to this inode will access the old data and -metadata. Similarly any file locks obtained before copy_up will not -apply to the copied up file. +Any open files referring to this inode will access the old data. -On a file opened with O_RDONLY fchmod(2), fchown(2), futimesat(2) and -fsetxattr(2) will fail with EROFS. +Any file locks (and leases) obtained before copy_up will not apply +to the copied up file. If a file with multiple hard links is copied up, then this will "break" the link. Changes will not be propagated to other names referring to the same inode. -Symlinks in /proc/PID/ and /proc/PID/fd which point to a non-directory -object in overlayfs will not contain valid absolute paths, only -relative paths leading up to the filesystem's root. This will be -fixed in the future. - -Some operations are not atomic, for example a crash during copy_up or -rename will leave the filesystem in an inconsistent state. This will -be addressed in the future. - Changes to underlying filesystems --------------------------------- diff --git a/Documentation/filesystems/pohmelfs/design_notes.txt b/Documentation/filesystems/pohmelfs/design_notes.txt index 8aef91335..106d17fbb 100644 --- a/Documentation/filesystems/pohmelfs/design_notes.txt +++ b/Documentation/filesystems/pohmelfs/design_notes.txt @@ -29,7 +29,7 @@ Main features of this FS include: * Read request (data read, directory listing, lookup requests) balancing between multiple servers. * Write requests are replicated to multiple servers and completed only when all of them are acked. * Ability to add and/or remove servers from the working set at run-time. - * Strong authentification and possible data encryption in network channel. + * Strong authentication and possible data encryption in network channel. * Extended attributes support. POHMELFS is based on transactions, which are potentially long-standing objects that live diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index f1b87d8aa..b1bd05ea6 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -525,3 +525,70 @@ in your dentry operations instead. set_delayed_call() where it used to set *cookie. ->put_link() is gone - just give the destructor to set_delayed_call() in ->get_link(). +-- +[mandatory] + ->getxattr() and xattr_handler.get() get dentry and inode passed separately. + dentry might be yet to be attached to inode, so do _not_ use its ->d_inode + in the instances. Rationale: !@#!@# security_d_instantiate() needs to be + called before we attach dentry to inode. +-- +[mandatory] + symlinks are no longer the only inodes that do *not* have i_bdev/i_cdev/ + i_pipe/i_link union zeroed out at inode eviction. As the result, you can't + assume that non-NULL value in ->i_nlink at ->destroy_inode() implies that + it's a symlink. Checking ->i_mode is really needed now. In-tree we had + to fix shmem_destroy_callback() that used to take that kind of shortcut; + watch out, since that shortcut is no longer valid. +-- +[mandatory] + ->i_mutex is replaced with ->i_rwsem now. inode_lock() et.al. work as + they used to - they just take it exclusive. However, ->lookup() may be + called with parent locked shared. Its instances must not + * use d_instantiate) and d_rehash() separately - use d_add() or + d_splice_alias() instead. + * use d_rehash() alone - call d_add(new_dentry, NULL) instead. + * in the unlikely case when (read-only) access to filesystem + data structures needs exclusion for some reason, arrange it + yourself. None of the in-tree filesystems needed that. + * rely on ->d_parent and ->d_name not changing after dentry has + been fed to d_add() or d_splice_alias(). Again, none of the + in-tree instances relied upon that. + We are guaranteed that lookups of the same name in the same directory + will not happen in parallel ("same" in the sense of your ->d_compare()). + Lookups on different names in the same directory can and do happen in + parallel now. +-- +[recommended] + ->iterate_shared() is added; it's a parallel variant of ->iterate(). + Exclusion on struct file level is still provided (as well as that + between it and lseek on the same struct file), but if your directory + has been opened several times, you can get these called in parallel. + Exclusion between that method and all directory-modifying ones is + still provided, of course. + + Often enough ->iterate() can serve as ->iterate_shared() without any + changes - it is a read-only operation, after all. If you have any + per-inode or per-dentry in-core data structures modified by ->iterate(), + you might need something to serialize the access to them. If you + do dcache pre-seeding, you'll need to switch to d_alloc_parallel() for + that; look for in-tree examples. + + Old method is only used if the new one is absent; eventually it will + be removed. Switch while you still can; the old one won't stay. +-- +[mandatory] + ->atomic_open() calls without O_CREAT may happen in parallel. +-- +[mandatory] + ->setxattr() and xattr_handler.set() get dentry and inode passed separately. + dentry might be yet to be attached to inode, so do _not_ use its ->d_inode + in the instances. Rationale: !@#!@# security_d_instantiate() needs to be + called before we attach dentry to inode and !@#!@##!@$!$#!@#$!@$!@$ smack + ->d_instantiate() uses not just ->getxattr() but ->setxattr() as well. +-- +[mandatory] + ->d_compare() doesn't get parent as a separate argument anymore. If you + used it for finding the struct super_block involved, dentry->d_sb will + work just as well; if it's something more complicated, use dentry->d_parent. + Just be careful not to assume that fetching it more than once will yield + the same value - in RCU mode it could change under you. diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 843b045b4..68080ad6a 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -43,6 +43,7 @@ Table of Contents 3.7 /proc/<pid>/task/<tid>/children - Information about task children 3.8 /proc/<pid>/fdinfo/<fd> - Information about opened file 3.9 /proc/<pid>/map_files - Information about memory mapped files + 3.10 /proc/<pid>/timerslack_ns - Task timerslack value 4 Configuring procfs 4.1 Mount options @@ -224,6 +225,7 @@ Table 1-2: Contents of the status files (as of 4.1) TracerPid PID of process tracing this process (0 if not) Uid Real, effective, saved set, and file system UIDs Gid Real, effective, saved set, and file system GIDs + Umask file mode creation mask FDSize number of file descriptor slots currently allocated Groups supplementary group list NStgid descendant namespace thread group ID hierarchy @@ -434,6 +436,7 @@ Private_Dirty: 0 kB Referenced: 892 kB Anonymous: 0 kB AnonHugePages: 0 kB +ShmemPmdMapped: 0 kB Shared_Hugetlb: 0 kB Private_Hugetlb: 0 kB Swap: 0 kB @@ -462,6 +465,8 @@ accessed. a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE and a page is modified, the file page is replaced by a private anonymous copy. "AnonHugePages" shows the ammount of memory backed by transparent hugepage. +"ShmemPmdMapped" shows the ammount of shared (shmem/tmpfs) memory backed by +huge pages. "Shared_Hugetlb" and "Private_Hugetlb" show the ammounts of memory backed by hugetlbfs page which is *not* counted in "RSS" or "PSS" field for historical reasons. And these are not included in {Shared,Private}_{Clean,Dirty} field. @@ -723,7 +728,7 @@ IRQ, you can set it by doing: > echo 1 > /proc/irq/10/smp_affinity This means that only the first CPU will handle the IRQ, but you can also echo -5 which means that only the first and fourth CPU can handle the IRQ. +5 which means that only the first and third CPU can handle the IRQ. The contents of each smp_affinity file is the same by default: @@ -866,6 +871,9 @@ VmallocTotal: 112216 kB VmallocUsed: 428 kB VmallocChunk: 111088 kB AnonHugePages: 49152 kB +ShmemHugePages: 0 kB +ShmemPmdMapped: 0 kB + MemTotal: Total usable ram (i.e. physical ram minus a few reserved bits and the kernel binary code) @@ -910,6 +918,9 @@ MemAvailable: An estimate of how much memory is available for starting new AnonHugePages: Non-file backed huge pages mapped into userspace page tables Mapped: files which have been mmaped, such as libraries Shmem: Total memory used by shared memory (shmem) and tmpfs +ShmemHugePages: Memory used by shared memory (shmem) and tmpfs allocated + with huge pages +ShmemPmdMapped: Shared memory mapped into userspace with huge pages Slab: in-kernel data structures cache SReclaimable: Part of Slab, that might be reclaimed, such as caches SUnreclaim: Part of Slab, that cannot be reclaimed on memory pressure @@ -1862,6 +1873,23 @@ time one can open(2) mappings from the listings of two processes and comparing their inode numbers to figure out which anonymous memory areas are actually shared. +3.10 /proc/<pid>/timerslack_ns - Task timerslack value +--------------------------------------------------------- +This file provides the value of the task's timerslack value in nanoseconds. +This value specifies a amount of time that normal timers may be deferred +in order to coalesce timers and avoid unnecessary wakeups. + +This allows a task's interactivity vs power consumption trade off to be +adjusted. + +Writing 0 to the file will set the tasks timerslack to the default value. + +Valid values are from 0 - ULLONG_MAX + +An application setting the value must have PTRACE_MODE_ATTACH_FSCREDS level +permissions on the task specified to change its timerslack_ns value. + + ------------------------------------------------------------------------------ Configuring procfs ------------------------------------------------------------------------------ diff --git a/Documentation/filesystems/qnx6.txt b/Documentation/filesystems/qnx6.txt index 408679789..4f3d6a882 100644 --- a/Documentation/filesystems/qnx6.txt +++ b/Documentation/filesystems/qnx6.txt @@ -16,7 +16,7 @@ qnx6fs shares many properties with traditional Unix filesystems. It has the concepts of blocks, inodes and directories. On QNX it is possible to create little endian and big endian qnx6 filesystems. This feature makes it possible to create and use a different endianness fs -for the target (QNX is used on quite a range of embedded systems) plattform +for the target (QNX is used on quite a range of embedded systems) platform running on a different endianness. The Linux driver handles endianness transparently. (LE and BE) diff --git a/Documentation/filesystems/sharedsubtree.txt b/Documentation/filesystems/sharedsubtree.txt index e3f4c778e..8ccfbd552 100644 --- a/Documentation/filesystems/sharedsubtree.txt +++ b/Documentation/filesystems/sharedsubtree.txt @@ -123,7 +123,7 @@ replicas continue to be exactly same. 2d) A unbindable mount is a unbindable private mount - let's say we have a mount at /mnt and we make is unbindable + let's say we have a mount at /mnt and we make it unbindable # mount --make-unbindable /mnt @@ -197,13 +197,13 @@ replicas continue to be exactly same. namespaces are made first class objects with user API to associate/disassociate a namespace with userid, then each user could have his/her own namespace and tailor it to his/her - requirements. Offcourse its needs support from PAM. + requirements. This needs to be supported in PAM. D) Versioned files If the entire mount tree is visible at multiple locations, then - a underlying versioning file system can return different - version of the file depending on the path used to access that + an underlying versioning file system can return different + versions of the file depending on the path used to access that file. An example is: diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt index d392e1505..a85355cf8 100644 --- a/Documentation/filesystems/tmpfs.txt +++ b/Documentation/filesystems/tmpfs.txt @@ -60,7 +60,7 @@ size: The limit of allocated bytes for this tmpfs instance. The default is half of your physical RAM without swap. If you oversize your tmpfs instances the machine will deadlock since the OOM handler will not be able to free that memory. -nr_blocks: The same as size, but in blocks of PAGE_CACHE_SIZE. +nr_blocks: The same as size, but in blocks of PAGE_SIZE. nr_inodes: The maximum number of inodes for this instance. The default is half of the number of your physical RAM pages, or (on a machine with highmem) the number of lowmem RAM pages, @@ -98,7 +98,7 @@ A memory policy with a valid NodeList will be saved, as specified, for use at file creation time. When a task allocates a file in the file system, the mount option memory policy will be applied with a NodeList, if any, modified by the calling task's cpuset constraints -[See Documentation/cgroups/cpusets.txt] and any optional flags, listed +[See Documentation/cgroup-v1/cpusets.txt] and any optional flags, listed below. If the resulting NodeLists is the empty set, the effective memory policy for the file will revert to "default" policy. diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt index 223c32171..cf51360e3 100644 --- a/Documentation/filesystems/vfat.txt +++ b/Documentation/filesystems/vfat.txt @@ -56,9 +56,10 @@ iocharset=<name> -- Character set to use for converting between the you should consider the following option instead. utf8=<bool> -- UTF-8 is the filesystem safe version of Unicode that - is used by the console. It can be enabled for the - filesystem with this option. If 'uni_xlate' gets set, - UTF-8 gets disabled. + is used by the console. It can be enabled or disabled + for the filesystem with this option. + If 'uni_xlate' gets set, UTF-8 gets disabled. + By default, FAT_DEFAULT_UTF8 setting is used. uni_xlate=<bool> -- Translate unhandled Unicode characters to special escaped sequences. This would let you backup and diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index b02a7d598..9ace359d6 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -364,7 +364,6 @@ struct inode_operations { int (*atomic_open)(struct inode *, struct dentry *, struct file *, unsigned open_flag, umode_t create_mode, int *opened); int (*tmpfile) (struct inode *, struct dentry *, umode_t); - int (*dentry_open)(struct dentry *, struct file *, const struct cred *); }; Again, all methods are called without any locks being held, unless @@ -534,9 +533,7 @@ __sync_single_inode) to check if ->writepages has been successful in writing out the whole address_space. The Writeback tag is used by filemap*wait* and sync_page* functions, -via filemap_fdatawait_range, to wait for all writeback to -complete. While waiting ->sync_page (if defined) will be called on -each page that is found to require writeback. +via filemap_fdatawait_range, to wait for all writeback to complete. An address_space handler may attach extra information to a page, typically using the 'private' field in the 'struct page'. If such @@ -554,8 +551,8 @@ address_space has finer control of write sizes. The read process essentially only requires 'readpage'. The write process is more complicated and uses write_begin/write_end or -set_page_dirty to write data into the address_space, and writepage, -sync_page, and writepages to writeback data to storage. +set_page_dirty to write data into the address_space, and writepage +and writepages to writeback data to storage. Adding and removing pages to/from an address_space is protected by the inode's i_mutex. @@ -591,10 +588,15 @@ struct address_space_operations { void (*invalidatepage) (struct page *, unsigned int, unsigned int); int (*releasepage) (struct page *, int); void (*freepage)(struct page *); - ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter, loff_t offset); + ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); + /* isolate a page for migration */ + bool (*isolate_page) (struct page *, isolate_mode_t); /* migrate the contents of a page to the specified target */ int (*migratepage) (struct page *, struct page *); + /* put migration-failed page back to right list */ + void (*putback_page) (struct page *); int (*launder_page) (struct page *); + int (*is_partially_uptodate) (struct page *, unsigned long, unsigned long); void (*is_dirty_writeback) (struct page *, bool *, bool *); @@ -696,21 +698,14 @@ struct address_space_operations { but instead uses bmap to find out where the blocks in the file are and uses those addresses directly. - dentry_open: *WARNING: probably going away soon, do not use!* This is an - alternative to f_op->open(), the difference is that this method may open - a file not necessarily originating from the same filesystem as the one - i_op->open() was called on. It may be useful for stacking filesystems - which want to allow native I/O directly on underlying files. - - invalidatepage: If a page has PagePrivate set, then invalidatepage will be called when part or all of the page is to be removed from the address space. This generally corresponds to either a truncation, punch hole or a complete invalidation of the address space (in the latter case 'offset' will always be 0 and 'length' - will be PAGE_CACHE_SIZE). Any private data associated with the page + will be PAGE_SIZE). Any private data associated with the page should be updated to reflect this truncation. If offset is 0 and - length is PAGE_CACHE_SIZE, then the private data should be released, + length is PAGE_SIZE, then the private data should be released, because the page must be able to be completely discarded. This may be done by calling the ->releasepage function, but in this case the release MUST succeed. @@ -747,6 +742,10 @@ struct address_space_operations { and transfer data directly between the storage and the application's address space. + isolate_page: Called by the VM when isolating a movable non-lru page. + If page is successfully isolated, VM marks the page as PG_isolated + via __SetPageIsolated. + migrate_page: This is used to compact the physical memory usage. If the VM wants to relocate a page (maybe off a memory card that is signalling imminent failure) it will pass a new page @@ -754,6 +753,8 @@ struct address_space_operations { transfer any private data across and update any references that it has to the page. + putback_page: Called by the VM when isolated page's migration fails. + launder_page: Called before freeing a page - it writes back the dirty page. To prevent redirtying the page, it is kept locked during the whole operation. @@ -930,14 +931,17 @@ struct dentry_operations { int (*d_revalidate)(struct dentry *, unsigned int); int (*d_weak_revalidate)(struct dentry *, unsigned int); int (*d_hash)(const struct dentry *, struct qstr *); - int (*d_compare)(const struct dentry *, const struct dentry *, + int (*d_compare)(const struct dentry *, unsigned int, const char *, const struct qstr *); int (*d_delete)(const struct dentry *); + int (*d_init)(struct dentry *); void (*d_release)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(struct dentry *, bool); + struct dentry *(*d_real)(struct dentry *, const struct inode *, + unsigned int); }; d_revalidate: called when the VFS needs to revalidate a dentry. This @@ -1003,6 +1007,8 @@ struct dentry_operations { always cache a reachable dentry. d_delete must be constant and idempotent. + d_init: called when a dentry is allocated + d_release: called when a dentry is really deallocated d_iput: called when a dentry loses its inode (just prior to its @@ -1022,6 +1028,14 @@ struct dentry_operations { at the end of the buffer, and returns a pointer to the first char. dynamic_dname() helper function is provided to take care of this. + Example : + + static char *pipefs_dname(struct dentry *dent, char *buffer, int buflen) + { + return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]", + dentry->d_inode->i_ino); + } + d_automount: called when an automount dentry is to be traversed (optional). This should create a new VFS mount record and return the record to the caller. The caller is supplied with a path parameter giving the @@ -1060,13 +1074,23 @@ struct dentry_operations { This function is only used if DCACHE_MANAGE_TRANSIT is set on the dentry being transited from. -Example : + d_real: overlay/union type filesystems implement this method to return one of + the underlying dentries hidden by the overlay. It is used in three + different modes: -static char *pipefs_dname(struct dentry *dent, char *buffer, int buflen) -{ - return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]", - dentry->d_inode->i_ino); -} + Called from open it may need to copy-up the file depending on the + supplied open flags. This mode is selected with a non-zero flags + argument. In this mode the d_real method can return an error. + + Called from file_dentry() it returns the real dentry matching the inode + argument. The real dentry may be from a lower layer already copied up, + but still referenced from the file. This mode is selected with a + non-NULL inode argument. This will always succeed. + + With NULL inode and zero flags the topmost real underlying dentry is + returned. This will always succeed. + + This method is never called with both non-NULL inode and non-zero flags. Each dentry has a pointer to its parent dentry, as well as a hash list of child dentries. Child dentries are basically like files in a |