From 57f0f512b273f60d52568b8c6b77e17f5636edc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Fabian=20Silva=20Delgado?= Date: Wed, 5 Aug 2015 17:04:01 -0300 Subject: Initial import --- fs/reiserfs/Kconfig | 88 + fs/reiserfs/Makefile | 38 + fs/reiserfs/README | 161 ++ fs/reiserfs/acl.h | 76 + fs/reiserfs/bitmap.c | 1468 ++++++++++++++ fs/reiserfs/dir.c | 346 ++++ fs/reiserfs/do_balan.c | 1911 ++++++++++++++++++ fs/reiserfs/file.c | 270 +++ fs/reiserfs/fix_node.c | 2825 ++++++++++++++++++++++++++ fs/reiserfs/hashes.c | 177 ++ fs/reiserfs/ibalance.c | 1160 +++++++++++ fs/reiserfs/inode.c | 3461 ++++++++++++++++++++++++++++++++ fs/reiserfs/ioctl.c | 230 +++ fs/reiserfs/item_ops.c | 752 +++++++ fs/reiserfs/journal.c | 4403 +++++++++++++++++++++++++++++++++++++++++ fs/reiserfs/lbalance.c | 1427 +++++++++++++ fs/reiserfs/lock.c | 100 + fs/reiserfs/namei.c | 1659 ++++++++++++++++ fs/reiserfs/objectid.c | 217 ++ fs/reiserfs/prints.c | 777 ++++++++ fs/reiserfs/procfs.c | 508 +++++ fs/reiserfs/reiserfs.h | 3411 +++++++++++++++++++++++++++++++ fs/reiserfs/resize.c | 229 +++ fs/reiserfs/stree.c | 2262 +++++++++++++++++++++ fs/reiserfs/super.c | 2563 ++++++++++++++++++++++++ fs/reiserfs/tail_conversion.c | 317 +++ fs/reiserfs/xattr.c | 1064 ++++++++++ fs/reiserfs/xattr.h | 122 ++ fs/reiserfs/xattr_acl.c | 407 ++++ fs/reiserfs/xattr_security.c | 120 ++ fs/reiserfs/xattr_trusted.c | 56 + fs/reiserfs/xattr_user.c | 52 + 32 files changed, 32657 insertions(+) create mode 100644 fs/reiserfs/Kconfig create mode 100644 fs/reiserfs/Makefile create mode 100644 fs/reiserfs/README create mode 100644 fs/reiserfs/acl.h create mode 100644 fs/reiserfs/bitmap.c create mode 100644 fs/reiserfs/dir.c create mode 100644 fs/reiserfs/do_balan.c create mode 100644 fs/reiserfs/file.c create mode 100644 fs/reiserfs/fix_node.c create mode 100644 fs/reiserfs/hashes.c create mode 100644 fs/reiserfs/ibalance.c create mode 100644 fs/reiserfs/inode.c create mode 100644 fs/reiserfs/ioctl.c create mode 
100644 fs/reiserfs/item_ops.c create mode 100644 fs/reiserfs/journal.c create mode 100644 fs/reiserfs/lbalance.c create mode 100644 fs/reiserfs/lock.c create mode 100644 fs/reiserfs/namei.c create mode 100644 fs/reiserfs/objectid.c create mode 100644 fs/reiserfs/prints.c create mode 100644 fs/reiserfs/procfs.c create mode 100644 fs/reiserfs/reiserfs.h create mode 100644 fs/reiserfs/resize.c create mode 100644 fs/reiserfs/stree.c create mode 100644 fs/reiserfs/super.c create mode 100644 fs/reiserfs/tail_conversion.c create mode 100644 fs/reiserfs/xattr.c create mode 100644 fs/reiserfs/xattr.h create mode 100644 fs/reiserfs/xattr_acl.c create mode 100644 fs/reiserfs/xattr_security.c create mode 100644 fs/reiserfs/xattr_trusted.c create mode 100644 fs/reiserfs/xattr_user.c (limited to 'fs/reiserfs') diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig new file mode 100644 index 000000000..7cd46666b --- /dev/null +++ b/fs/reiserfs/Kconfig @@ -0,0 +1,88 @@ +config REISERFS_FS + tristate "Reiserfs support" + select CRC32 + help + Stores not just filenames but the files themselves in a balanced + tree. Uses journalling. + + Balanced trees are more efficient than traditional file system + architectural foundations. + + In general, ReiserFS is as fast as ext2, but is very efficient with + large directories and small files. Additional patches are needed + for NFS and quotas, please see + for links. + + It is more easily extended to have features currently found in + database and keyword search systems than block allocation based file + systems are. The next version will be so extended, and will support + plugins consistent with our motto ``It takes more than a license to + make source code open.'' + + Read + to learn more about reiserfs. + + Sponsored by Threshold Networks, Emusic.com, and Bigstorage.com. + + If you like it, you can pay us to add new features to it that you + need, buy a support contract, or pay us to port it to another OS. 
+ +config REISERFS_CHECK + bool "Enable reiserfs debug mode" + depends on REISERFS_FS + help + If you set this to Y, then ReiserFS will perform every check it can + possibly imagine of its internal consistency throughout its + operation. It will also go substantially slower. More than once we + have forgotten that this was on, and then gone despondent over the + latest benchmarks.:-) Use of this option allows our team to go all + out in checking for consistency when debugging without fear of its + effect on end users. If you are on the verge of sending in a bug + report, say Y and you might get a useful error message. Almost + everyone should say N. + +config REISERFS_PROC_INFO + bool "Stats in /proc/fs/reiserfs" + depends on REISERFS_FS && PROC_FS + help + Create under /proc/fs/reiserfs a hierarchy of files, displaying + various ReiserFS statistics and internal data at the expense of + making your kernel or module slightly larger (+8 KB). This also + increases the amount of kernel memory required for each mount. + Almost everyone but ReiserFS developers and people fine-tuning + reiserfs or tracing problems should say N. + +config REISERFS_FS_XATTR + bool "ReiserFS extended attributes" + depends on REISERFS_FS + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + <http://acl.bestbits.at/> for details). + + If unsure, say N. + +config REISERFS_FS_POSIX_ACL + bool "ReiserFS POSIX Access Control Lists" + depends on REISERFS_FS_XATTR + select FS_POSIX_ACL + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the Posix ACLs for + Linux website <http://acl.bestbits.at/>. 
+ + If you don't know what Access Control Lists are, say N + +config REISERFS_FS_SECURITY + bool "ReiserFS Security Labels" + depends on REISERFS_FS_XATTR + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the ReiserFS filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile new file mode 100644 index 000000000..3c3b00165 --- /dev/null +++ b/fs/reiserfs/Makefile @@ -0,0 +1,38 @@ +# +# Makefile for the linux reiser-filesystem routines. +# + +obj-$(CONFIG_REISERFS_FS) += reiserfs.o + +reiserfs-objs := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o \ + super.o prints.o objectid.o lbalance.o ibalance.o stree.o \ + hashes.o tail_conversion.o journal.o resize.o \ + item_ops.o ioctl.o xattr.o lock.o + +ifeq ($(CONFIG_REISERFS_PROC_INFO),y) +reiserfs-objs += procfs.o +endif + +ifeq ($(CONFIG_REISERFS_FS_XATTR),y) +reiserfs-objs += xattr_user.o xattr_trusted.o +endif + +ifeq ($(CONFIG_REISERFS_FS_SECURITY),y) +reiserfs-objs += xattr_security.o +endif + +ifeq ($(CONFIG_REISERFS_FS_POSIX_ACL),y) +reiserfs-objs += xattr_acl.o +endif + +# gcc -O2 (the kernel default) is overaggressive on ppc32 when many inline +# functions are used. This causes the compiler to advance the stack +# pointer out of the available stack space, corrupting kernel space, +# and causing a panic. Since this behavior only affects ppc32, this ifeq +# will work around it. If any other architecture displays this behavior, +# add it here. 
+ccflags-$(CONFIG_PPC32) := $(call cc-ifversion, -lt, 0400, -O1) + +TAGS: + etags *.c + diff --git a/fs/reiserfs/README b/fs/reiserfs/README new file mode 100644 index 000000000..e2f7a264e --- /dev/null +++ b/fs/reiserfs/README @@ -0,0 +1,161 @@ +[LICENSING] + +ReiserFS is hereby licensed under the GNU General +Public License version 2. + +Source code files that contain the phrase "licensing governed by +reiserfs/README" are "governed files" throughout this file. Governed +files are licensed under the GPL. The portions of them owned by Hans +Reiser, or authorized to be licensed by him, have been in the past, +and likely will be in the future, licensed to other parties under +other licenses. If you add your code to governed files, and don't +want it to be owned by Hans Reiser, put your copyright label on that +code so the poor blight and his customers can keep things straight. +All portions of governed files not labeled otherwise are owned by Hans +Reiser, and by adding your code to it, widely distributing it to +others or sending us a patch, and leaving the sentence in stating that +licensing is governed by the statement in this file, you accept this. +It will be a kindness if you identify whether Hans Reiser is allowed +to license code labeled as owned by you on your behalf other than +under the GPL, because he wants to know if it is okay to do so and put +a check in the mail to you (for non-trivial improvements) when he +makes his next sale. He makes no guarantees as to the amount if any, +though he feels motivated to motivate contributors, and you can surely +discuss this with him before or after contributing. You have the +right to decline to allow him to license your code contribution other +than under the GPL. + +Further licensing options are available for commercial and/or other +interests directly from Hans Reiser: hans@reiser.to. 
If you interpret +the GPL as not allowing those additional licensing options, you read +it wrongly, and Richard Stallman agrees with me, when carefully read +you can see that those restrictions on additional terms do not apply +to the owner of the copyright, and my interpretation of this shall +govern for this license. + +Finally, nothing in this license shall be interpreted to allow you to +fail to fairly credit me, or to remove my credits, without my +permission, unless you are an end user not redistributing to others. +If you have doubts about how to properly do that, or about what is +fair, ask. (Last I spoke with him Richard was contemplating how best +to address the fair crediting issue in the next GPL version.) + +[END LICENSING] + +Reiserfs is a file system based on balanced tree algorithms, which is +described at https://reiser4.wiki.kernel.org/index.php/Main_Page + +Stop reading here. Go there, then return. + +Send bug reports to yura@namesys.botik.ru. + +mkreiserfs and other utilities are in reiserfs/utils, or wherever your +Linux provider put them. There is some disagreement about how useful +it is for users to get their fsck and mkreiserfs out of sync with the +version of reiserfs that is in their kernel, with many important +distributors wanting them out of sync.:-) Please try to remember to +recompile and reinstall fsck and mkreiserfs with every update of +reiserfs, this is a common source of confusion. Note that some of the +utilities cannot be compiled without accessing the balancing code +which is in the kernel code, and relocating the utilities may require +you to specify where that code can be found. + +Yes, if you update your reiserfs kernel module you do have to +recompile your kernel, most of the time. The errors you get will be +quite cryptic if you forget to do so. + +Real users, as opposed to folks who want to hack and then understand +what went wrong, will want REISERFS_CHECK off. 
+ +Hideous Commercial Pitch: Spread your development costs across other OS +vendors. Select from the best in the world, not the best in your +building, by buying from third party OS component suppliers. Leverage +the software component development power of the internet. Be the most +aggressive in taking advantage of the commercial possibilities of +decentralized internet development, and add value through your branded +integration that you sell as an operating system. Let your competitors +be the ones to compete against the entire internet by themselves. Be +hip, get with the new economic trend, before your competitors do. Send +email to hans@reiser.to. + +To understand the code, after reading the website, start reading the +code by reading reiserfs_fs.h first. + +Hans Reiser was the project initiator, primary architect, source of all +funding for the first 5.5 years, and one of the programmers. He owns +the copyright. + +Vladimir Saveljev was one of the programmers, and he worked long hours +writing the cleanest code. He always made the effort to be the best he +could be, and to make his code the best that it could be. What resulted +was quite remarkable. I don't think that money can ever motivate someone +to work the way he did, he is one of the most selfless men I know. + +Yura helps with benchmarking, coding hashes, and block pre-allocation +code. + +Anatoly Pinchuk is a former member of our team who worked closely with +Vladimir throughout the project's development. He wrote a quite +substantial portion of the total code. He realized that there was a +space problem with packing tails of files for files larger than a node +that start on a node aligned boundary (there are reasons to want to node +align files), and he invented and implemented indirect items and +unformatted nodes as the solution. + +Konstantin Shvachko, with the help of the Russian version of a VC, +tried to put me in a position where I was forced into giving control +of the project to him. 
(Fortunately, as the person paying the money +for all salaries from my dayjob I owned all copyrights, and you can't +really force takeovers of sole proprietorships.) This was something +curious, because he never really understood the value of our project, +why we should do what we do, or why innovation was possible in +general, but he was sure that he ought to be controlling it. Every +innovation had to be forced past him while he was with us. He added +two years to the time required to complete reiserfs, and was a net +loss for me. Mikhail Gilula was a brilliant innovator who also left +in a destructive way that erased the value of his contributions, and +that he was shown much generosity just makes it more painful. + +Grigory Zaigralin was an extremely effective system administrator for +our group. + +Igor Krasheninnikov was wonderful at hardware procurement, repair, and +network installation. + +Jeremy Fitzhardinge wrote the teahash.c code, and he gives credit to a +textbook he got the algorithm from in the code. Note that his analysis +of how we could use the hashing code in making 32 bit NFS cookies work +was probably more important than the actual algorithm. Colin Plumb also +contributed to it. + +Chris Mason dived right into our code, and in just a few months produced +the journaling code that dramatically increased the value of ReiserFS. +He is just an amazing programmer. + +Igor Zagorovsky is writing much of the new item handler and extent code +for our next major release. + +Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the +resizer, and is hard at work on implementing allocate on flush. SGI +implemented allocate on flush before us for XFS, and generously took +the time to convince me we should do it also. They are great people, +and a great company. + +Yuri Shevchuk and Nikita Danilov are doing squid cache optimization. + +Vitaly Fertman is doing fsck. 
+ +Jeff Mahoney, of SuSE, contributed a few cleanup fixes, most notably +the endian safe patches which allow ReiserFS to run on any platform +supported by the Linux kernel. + +SuSE, IntegratedLinux.com, Ecila, MP3.com, bigstorage.com, and the +Alpha PC Company made it possible for me to not have a day job +anymore, and to dramatically increase our staffing. Ecila funded +hypertext feature development, MP3.com funded journaling, SuSE funded +core development, IntegratedLinux.com funded squid web cache +appliances, bigstorage.com funded HSM, and the alpha PC company funded +the alpha port. Many of these tasks were helped by sponsors other +than the ones just named. SuSE has helped in much more than just +funding.... + diff --git a/fs/reiserfs/acl.h b/fs/reiserfs/acl.h new file mode 100644 index 000000000..4a211f5b3 --- /dev/null +++ b/fs/reiserfs/acl.h @@ -0,0 +1,76 @@ +#include +#include + +#define REISERFS_ACL_VERSION 0x0001 + +typedef struct { + __le16 e_tag; + __le16 e_perm; + __le32 e_id; +} reiserfs_acl_entry; + +typedef struct { + __le16 e_tag; + __le16 e_perm; +} reiserfs_acl_entry_short; + +typedef struct { + __le32 a_version; +} reiserfs_acl_header; + +static inline size_t reiserfs_acl_size(int count) +{ + if (count <= 4) { + return sizeof(reiserfs_acl_header) + + count * sizeof(reiserfs_acl_entry_short); + } else { + return sizeof(reiserfs_acl_header) + + 4 * sizeof(reiserfs_acl_entry_short) + + (count - 4) * sizeof(reiserfs_acl_entry); + } +} + +static inline int reiserfs_acl_count(size_t size) +{ + ssize_t s; + size -= sizeof(reiserfs_acl_header); + s = size - 4 * sizeof(reiserfs_acl_entry_short); + if (s < 0) { + if (size % sizeof(reiserfs_acl_entry_short)) + return -1; + return size / sizeof(reiserfs_acl_entry_short); + } else { + if (s % sizeof(reiserfs_acl_entry)) + return -1; + return s / sizeof(reiserfs_acl_entry) + 4; + } +} + +#ifdef CONFIG_REISERFS_FS_POSIX_ACL +struct posix_acl *reiserfs_get_acl(struct inode *inode, int type); +int 
/*
 * Stub selected by the #else branch of CONFIG_REISERFS_FS_POSIX_ACL,
 * i.e. when POSIX ACL support is not configured.  It ignores all of
 * its arguments and always reports success (0).
 */
static inline int
reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
			     const struct inode *dir, struct dentry *dentry,
			     struct inode *inode)
{
	return 0;
}
test_bit(_ALLOC_displacing_new_packing_localities, &SB_ALLOC_OPTS(s)) + +#define SET_OPTION(optname) \ + do { \ + reiserfs_info(s, "block allocator option \"%s\" is set", #optname); \ + set_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s)); \ + } while(0) +#define TEST_OPTION(optname, s) \ + test_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s)) + +static inline void get_bit_address(struct super_block *s, + b_blocknr_t block, + unsigned int *bmap_nr, + unsigned int *offset) +{ + /* + * It is in the bitmap block number equal to the block + * number divided by the number of bits in a block. + */ + *bmap_nr = block >> (s->s_blocksize_bits + 3); + /* Within that bitmap block it is located at bit offset *offset. */ + *offset = block & ((s->s_blocksize << 3) - 1); +} + +int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value) +{ + unsigned int bmap, offset; + unsigned int bmap_count = reiserfs_bmap_count(s); + + if (block == 0 || block >= SB_BLOCK_COUNT(s)) { + reiserfs_error(s, "vs-4010", + "block number is out of range %lu (%u)", + block, SB_BLOCK_COUNT(s)); + return 0; + } + + get_bit_address(s, block, &bmap, &offset); + + /* + * Old format filesystem? Unlikely, but the bitmaps are all + * up front so we need to account for it. 
+ */ + if (unlikely(test_bit(REISERFS_OLD_FORMAT, + &REISERFS_SB(s)->s_properties))) { + b_blocknr_t bmap1 = REISERFS_SB(s)->s_sbh->b_blocknr + 1; + if (block >= bmap1 && + block <= bmap1 + bmap_count) { + reiserfs_error(s, "vs-4019", "bitmap block %lu(%u) " + "can't be freed or reused", + block, bmap_count); + return 0; + } + } else { + if (offset == 0) { + reiserfs_error(s, "vs-4020", "bitmap block %lu(%u) " + "can't be freed or reused", + block, bmap_count); + return 0; + } + } + + if (bmap >= bmap_count) { + reiserfs_error(s, "vs-4030", "bitmap for requested block " + "is out of range: block=%lu, bitmap_nr=%u", + block, bmap); + return 0; + } + + if (bit_value == 0 && block == SB_ROOT_BLOCK(s)) { + reiserfs_error(s, "vs-4050", "this is root block (%u), " + "it must be busy", SB_ROOT_BLOCK(s)); + return 0; + } + + return 1; +} + +/* + * Searches in journal structures for a given block number (bmap, off). + * If block is found in reiserfs journal it suggests next free block + * candidate to test. + */ +static inline int is_block_in_journal(struct super_block *s, unsigned int bmap, + int off, int *next) +{ + b_blocknr_t tmp; + + if (reiserfs_in_journal(s, bmap, off, 1, &tmp)) { + if (tmp) { /* hint supplied */ + *next = tmp; + PROC_INFO_INC(s, scan_bitmap.in_journal_hint); + } else { + (*next) = off + 1; /* inc offset to avoid looping. 
*/ + PROC_INFO_INC(s, scan_bitmap.in_journal_nohint); + } + PROC_INFO_INC(s, scan_bitmap.retry); + return 1; + } + return 0; +} + +/* + * Searches for a window of zero bits with given minimum and maximum + * lengths in one bitmap block + */ +static int scan_bitmap_block(struct reiserfs_transaction_handle *th, + unsigned int bmap_n, int *beg, int boundary, + int min, int max, int unfm) +{ + struct super_block *s = th->t_super; + struct reiserfs_bitmap_info *bi = &SB_AP_BITMAP(s)[bmap_n]; + struct buffer_head *bh; + int end, next; + int org = *beg; + + BUG_ON(!th->t_trans_id); + RFALSE(bmap_n >= reiserfs_bmap_count(s), "Bitmap %u is out of " + "range (0..%u)", bmap_n, reiserfs_bmap_count(s) - 1); + PROC_INFO_INC(s, scan_bitmap.bmap); + + if (!bi) { + reiserfs_error(s, "jdm-4055", "NULL bitmap info pointer " + "for bitmap %d", bmap_n); + return 0; + } + + bh = reiserfs_read_bitmap_block(s, bmap_n); + if (bh == NULL) + return 0; + + while (1) { +cont: + if (bi->free_count < min) { + brelse(bh); + return 0; /* No free blocks in this bitmap */ + } + + /* search for a first zero bit -- beginning of a window */ + *beg = reiserfs_find_next_zero_le_bit + ((unsigned long *)(bh->b_data), boundary, *beg); + + /* + * search for a zero bit fails or the rest of bitmap block + * cannot contain a zero window of minimum size + */ + if (*beg + min > boundary) { + brelse(bh); + return 0; + } + + if (unfm && is_block_in_journal(s, bmap_n, *beg, beg)) + continue; + /* first zero bit found; we check next bits */ + for (end = *beg + 1;; end++) { + if (end >= *beg + max || end >= boundary + || reiserfs_test_le_bit(end, bh->b_data)) { + next = end; + break; + } + + /* + * finding the other end of zero bit window requires + * looking into journal structures (in case of + * searching for free blocks for unformatted nodes) + */ + if (unfm && is_block_in_journal(s, bmap_n, end, &next)) + break; + } + + /* + * now (*beg) points to beginning of zero bits window, + * (end) points to one bit after 
the window end + */ + + /* found window of proper size */ + if (end - *beg >= min) { + int i; + reiserfs_prepare_for_journal(s, bh, 1); + /* + * try to set all blocks used checking are + * they still free + */ + for (i = *beg; i < end; i++) { + /* Don't check in journal again. */ + if (reiserfs_test_and_set_le_bit + (i, bh->b_data)) { + /* + * bit was set by another process while + * we slept in prepare_for_journal() + */ + PROC_INFO_INC(s, scan_bitmap.stolen); + + /* + * we can continue with smaller set + * of allocated blocks, if length of + * this set is more or equal to `min' + */ + if (i >= *beg + min) { + end = i; + break; + } + + /* + * otherwise we clear all bit + * were set ... + */ + while (--i >= *beg) + reiserfs_clear_le_bit + (i, bh->b_data); + reiserfs_restore_prepared_buffer(s, bh); + *beg = org; + + /* + * Search again in current block + * from beginning + */ + goto cont; + } + } + bi->free_count -= (end - *beg); + journal_mark_dirty(th, bh); + brelse(bh); + + /* free block count calculation */ + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), + 1); + PUT_SB_FREE_BLOCKS(s, SB_FREE_BLOCKS(s) - (end - *beg)); + journal_mark_dirty(th, SB_BUFFER_WITH_SB(s)); + + return end - (*beg); + } else { + *beg = next; + } + } +} + +static int bmap_hash_id(struct super_block *s, u32 id) +{ + char *hash_in = NULL; + unsigned long hash; + unsigned bm; + + if (id <= 2) { + bm = 1; + } else { + hash_in = (char *)(&id); + hash = keyed_hash(hash_in, 4); + bm = hash % reiserfs_bmap_count(s); + if (!bm) + bm = 1; + } + /* this can only be true when SB_BMAP_NR = 1 */ + if (bm >= reiserfs_bmap_count(s)) + bm = 0; + return bm; +} + +/* + * hashes the id and then returns > 0 if the block group for the + * corresponding hash is full + */ +static inline int block_group_used(struct super_block *s, u32 id) +{ + int bm = bmap_hash_id(s, id); + struct reiserfs_bitmap_info *info = &SB_AP_BITMAP(s)[bm]; + + /* + * If we don't have cached information on this bitmap block, we're 
/*
 * Tries to find contiguous zero bit window (given size) in given region of
 * bitmap and place new blocks there. Returns number of allocated blocks.
 *
 * @th:         running transaction; th->t_super is the filesystem searched
 * @start:      in: block number where the search begins;
 *              out: rewritten on exit from the bitmap number and offset the
 *              search stopped at (bm * bits-per-bitmap + off)
 * @finish:     last block number of the region to search
 * @min, @max:  minimum / maximum window length, passed to scan_bitmap_block()
 * @unfm:       passed to scan_bitmap_block(); also part of the first-pass
 *              filter below
 * @file_block: only tested against zero in the first-pass filter
 *
 * Returns 0 when no window was allocated.
 */
static int scan_bitmap(struct reiserfs_transaction_handle *th,
		       b_blocknr_t * start, b_blocknr_t finish,
		       int min, int max, int unfm, sector_t file_block)
{
	int nr_allocated = 0;
	struct super_block *s = th->t_super;
	unsigned int bm, off;
	unsigned int end_bm, end_off;
	/* number of bits (i.e. tracked blocks) in one bitmap block */
	unsigned int off_max = s->s_blocksize << 3;

	BUG_ON(!th->t_trans_id);
	PROC_INFO_INC(s, scan_bitmap.call);

	/* No point in looking for more free blocks */
	if (SB_FREE_BLOCKS(s) <= 0)
		return 0;

	/* translate both ends of the region into (bitmap, bit) pairs */
	get_bit_address(s, *start, &bm, &off);
	get_bit_address(s, finish, &end_bm, &end_off);
	/*
	 * NOTE(review): this lets bm == reiserfs_bmap_count(s) through;
	 * confirm callers never pass such a start block.
	 */
	if (bm > reiserfs_bmap_count(s))
		return 0;
	if (end_bm > reiserfs_bmap_count(s))
		end_bm = reiserfs_bmap_count(s);

	/*
	 * First pass, taken only with the skip_busy option and while more
	 * than 1/20 of all blocks are free: a bitmap is scanned only if
	 * more than 1/10 of its bits are free, or if the search starts
	 * inside it (off != 0) and either !unfm or file_block != 0.
	 * NOTE(review): the original text spoke of "10% free" and "80%
	 * full"; the thresholds actually coded are the ones listed here.
	 *
	 * We do this so that files that grow later still have space close to
	 * their original allocation. This improves locality, and presumably
	 * performance as a result.
	 *
	 * This is only an allocation policy and does not make up for getting a
	 * bad hint. Decent hinting must be implemented for this to work well.
	 */
	if (TEST_OPTION(skip_busy, s)
	    && SB_FREE_BLOCKS(s) > SB_BLOCK_COUNT(s) / 20) {
		for (; bm < end_bm; bm++, off = 0) {
			if ((off && (!unfm || (file_block != 0)))
			    || SB_AP_BITMAP(s)[bm].free_count >
			    (s->s_blocksize << 3) / 10)
				nr_allocated =
				    scan_bitmap_block(th, bm, &off, off_max,
						      min, max, unfm);
			if (nr_allocated)
				goto ret;
		}
		/* we know from above that start is a reasonable number */
		get_bit_address(s, *start, &bm, &off);
	}

	/* second pass: every bitmap before end_bm, without the filter */
	for (; bm < end_bm; bm++, off = 0) {
		nr_allocated =
		    scan_bitmap_block(th, bm, &off, off_max, min, max, unfm);
		if (nr_allocated)
			goto ret;
	}

	/* last bitmap of the region: search only up to finish's bit */
	nr_allocated =
	    scan_bitmap_block(th, bm, &off, end_off + 1, min, max, unfm);

ret:
	/* report back where the search ended, as a block number */
	*start = bm * off_max + off;
	return nr_allocated;

}
for the given block in bit map */ + if (!reiserfs_test_and_clear_le_bit(offset, bmbh->b_data)) { + reiserfs_error(s, "vs-4080", + "block %lu: bit already cleared", block); + } + apbi[nr].free_count++; + journal_mark_dirty(th, bmbh); + brelse(bmbh); + + reiserfs_prepare_for_journal(s, sbh, 1); + /* update super block */ + set_sb_free_blocks(rs, sb_free_blocks(rs) + 1); + + journal_mark_dirty(th, sbh); + if (for_unformatted) { + int depth = reiserfs_write_unlock_nested(s); + dquot_free_block_nodirty(inode, 1); + reiserfs_write_lock_nested(s, depth); + } +} + +void reiserfs_free_block(struct reiserfs_transaction_handle *th, + struct inode *inode, b_blocknr_t block, + int for_unformatted) +{ + struct super_block *s = th->t_super; + + BUG_ON(!th->t_trans_id); + RFALSE(!s, "vs-4061: trying to free block on nonexistent device"); + if (!is_reusable(s, block, 1)) + return; + + if (block > sb_block_count(REISERFS_SB(s)->s_rs)) { + reiserfs_error(th->t_super, "bitmap-4072", + "Trying to free block outside file system " + "boundaries (%lu > %lu)", + block, sb_block_count(REISERFS_SB(s)->s_rs)); + return; + } + /* mark it before we clear it, just in case */ + journal_mark_freed(th, s, block); + _reiserfs_free_block(th, inode, block, for_unformatted); +} + +/* preallocated blocks don't need to be run through journal_mark_freed */ +static void reiserfs_free_prealloc_block(struct reiserfs_transaction_handle *th, + struct inode *inode, b_blocknr_t block) +{ + BUG_ON(!th->t_trans_id); + RFALSE(!th->t_super, + "vs-4060: trying to free block on nonexistent device"); + if (!is_reusable(th->t_super, block, 1)) + return; + _reiserfs_free_block(th, inode, block, 1); +} + +static void __discard_prealloc(struct reiserfs_transaction_handle *th, + struct reiserfs_inode_info *ei) +{ + unsigned long save = ei->i_prealloc_block; + int dirty = 0; + struct inode *inode = &ei->vfs_inode; + + BUG_ON(!th->t_trans_id); +#ifdef CONFIG_REISERFS_CHECK + if (ei->i_prealloc_count < 0) + 
reiserfs_error(th->t_super, "zam-4001", + "inode has negative prealloc blocks count."); +#endif + while (ei->i_prealloc_count > 0) { + reiserfs_free_prealloc_block(th, inode, ei->i_prealloc_block); + ei->i_prealloc_block++; + ei->i_prealloc_count--; + dirty = 1; + } + if (dirty) + reiserfs_update_sd(th, inode); + ei->i_prealloc_block = save; + list_del_init(&ei->i_prealloc_list); +} + +/* FIXME: It should be inline function */ +void reiserfs_discard_prealloc(struct reiserfs_transaction_handle *th, + struct inode *inode) +{ + struct reiserfs_inode_info *ei = REISERFS_I(inode); + + BUG_ON(!th->t_trans_id); + if (ei->i_prealloc_count) + __discard_prealloc(th, ei); +} + +void reiserfs_discard_all_prealloc(struct reiserfs_transaction_handle *th) +{ + struct list_head *plist = &SB_JOURNAL(th->t_super)->j_prealloc_list; + + BUG_ON(!th->t_trans_id); + while (!list_empty(plist)) { + struct reiserfs_inode_info *ei; + ei = list_entry(plist->next, struct reiserfs_inode_info, + i_prealloc_list); +#ifdef CONFIG_REISERFS_CHECK + if (!ei->i_prealloc_count) { + reiserfs_error(th->t_super, "zam-4001", + "inode is in prealloc list but has " + "no preallocated blocks."); + } +#endif + __discard_prealloc(th, ei); + } +} + +void reiserfs_init_alloc_options(struct super_block *s) +{ + set_bit(_ALLOC_skip_busy, &SB_ALLOC_OPTS(s)); + set_bit(_ALLOC_dirid_groups, &SB_ALLOC_OPTS(s)); + set_bit(_ALLOC_packing_groups, &SB_ALLOC_OPTS(s)); +} + +/* block allocator related options are parsed here */ +int reiserfs_parse_alloc_options(struct super_block *s, char *options) +{ + char *this_char, *value; + + /* clear default settings */ + REISERFS_SB(s)->s_alloc_options.bits = 0; + + while ((this_char = strsep(&options, ":")) != NULL) { + if ((value = strchr(this_char, '=')) != NULL) + *value++ = 0; + + if (!strcmp(this_char, "concentrating_formatted_nodes")) { + int temp; + SET_OPTION(concentrating_formatted_nodes); + temp = (value + && *value) ? 
simple_strtoul(value, &value, + 0) : 10; + if (temp <= 0 || temp > 100) { + REISERFS_SB(s)->s_alloc_options.border = 10; + } else { + REISERFS_SB(s)->s_alloc_options.border = + 100 / temp; + } + continue; + } + if (!strcmp(this_char, "displacing_large_files")) { + SET_OPTION(displacing_large_files); + REISERFS_SB(s)->s_alloc_options.large_file_size = + (value + && *value) ? simple_strtoul(value, &value, 0) : 16; + continue; + } + if (!strcmp(this_char, "displacing_new_packing_localities")) { + SET_OPTION(displacing_new_packing_localities); + continue; + } + + if (!strcmp(this_char, "old_hashed_relocation")) { + SET_OPTION(old_hashed_relocation); + continue; + } + + if (!strcmp(this_char, "new_hashed_relocation")) { + SET_OPTION(new_hashed_relocation); + continue; + } + + if (!strcmp(this_char, "dirid_groups")) { + SET_OPTION(dirid_groups); + continue; + } + if (!strcmp(this_char, "oid_groups")) { + SET_OPTION(oid_groups); + continue; + } + if (!strcmp(this_char, "packing_groups")) { + SET_OPTION(packing_groups); + continue; + } + if (!strcmp(this_char, "hashed_formatted_nodes")) { + SET_OPTION(hashed_formatted_nodes); + continue; + } + + if (!strcmp(this_char, "skip_busy")) { + SET_OPTION(skip_busy); + continue; + } + + if (!strcmp(this_char, "hundredth_slices")) { + SET_OPTION(hundredth_slices); + continue; + } + + if (!strcmp(this_char, "old_way")) { + SET_OPTION(old_way); + continue; + } + + if (!strcmp(this_char, "displace_based_on_dirid")) { + SET_OPTION(displace_based_on_dirid); + continue; + } + + if (!strcmp(this_char, "preallocmin")) { + REISERFS_SB(s)->s_alloc_options.preallocmin = + (value + && *value) ? simple_strtoul(value, &value, 0) : 4; + continue; + } + + if (!strcmp(this_char, "preallocsize")) { + REISERFS_SB(s)->s_alloc_options.preallocsize = + (value + && *value) ? 
simple_strtoul(value, &value, + 0) : + PREALLOCATION_SIZE; + continue; + } + + reiserfs_warning(s, "zam-4001", "unknown option - %s", + this_char); + return 1; + } + + reiserfs_info(s, "allocator options = [%08x]\n", SB_ALLOC_OPTS(s)); + return 0; +} + +static void print_sep(struct seq_file *seq, int *first) +{ + if (!*first) + seq_puts(seq, ":"); + else + *first = 0; +} + +void show_alloc_options(struct seq_file *seq, struct super_block *s) +{ + int first = 1; + + if (SB_ALLOC_OPTS(s) == ((1 << _ALLOC_skip_busy) | + (1 << _ALLOC_dirid_groups) | (1 << _ALLOC_packing_groups))) + return; + + seq_puts(seq, ",alloc="); + + if (TEST_OPTION(concentrating_formatted_nodes, s)) { + print_sep(seq, &first); + if (REISERFS_SB(s)->s_alloc_options.border != 10) { + seq_printf(seq, "concentrating_formatted_nodes=%d", + 100 / REISERFS_SB(s)->s_alloc_options.border); + } else + seq_puts(seq, "concentrating_formatted_nodes"); + } + if (TEST_OPTION(displacing_large_files, s)) { + print_sep(seq, &first); + if (REISERFS_SB(s)->s_alloc_options.large_file_size != 16) { + seq_printf(seq, "displacing_large_files=%lu", + REISERFS_SB(s)->s_alloc_options.large_file_size); + } else + seq_puts(seq, "displacing_large_files"); + } + if (TEST_OPTION(displacing_new_packing_localities, s)) { + print_sep(seq, &first); + seq_puts(seq, "displacing_new_packing_localities"); + } + if (TEST_OPTION(old_hashed_relocation, s)) { + print_sep(seq, &first); + seq_puts(seq, "old_hashed_relocation"); + } + if (TEST_OPTION(new_hashed_relocation, s)) { + print_sep(seq, &first); + seq_puts(seq, "new_hashed_relocation"); + } + if (TEST_OPTION(dirid_groups, s)) { + print_sep(seq, &first); + seq_puts(seq, "dirid_groups"); + } + if (TEST_OPTION(oid_groups, s)) { + print_sep(seq, &first); + seq_puts(seq, "oid_groups"); + } + if (TEST_OPTION(packing_groups, s)) { + print_sep(seq, &first); + seq_puts(seq, "packing_groups"); + } + if (TEST_OPTION(hashed_formatted_nodes, s)) { + print_sep(seq, &first); + seq_puts(seq, 
"hashed_formatted_nodes"); + } + if (TEST_OPTION(skip_busy, s)) { + print_sep(seq, &first); + seq_puts(seq, "skip_busy"); + } + if (TEST_OPTION(hundredth_slices, s)) { + print_sep(seq, &first); + seq_puts(seq, "hundredth_slices"); + } + if (TEST_OPTION(old_way, s)) { + print_sep(seq, &first); + seq_puts(seq, "old_way"); + } + if (TEST_OPTION(displace_based_on_dirid, s)) { + print_sep(seq, &first); + seq_puts(seq, "displace_based_on_dirid"); + } + if (REISERFS_SB(s)->s_alloc_options.preallocmin != 0) { + print_sep(seq, &first); + seq_printf(seq, "preallocmin=%d", + REISERFS_SB(s)->s_alloc_options.preallocmin); + } + if (REISERFS_SB(s)->s_alloc_options.preallocsize != 17) { + print_sep(seq, &first); + seq_printf(seq, "preallocsize=%d", + REISERFS_SB(s)->s_alloc_options.preallocsize); + } +} + +static inline void new_hashed_relocation(reiserfs_blocknr_hint_t * hint) +{ + char *hash_in; + + if (hint->formatted_node) { + hash_in = (char *)&hint->key.k_dir_id; + } else { + if (!hint->inode) { + /*hint->search_start = hint->beg;*/ + hash_in = (char *)&hint->key.k_dir_id; + } else + if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super)) + hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id); + else + hash_in = + (char *)(&INODE_PKEY(hint->inode)->k_objectid); + } + + hint->search_start = + hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg); +} + +/* + * Relocation based on dirid, hashing them into a given bitmap block + * files. 
Formatted nodes are unaffected, a separate policy covers them + */ +static void dirid_groups(reiserfs_blocknr_hint_t * hint) +{ + unsigned long hash; + __u32 dirid = 0; + int bm = 0; + struct super_block *sb = hint->th->t_super; + + if (hint->inode) + dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id); + else if (hint->formatted_node) + dirid = hint->key.k_dir_id; + + if (dirid) { + bm = bmap_hash_id(sb, dirid); + hash = bm * (sb->s_blocksize << 3); + /* give a portion of the block group to metadata */ + if (hint->inode) + hash += sb->s_blocksize / 2; + hint->search_start = hash; + } +} + +/* + * Relocation based on oid, hashing them into a given bitmap block + * files. Formatted nodes are unaffected, a separate policy covers them + */ +static void oid_groups(reiserfs_blocknr_hint_t * hint) +{ + if (hint->inode) { + unsigned long hash; + __u32 oid; + __u32 dirid; + int bm; + + dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id); + + /* + * keep the root dir and it's first set of subdirs close to + * the start of the disk + */ + if (dirid <= 2) + hash = (hint->inode->i_sb->s_blocksize << 3); + else { + oid = le32_to_cpu(INODE_PKEY(hint->inode)->k_objectid); + bm = bmap_hash_id(hint->inode->i_sb, oid); + hash = bm * (hint->inode->i_sb->s_blocksize << 3); + } + hint->search_start = hash; + } +} + +/* + * returns 1 if it finds an indirect item and gets valid hint info + * from it, otherwise 0 + */ +static int get_left_neighbor(reiserfs_blocknr_hint_t * hint) +{ + struct treepath *path; + struct buffer_head *bh; + struct item_head *ih; + int pos_in_item; + __le32 *item; + int ret = 0; + + /* + * reiserfs code can call this function w/o pointer to path + * structure supplied; then we rely on supplied search_start + */ + if (!hint->path) + return 0; + + path = hint->path; + bh = get_last_bh(path); + RFALSE(!bh, "green-4002: Illegal path specified to get_left_neighbor"); + ih = tp_item_head(path); + pos_in_item = path->pos_in_item; + item = tp_item_body(path); + + 
hint->search_start = bh->b_blocknr; + + /* + * for indirect item: go to left and look for the first non-hole entry + * in the indirect item + */ + if (!hint->formatted_node && is_indirect_le_ih(ih)) { + if (pos_in_item == I_UNFM_NUM(ih)) + pos_in_item--; + while (pos_in_item >= 0) { + int t = get_block_num(item, pos_in_item); + if (t) { + hint->search_start = t; + ret = 1; + break; + } + pos_in_item--; + } + } + + /* does result value fit into specified region? */ + return ret; +} + +/* + * should be, if formatted node, then try to put on first part of the device + * specified as number of percent with mount option device, else try to put + * on last of device. This is not to say it is good code to do so, + * but the effect should be measured. + */ +static inline void set_border_in_hint(struct super_block *s, + reiserfs_blocknr_hint_t * hint) +{ + b_blocknr_t border = + SB_BLOCK_COUNT(s) / REISERFS_SB(s)->s_alloc_options.border; + + if (hint->formatted_node) + hint->end = border - 1; + else + hint->beg = border; +} + +static inline void displace_large_file(reiserfs_blocknr_hint_t * hint) +{ + if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super)) + hint->search_start = + hint->beg + + keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_dir_id), + 4) % (hint->end - hint->beg); + else + hint->search_start = + hint->beg + + keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_objectid), + 4) % (hint->end - hint->beg); +} + +static inline void hash_formatted_node(reiserfs_blocknr_hint_t * hint) +{ + char *hash_in; + + if (!hint->inode) + hash_in = (char *)&hint->key.k_dir_id; + else if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super)) + hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id); + else + hash_in = (char *)(&INODE_PKEY(hint->inode)->k_objectid); + + hint->search_start = + hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg); +} + +static inline int +this_blocknr_allocation_would_make_it_a_large_file(reiserfs_blocknr_hint_t * + hint) +{ + return 
hint->block == + REISERFS_SB(hint->th->t_super)->s_alloc_options.large_file_size; +} + +#ifdef DISPLACE_NEW_PACKING_LOCALITIES +static inline void displace_new_packing_locality(reiserfs_blocknr_hint_t * hint) +{ + struct in_core_key *key = &hint->key; + + hint->th->displace_new_blocks = 0; + hint->search_start = + hint->beg + keyed_hash((char *)(&key->k_objectid), + 4) % (hint->end - hint->beg); +} +#endif + +static inline int old_hashed_relocation(reiserfs_blocknr_hint_t * hint) +{ + b_blocknr_t border; + u32 hash_in; + + if (hint->formatted_node || hint->inode == NULL) { + return 0; + } + + hash_in = le32_to_cpu((INODE_PKEY(hint->inode))->k_dir_id); + border = + hint->beg + (u32) keyed_hash(((char *)(&hash_in)), + 4) % (hint->end - hint->beg - 1); + if (border > hint->search_start) + hint->search_start = border; + + return 1; +} + +static inline int old_way(reiserfs_blocknr_hint_t * hint) +{ + b_blocknr_t border; + + if (hint->formatted_node || hint->inode == NULL) { + return 0; + } + + border = + hint->beg + + le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id) % (hint->end - + hint->beg); + if (border > hint->search_start) + hint->search_start = border; + + return 1; +} + +static inline void hundredth_slices(reiserfs_blocknr_hint_t * hint) +{ + struct in_core_key *key = &hint->key; + b_blocknr_t slice_start; + + slice_start = + (keyed_hash((char *)(&key->k_dir_id), 4) % 100) * (hint->end / 100); + if (slice_start > hint->search_start + || slice_start + (hint->end / 100) <= hint->search_start) { + hint->search_start = slice_start; + } +} + +static void determine_search_start(reiserfs_blocknr_hint_t * hint, + int amount_needed) +{ + struct super_block *s = hint->th->t_super; + int unfm_hint; + + hint->beg = 0; + hint->end = SB_BLOCK_COUNT(s) - 1; + + /* This is former border algorithm. 
Now with tunable border offset */ + if (concentrating_formatted_nodes(s)) + set_border_in_hint(s, hint); + +#ifdef DISPLACE_NEW_PACKING_LOCALITIES + /* + * whenever we create a new directory, we displace it. At first + * we will hash for location, later we might look for a moderately + * empty place for it + */ + if (displacing_new_packing_localities(s) + && hint->th->displace_new_blocks) { + displace_new_packing_locality(hint); + + /* + * we do not continue determine_search_start, + * if new packing locality is being displaced + */ + return; + } +#endif + + /* + * all persons should feel encouraged to add more special cases + * here and test them + */ + + if (displacing_large_files(s) && !hint->formatted_node + && this_blocknr_allocation_would_make_it_a_large_file(hint)) { + displace_large_file(hint); + return; + } + + /* + * if none of our special cases is relevant, use the left + * neighbor in the tree order of the new node we are allocating for + */ + if (hint->formatted_node && TEST_OPTION(hashed_formatted_nodes, s)) { + hash_formatted_node(hint); + return; + } + + unfm_hint = get_left_neighbor(hint); + + /* + * Mimic old block allocator behaviour, that is if VFS allowed for + * preallocation, new blocks are displaced based on directory ID. 
+ * Also, if suggested search_start is less than last preallocated + * block, we start searching from it, assuming that HDD dataflow + * is faster in forward direction + */ + if (TEST_OPTION(old_way, s)) { + if (!hint->formatted_node) { + if (!reiserfs_hashed_relocation(s)) + old_way(hint); + else if (!reiserfs_no_unhashed_relocation(s)) + old_hashed_relocation(hint); + + if (hint->inode + && hint->search_start < + REISERFS_I(hint->inode)->i_prealloc_block) + hint->search_start = + REISERFS_I(hint->inode)->i_prealloc_block; + } + return; + } + + /* This is an approach proposed by Hans */ + if (TEST_OPTION(hundredth_slices, s) + && !(displacing_large_files(s) && !hint->formatted_node)) { + hundredth_slices(hint); + return; + } + + /* old_hashed_relocation only works on unformatted */ + if (!unfm_hint && !hint->formatted_node && + TEST_OPTION(old_hashed_relocation, s)) { + old_hashed_relocation(hint); + } + + /* new_hashed_relocation works with both formatted/unformatted nodes */ + if ((!unfm_hint || hint->formatted_node) && + TEST_OPTION(new_hashed_relocation, s)) { + new_hashed_relocation(hint); + } + + /* dirid grouping works only on unformatted nodes */ + if (!unfm_hint && !hint->formatted_node && TEST_OPTION(dirid_groups, s)) { + dirid_groups(hint); + } +#ifdef DISPLACE_NEW_PACKING_LOCALITIES + if (hint->formatted_node && TEST_OPTION(dirid_groups, s)) { + dirid_groups(hint); + } +#endif + + /* oid grouping works only on unformatted nodes */ + if (!unfm_hint && !hint->formatted_node && TEST_OPTION(oid_groups, s)) { + oid_groups(hint); + } + return; +} + +static int determine_prealloc_size(reiserfs_blocknr_hint_t * hint) +{ + /* make minimum size a mount option and benchmark both ways */ + /* we preallocate blocks only for regular files, specific size */ + /* benchmark preallocating always and see what happens */ + + hint->prealloc_size = 0; + + if (!hint->formatted_node && hint->preallocate) { + if (S_ISREG(hint->inode->i_mode) + && hint->inode->i_size >= + 
REISERFS_SB(hint->th->t_super)->s_alloc_options. + preallocmin * hint->inode->i_sb->s_blocksize) + hint->prealloc_size = + REISERFS_SB(hint->th->t_super)->s_alloc_options. + preallocsize - 1; + } + return CARRY_ON; +} + +static inline int allocate_without_wrapping_disk(reiserfs_blocknr_hint_t * hint, + b_blocknr_t * new_blocknrs, + b_blocknr_t start, + b_blocknr_t finish, int min, + int amount_needed, + int prealloc_size) +{ + int rest = amount_needed; + int nr_allocated; + + while (rest > 0 && start <= finish) { + nr_allocated = scan_bitmap(hint->th, &start, finish, min, + rest + prealloc_size, + !hint->formatted_node, hint->block); + + if (nr_allocated == 0) /* no new blocks allocated, return */ + break; + + /* fill free_blocknrs array first */ + while (rest > 0 && nr_allocated > 0) { + *new_blocknrs++ = start++; + rest--; + nr_allocated--; + } + + /* do we have something to fill prealloc. array also ? */ + if (nr_allocated > 0) { + /* + * it means prealloc_size was greater that 0 and + * we do preallocation + */ + list_add(&REISERFS_I(hint->inode)->i_prealloc_list, + &SB_JOURNAL(hint->th->t_super)-> + j_prealloc_list); + REISERFS_I(hint->inode)->i_prealloc_block = start; + REISERFS_I(hint->inode)->i_prealloc_count = + nr_allocated; + break; + } + } + + return (amount_needed - rest); +} + +static inline int blocknrs_and_prealloc_arrays_from_search_start + (reiserfs_blocknr_hint_t * hint, b_blocknr_t * new_blocknrs, + int amount_needed) { + struct super_block *s = hint->th->t_super; + b_blocknr_t start = hint->search_start; + b_blocknr_t finish = SB_BLOCK_COUNT(s) - 1; + int passno = 0; + int nr_allocated = 0; + int depth; + + determine_prealloc_size(hint); + if (!hint->formatted_node) { + int quota_ret; +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(s, REISERFS_DEBUG_CODE, + "reiserquota: allocating %d blocks id=%u", + amount_needed, hint->inode->i_uid); +#endif + depth = reiserfs_write_unlock_nested(s); + quota_ret = + dquot_alloc_block_nodirty(hint->inode, 
amount_needed); + if (quota_ret) { /* Quota exceeded? */ + reiserfs_write_lock_nested(s, depth); + return QUOTA_EXCEEDED; + } + if (hint->preallocate && hint->prealloc_size) { +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(s, REISERFS_DEBUG_CODE, + "reiserquota: allocating (prealloc) %d blocks id=%u", + hint->prealloc_size, hint->inode->i_uid); +#endif + quota_ret = dquot_prealloc_block_nodirty(hint->inode, + hint->prealloc_size); + if (quota_ret) + hint->preallocate = hint->prealloc_size = 0; + } + /* for unformatted nodes, force large allocations */ + reiserfs_write_lock_nested(s, depth); + } + + do { + switch (passno++) { + case 0: /* Search from hint->search_start to end of disk */ + start = hint->search_start; + finish = SB_BLOCK_COUNT(s) - 1; + break; + case 1: /* Search from hint->beg to hint->search_start */ + start = hint->beg; + finish = hint->search_start; + break; + case 2: /* Last chance: Search from 0 to hint->beg */ + start = 0; + finish = hint->beg; + break; + default: + /* We've tried searching everywhere, not enough space */ + /* Free the blocks */ + if (!hint->formatted_node) { +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(s, REISERFS_DEBUG_CODE, + "reiserquota: freeing (nospace) %d blocks id=%u", + amount_needed + + hint->prealloc_size - + nr_allocated, + hint->inode->i_uid); +#endif + /* Free not allocated blocks */ + depth = reiserfs_write_unlock_nested(s); + dquot_free_block_nodirty(hint->inode, + amount_needed + hint->prealloc_size - + nr_allocated); + reiserfs_write_lock_nested(s, depth); + } + while (nr_allocated--) + reiserfs_free_block(hint->th, hint->inode, + new_blocknrs[nr_allocated], + !hint->formatted_node); + + return NO_DISK_SPACE; + } + } while ((nr_allocated += allocate_without_wrapping_disk(hint, + new_blocknrs + + nr_allocated, + start, finish, + 1, + amount_needed - + nr_allocated, + hint-> + prealloc_size)) + < amount_needed); + if (!hint->formatted_node && + amount_needed + hint->prealloc_size > + nr_allocated + 
REISERFS_I(hint->inode)->i_prealloc_count) { + /* Some of preallocation blocks were not allocated */ +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(s, REISERFS_DEBUG_CODE, + "reiserquota: freeing (failed prealloc) %d blocks id=%u", + amount_needed + hint->prealloc_size - + nr_allocated - + REISERFS_I(hint->inode)->i_prealloc_count, + hint->inode->i_uid); +#endif + + depth = reiserfs_write_unlock_nested(s); + dquot_free_block_nodirty(hint->inode, amount_needed + + hint->prealloc_size - nr_allocated - + REISERFS_I(hint->inode)-> + i_prealloc_count); + reiserfs_write_lock_nested(s, depth); + } + + return CARRY_ON; +} + +/* grab new blocknrs from preallocated list */ +/* return amount still needed after using them */ +static int use_preallocated_list_if_available(reiserfs_blocknr_hint_t * hint, + b_blocknr_t * new_blocknrs, + int amount_needed) +{ + struct inode *inode = hint->inode; + + if (REISERFS_I(inode)->i_prealloc_count > 0) { + while (amount_needed) { + + *new_blocknrs++ = REISERFS_I(inode)->i_prealloc_block++; + REISERFS_I(inode)->i_prealloc_count--; + + amount_needed--; + + if (REISERFS_I(inode)->i_prealloc_count <= 0) { + list_del(&REISERFS_I(inode)->i_prealloc_list); + break; + } + } + } + /* return amount still needed after using preallocated blocks */ + return amount_needed; +} + +int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *hint, + b_blocknr_t *new_blocknrs, + int amount_needed, + /* Amount of blocks we have already reserved */ + int reserved_by_us) +{ + int initial_amount_needed = amount_needed; + int ret; + struct super_block *s = hint->th->t_super; + + /* Check if there is enough space, taking into account reserved space */ + if (SB_FREE_BLOCKS(s) - REISERFS_SB(s)->reserved_blocks < + amount_needed - reserved_by_us) + return NO_DISK_SPACE; + /* should this be if !hint->inode && hint->preallocate? */ + /* do you mean hint->formatted_node can be removed ? 
- Zam */ + /* + * hint->formatted_node cannot be removed because we try to access + * inode information here, and there is often no inode associated with + * metadata allocations - green + */ + + if (!hint->formatted_node && hint->preallocate) { + amount_needed = use_preallocated_list_if_available + (hint, new_blocknrs, amount_needed); + + /* + * We have all the block numbers we need from the + * prealloc list + */ + if (amount_needed == 0) + return CARRY_ON; + new_blocknrs += (initial_amount_needed - amount_needed); + } + + /* find search start and save it in hint structure */ + determine_search_start(hint, amount_needed); + if (hint->search_start >= SB_BLOCK_COUNT(s)) + hint->search_start = SB_BLOCK_COUNT(s) - 1; + + /* allocation itself; fill new_blocknrs and preallocation arrays */ + ret = blocknrs_and_prealloc_arrays_from_search_start + (hint, new_blocknrs, amount_needed); + + /* + * We used prealloc. list to fill (partially) new_blocknrs array. + * If final allocation fails we need to return blocks back to + * prealloc. list or just free them. 
-- Zam (I chose second + * variant) + */ + if (ret != CARRY_ON) { + while (amount_needed++ < initial_amount_needed) { + reiserfs_free_block(hint->th, hint->inode, + *(--new_blocknrs), 1); + } + } + return ret; +} + +void reiserfs_cache_bitmap_metadata(struct super_block *sb, + struct buffer_head *bh, + struct reiserfs_bitmap_info *info) +{ + unsigned long *cur = (unsigned long *)(bh->b_data + bh->b_size); + + /* The first bit must ALWAYS be 1 */ + if (!reiserfs_test_le_bit(0, (unsigned long *)bh->b_data)) + reiserfs_error(sb, "reiserfs-2025", "bitmap block %lu is " + "corrupted: first bit must be 1", bh->b_blocknr); + + info->free_count = 0; + + while (--cur >= (unsigned long *)bh->b_data) { + /* 0 and ~0 are special, we can optimize for them */ + if (*cur == 0) + info->free_count += BITS_PER_LONG; + else if (*cur != ~0L) /* A mix, investigate */ + info->free_count += BITS_PER_LONG - hweight_long(*cur); + } +} + +struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb, + unsigned int bitmap) +{ + b_blocknr_t block = (sb->s_blocksize << 3) * bitmap; + struct reiserfs_bitmap_info *info = SB_AP_BITMAP(sb) + bitmap; + struct buffer_head *bh; + + /* + * Way old format filesystems had the bitmaps packed up front. + * I doubt there are any of these left, but just in case... 
+ */ + if (unlikely(test_bit(REISERFS_OLD_FORMAT, + &REISERFS_SB(sb)->s_properties))) + block = REISERFS_SB(sb)->s_sbh->b_blocknr + 1 + bitmap; + else if (bitmap == 0) + block = (REISERFS_DISK_OFFSET_IN_BYTES >> sb->s_blocksize_bits) + 1; + + bh = sb_bread(sb, block); + if (bh == NULL) + reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%u) " + "reading failed", __func__, block); + else { + if (buffer_locked(bh)) { + int depth; + PROC_INFO_INC(sb, scan_bitmap.wait); + depth = reiserfs_write_unlock_nested(sb); + __wait_on_buffer(bh); + reiserfs_write_lock_nested(sb, depth); + } + BUG_ON(!buffer_uptodate(bh)); + BUG_ON(atomic_read(&bh->b_count) == 0); + + if (info->free_count == UINT_MAX) + reiserfs_cache_bitmap_metadata(sb, bh, info); + } + + return bh; +} + +int reiserfs_init_bitmap_cache(struct super_block *sb) +{ + struct reiserfs_bitmap_info *bitmap; + unsigned int bmap_nr = reiserfs_bmap_count(sb); + + bitmap = vmalloc(sizeof(*bitmap) * bmap_nr); + if (bitmap == NULL) + return -ENOMEM; + + memset(bitmap, 0xff, sizeof(*bitmap) * bmap_nr); + + SB_AP_BITMAP(sb) = bitmap; + + return 0; +} + +void reiserfs_free_bitmap_cache(struct super_block *sb) +{ + if (SB_AP_BITMAP(sb)) { + vfree(SB_AP_BITMAP(sb)); + SB_AP_BITMAP(sb) = NULL; + } +} diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c new file mode 100644 index 000000000..4a024e2ce --- /dev/null +++ b/fs/reiserfs/dir.c @@ -0,0 +1,346 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include +#include +#include +#include "reiserfs.h" +#include +#include +#include +#include + +extern const struct reiserfs_key MIN_KEY; + +static int reiserfs_readdir(struct file *, struct dir_context *); +static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end, + int datasync); + +const struct file_operations reiserfs_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate = reiserfs_readdir, + .fsync = reiserfs_dir_fsync, + .unlocked_ioctl = 
reiserfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = reiserfs_compat_ioctl, +#endif +}; + +static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end, + int datasync) +{ + struct inode *inode = filp->f_mapping->host; + int err; + + err = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (err) + return err; + + mutex_lock(&inode->i_mutex); + reiserfs_write_lock(inode->i_sb); + err = reiserfs_commit_for_inode(inode); + reiserfs_write_unlock(inode->i_sb); + mutex_unlock(&inode->i_mutex); + if (err < 0) + return err; + return 0; +} + +#define store_ih(where,what) copy_item_head (where, what) + +static inline bool is_privroot_deh(struct inode *dir, struct reiserfs_de_head *deh) +{ + struct dentry *privroot = REISERFS_SB(dir->i_sb)->priv_root; + return (d_really_is_positive(privroot) && + deh->deh_objectid == INODE_PKEY(d_inode(privroot))->k_objectid); +} + +int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx) +{ + + /* key of current position in the directory (key of directory entry) */ + struct cpu_key pos_key; + + INITIALIZE_PATH(path_to_entry); + struct buffer_head *bh; + int item_num, entry_num; + const struct reiserfs_key *rkey; + struct item_head *ih, tmp_ih; + int search_res; + char *local_buf; + loff_t next_pos; + char small_buf[32]; /* avoid kmalloc if we can */ + struct reiserfs_dir_entry de; + int ret = 0; + int depth; + + reiserfs_write_lock(inode->i_sb); + + reiserfs_check_lock_depth(inode->i_sb, "readdir"); + + /* + * form key for search the next directory entry using + * f_pos field of file structure + */ + make_cpu_key(&pos_key, inode, ctx->pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3); + next_pos = cpu_key_k_offset(&pos_key); + + path_to_entry.reada = PATH_READA; + while (1) { +research: + /* + * search the directory item, containing entry with + * specified key + */ + search_res = + search_by_entry_key(inode->i_sb, &pos_key, &path_to_entry, + &de); + if (search_res == IO_ERROR) { + /* + * FIXME: we could just 
skip part of directory + * which could not be read + */ + ret = -EIO; + goto out; + } + entry_num = de.de_entry_num; + bh = de.de_bh; + item_num = de.de_item_num; + ih = de.de_ih; + store_ih(&tmp_ih, ih); + + /* we must have found item, that is item of this directory, */ + RFALSE(COMP_SHORT_KEYS(&ih->ih_key, &pos_key), + "vs-9000: found item %h does not match to dir we readdir %K", + ih, &pos_key); + RFALSE(item_num > B_NR_ITEMS(bh) - 1, + "vs-9005 item_num == %d, item amount == %d", + item_num, B_NR_ITEMS(bh)); + + /* + * and entry must be not more than number of entries + * in the item + */ + RFALSE(ih_entry_count(ih) < entry_num, + "vs-9010: entry number is too big %d (%d)", + entry_num, ih_entry_count(ih)); + + /* + * go through all entries in the directory item beginning + * from the entry, that has been found + */ + if (search_res == POSITION_FOUND + || entry_num < ih_entry_count(ih)) { + struct reiserfs_de_head *deh = + B_I_DEH(bh, ih) + entry_num; + + for (; entry_num < ih_entry_count(ih); + entry_num++, deh++) { + int d_reclen; + char *d_name; + ino_t d_ino; + loff_t cur_pos = deh_offset(deh); + + /* it is hidden entry */ + if (!de_visible(deh)) + continue; + d_reclen = entry_length(bh, ih, entry_num); + d_name = B_I_DEH_ENTRY_FILE_NAME(bh, ih, deh); + + if (d_reclen <= 0 || + d_name + d_reclen > bh->b_data + bh->b_size) { + /* + * There is corrupted data in entry, + * We'd better stop here + */ + pathrelse(&path_to_entry); + ret = -EIO; + goto out; + } + + if (!d_name[d_reclen - 1]) + d_reclen = strlen(d_name); + + /* too big to send back to VFS */ + if (d_reclen > + REISERFS_MAX_NAME(inode->i_sb-> + s_blocksize)) { + continue; + } + + /* Ignore the .reiserfs_priv entry */ + if (is_privroot_deh(inode, deh)) + continue; + + ctx->pos = deh_offset(deh); + d_ino = deh_objectid(deh); + if (d_reclen <= 32) { + local_buf = small_buf; + } else { + local_buf = kmalloc(d_reclen, + GFP_NOFS); + if (!local_buf) { + pathrelse(&path_to_entry); + ret = -ENOMEM; + goto 
out; + } + if (item_moved(&tmp_ih, &path_to_entry)) { + kfree(local_buf); + goto research; + } + } + + /* + * Note, that we copy name to user space via + * temporary buffer (local_buf) because + * filldir will block if user space buffer is + * swapped out. At that time entry can move to + * somewhere else + */ + memcpy(local_buf, d_name, d_reclen); + + /* + * Since filldir might sleep, we can release + * the write lock here for other waiters + */ + depth = reiserfs_write_unlock_nested(inode->i_sb); + if (!dir_emit + (ctx, local_buf, d_reclen, d_ino, + DT_UNKNOWN)) { + reiserfs_write_lock_nested(inode->i_sb, depth); + if (local_buf != small_buf) { + kfree(local_buf); + } + goto end; + } + reiserfs_write_lock_nested(inode->i_sb, depth); + if (local_buf != small_buf) { + kfree(local_buf); + } + + /* deh_offset(deh) may be invalid now. */ + next_pos = cur_pos + 1; + + if (item_moved(&tmp_ih, &path_to_entry)) { + set_cpu_key_k_offset(&pos_key, + next_pos); + goto research; + } + } /* for */ + } + + /* end of directory has been reached */ + if (item_num != B_NR_ITEMS(bh) - 1) + goto end; + + /* + * item we went through is last item of node. 
Using right + * delimiting key check is it directory end + */ + rkey = get_rkey(&path_to_entry, inode->i_sb); + if (!comp_le_keys(rkey, &MIN_KEY)) { + /* + * set pos_key to key, that is the smallest and greater + * that key of the last entry in the item + */ + set_cpu_key_k_offset(&pos_key, next_pos); + continue; + } + + /* end of directory has been reached */ + if (COMP_SHORT_KEYS(rkey, &pos_key)) { + goto end; + } + + /* directory continues in the right neighboring block */ + set_cpu_key_k_offset(&pos_key, + le_key_k_offset(KEY_FORMAT_3_5, rkey)); + + } /* while */ + +end: + ctx->pos = next_pos; + pathrelse(&path_to_entry); + reiserfs_check_path(&path_to_entry); +out: + reiserfs_write_unlock(inode->i_sb); + return ret; +} + +static int reiserfs_readdir(struct file *file, struct dir_context *ctx) +{ + return reiserfs_readdir_inode(file_inode(file), ctx); +} + +/* + * compose directory item containing "." and ".." entries (entries are + * not aligned to 4 byte boundary) + */ +void make_empty_dir_item_v1(char *body, __le32 dirid, __le32 objid, + __le32 par_dirid, __le32 par_objid) +{ + struct reiserfs_de_head *dot, *dotdot; + + memset(body, 0, EMPTY_DIR_SIZE_V1); + dot = (struct reiserfs_de_head *)body; + dotdot = dot + 1; + + /* direntry header of "." */ + put_deh_offset(dot, DOT_OFFSET); + /* these two are from make_le_item_head, and are are LE */ + dot->deh_dir_id = dirid; + dot->deh_objectid = objid; + dot->deh_state = 0; /* Endian safe if 0 */ + put_deh_location(dot, EMPTY_DIR_SIZE_V1 - strlen(".")); + mark_de_visible(dot); + + /* direntry header of ".." */ + put_deh_offset(dotdot, DOT_DOT_OFFSET); + /* key of ".." for the root directory */ + /* these two are from the inode, and are are LE */ + dotdot->deh_dir_id = par_dirid; + dotdot->deh_objectid = par_objid; + dotdot->deh_state = 0; /* Endian safe if 0 */ + put_deh_location(dotdot, deh_location(dot) - strlen("..")); + mark_de_visible(dotdot); + + /* copy ".." and "." 
*/ + memcpy(body + deh_location(dot), ".", 1); + memcpy(body + deh_location(dotdot), "..", 2); +} + +/* compose directory item containing "." and ".." entries */ +void make_empty_dir_item(char *body, __le32 dirid, __le32 objid, + __le32 par_dirid, __le32 par_objid) +{ + struct reiserfs_de_head *dot, *dotdot; + + memset(body, 0, EMPTY_DIR_SIZE); + dot = (struct reiserfs_de_head *)body; + dotdot = dot + 1; + + /* direntry header of "." */ + put_deh_offset(dot, DOT_OFFSET); + /* these two are from make_le_item_head, and are are LE */ + dot->deh_dir_id = dirid; + dot->deh_objectid = objid; + dot->deh_state = 0; /* Endian safe if 0 */ + put_deh_location(dot, EMPTY_DIR_SIZE - ROUND_UP(strlen("."))); + mark_de_visible(dot); + + /* direntry header of ".." */ + put_deh_offset(dotdot, DOT_DOT_OFFSET); + /* key of ".." for the root directory */ + /* these two are from the inode, and are are LE */ + dotdot->deh_dir_id = par_dirid; + dotdot->deh_objectid = par_objid; + dotdot->deh_state = 0; /* Endian safe if 0 */ + put_deh_location(dotdot, deh_location(dot) - ROUND_UP(strlen(".."))); + mark_de_visible(dotdot); + + /* copy ".." and "." */ + memcpy(body + deh_location(dot), ".", 1); + memcpy(body + deh_location(dotdot), "..", 2); +} diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c new file mode 100644 index 000000000..9c02d96d3 --- /dev/null +++ b/fs/reiserfs/do_balan.c @@ -0,0 +1,1911 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +/* + * Now we have all buffers that must be used in balancing of the tree + * Further calculations can not cause schedule(), and thus the buffer + * tree will be stable until the balancing will be finished + * balance the tree according to the analysis made before, + * and using buffers obtained after all above. 
+ */ + +#include +#include +#include "reiserfs.h" +#include +#include +/* aim @bi at the left neighbor: buffer L[0], parent FL[0], position from get_left_neighbor_position() */ +static inline void buffer_info_init_left(struct tree_balance *tb, + struct buffer_info *bi) +{ + bi->tb = tb; + bi->bi_bh = tb->L[0]; + bi->bi_parent = tb->FL[0]; + bi->bi_position = get_left_neighbor_position(tb, 0); +} +/* aim @bi at the right neighbor: buffer R[0], parent FR[0], position from get_right_neighbor_position() */ +static inline void buffer_info_init_right(struct tree_balance *tb, + struct buffer_info *bi) +{ + bi->tb = tb; + bi->bi_bh = tb->R[0]; + bi->bi_parent = tb->FR[0]; + bi->bi_position = get_right_neighbor_position(tb, 0); +} +/* aim @bi at the last buffer of tb->tb_path; parent and position are read from the same path */ +static inline void buffer_info_init_tbS0(struct tree_balance *tb, + struct buffer_info *bi) +{ + bi->tb = tb; + bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path); + bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0); + bi->bi_position = PATH_H_POSITION(tb->tb_path, 1); +} +/* aim @bi at an arbitrary buffer @bh; parent is set to NULL and position to 0 */ +static inline void buffer_info_init_bh(struct tree_balance *tb, + struct buffer_info *bi, + struct buffer_head *bh) +{ + bi->tb = tb; + bi->bi_bh = bh; + bi->bi_parent = NULL; + bi->bi_position = 0; +} +/* hand @bh to journal_mark_dirty() under tb->transaction_handle; @flag is not used */ +inline void do_balance_mark_leaf_dirty(struct tree_balance *tb, + struct buffer_head *bh, int flag) +{ + journal_mark_dirty(tb->transaction_handle, bh); +} +/* internal nodes and the super block are marked dirty by the same routine */ +#define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty +#define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty + +/* + * summary: + * if deleting something ( tb->insert_size[0] < 0 ) + * return(balance_leaf_when_delete()); (flag d handled here) + * else + * if lnum is larger than 0 we put items into the left node + * if rnum is larger than 0 we put items into the right node + * if snum1 is larger than 0 we put items into the new node s1 + * if snum2 is larger than 0 we put items into the new node s2 + * Note that all *num* count new items being created. 
+ */ +/* M_DELETE: remove the whole item at the last position of the path from S[0]; if that was item 0 and CFL[0] exists, replace the key at lkey[0] in CFL[0] */ +static void balance_leaf_when_delete_del(struct tree_balance *tb) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int item_pos = PATH_LAST_POSITION(tb->tb_path); + struct buffer_info bi; +#ifdef CONFIG_REISERFS_CHECK + struct item_head *ih = item_head(tbS0, item_pos); +#endif + + RFALSE(ih_item_len(ih) + IH_SIZE != -tb->insert_size[0], + "vs-12013: mode Delete, insert size %d, ih to be deleted %h", + -tb->insert_size[0], ih); + + buffer_info_init_tbS0(tb, &bi); + leaf_delete_items(&bi, 0, item_pos, 1, -1); + + if (!item_pos && tb->CFL[0]) { + if (B_NR_ITEMS(tbS0)) { + replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0); + } else { + if (!PATH_H_POSITION(tb->tb_path, 1)) + replace_key(tb, tb->CFL[0], tb->lkey[0], + PATH_H_PPARENT(tb->tb_path, 0), 0); + } + } + + RFALSE(!item_pos && !tb->CFL[0], + "PAP-12020: tb->CFL[0]==%p, tb->L[0]==%p", tb->CFL[0], + tb->L[0]); +} + +/* M_CUT: cut item in S[0]; exactly one entry for a directory item, otherwise by -insert_size[0] */ +static void balance_leaf_when_delete_cut(struct tree_balance *tb) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int item_pos = PATH_LAST_POSITION(tb->tb_path); + struct item_head *ih = item_head(tbS0, item_pos); + int pos_in_item = tb->tb_path->pos_in_item; + struct buffer_info bi; + buffer_info_init_tbS0(tb, &bi); + + if (is_direntry_le_ih(ih)) { + /* + * UFS unlink semantics are such that you can only + * delete one directory entry at a time. + * + * when we cut a directory tb->insert_size[0] means + * number of entries to be cut (always 1) + */ + tb->insert_size[0] = -1; + leaf_cut_from_buffer(&bi, item_pos, pos_in_item, + -tb->insert_size[0]); + + RFALSE(!item_pos && !pos_in_item && !tb->CFL[0], + "PAP-12030: can not change delimiting key. 
CFL[0]=%p", + tb->CFL[0]); + + if (!item_pos && !pos_in_item && tb->CFL[0]) + replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0); + } else { + leaf_cut_from_buffer(&bi, item_pos, pos_in_item, + -tb->insert_size[0]); + + RFALSE(!ih_item_len(ih), + "PAP-12035: cut must leave non-zero dynamic " + "length of item"); + } +} + +static int balance_leaf_when_delete_left(struct tree_balance *tb) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n = B_NR_ITEMS(tbS0); + + /* L[0] must be joined with S[0] */ + if (tb->lnum[0] == -1) { + /* R[0] must be also joined with S[0] */ + if (tb->rnum[0] == -1) { + if (tb->FR[0] == PATH_H_PPARENT(tb->tb_path, 0)) { + /* + * all contents of all the + * 3 buffers will be in L[0] + */ + if (PATH_H_POSITION(tb->tb_path, 1) == 0 && + 1 < B_NR_ITEMS(tb->FR[0])) + replace_key(tb, tb->CFL[0], + tb->lkey[0], tb->FR[0], 1); + + leaf_move_items(LEAF_FROM_S_TO_L, tb, n, -1, + NULL); + leaf_move_items(LEAF_FROM_R_TO_L, tb, + B_NR_ITEMS(tb->R[0]), -1, + NULL); + + reiserfs_invalidate_buffer(tb, tbS0); + reiserfs_invalidate_buffer(tb, tb->R[0]); + + return 0; + } + + /* all contents of all the 3 buffers will be in R[0] */ + leaf_move_items(LEAF_FROM_S_TO_R, tb, n, -1, NULL); + leaf_move_items(LEAF_FROM_L_TO_R, tb, + B_NR_ITEMS(tb->L[0]), -1, NULL); + + /* right_delimiting_key is correct in R[0] */ + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0); + + reiserfs_invalidate_buffer(tb, tbS0); + reiserfs_invalidate_buffer(tb, tb->L[0]); + + return -1; + } + + RFALSE(tb->rnum[0] != 0, + "PAP-12045: rnum must be 0 (%d)", tb->rnum[0]); + /* all contents of L[0] and S[0] will be in L[0] */ + leaf_shift_left(tb, n, -1); + + reiserfs_invalidate_buffer(tb, tbS0); + + return 0; + } + + /* + * a part of contents of S[0] will be in L[0] and + * the rest part of S[0] will be in R[0] + */ + + RFALSE((tb->lnum[0] + tb->rnum[0] < n) || + (tb->lnum[0] + tb->rnum[0] > n + 1), + "PAP-12050: rnum(%d) and lnum(%d) and item " + "number(%d) in S[0] are 
not consistent", + tb->rnum[0], tb->lnum[0], n); + RFALSE((tb->lnum[0] + tb->rnum[0] == n) && + (tb->lbytes != -1 || tb->rbytes != -1), + "PAP-12055: bad rbytes (%d)/lbytes (%d) " + "parameters when items are not split", + tb->rbytes, tb->lbytes); + RFALSE((tb->lnum[0] + tb->rnum[0] == n + 1) && + (tb->lbytes < 1 || tb->rbytes != -1), + "PAP-12060: bad rbytes (%d)/lbytes (%d) " + "parameters when items are split", + tb->rbytes, tb->lbytes); + + leaf_shift_left(tb, tb->lnum[0], tb->lbytes); + leaf_shift_right(tb, tb->rnum[0], tb->rbytes); + + reiserfs_invalidate_buffer(tb, tbS0); + + return 0; +} + +/* + * Balance leaf node in case of delete or cut: insert_size[0] < 0 + * + * lnum, rnum can have values >= -1 + * -1 means that the neighbor must be joined with S + * 0 means that nothing should be done with the neighbor + * >0 means to shift entirely or partly the specified number of items + * to the neighbor + */ +static int balance_leaf_when_delete(struct tree_balance *tb, int flag) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int item_pos = PATH_LAST_POSITION(tb->tb_path); + struct buffer_info bi; + int n; + struct item_head *ih; + + RFALSE(tb->FR[0] && B_LEVEL(tb->FR[0]) != DISK_LEAF_NODE_LEVEL + 1, + "vs- 12000: level: wrong FR %z", tb->FR[0]); + RFALSE(tb->blknum[0] > 1, + "PAP-12005: tb->blknum == %d, can not be > 1", tb->blknum[0]); + RFALSE(!tb->blknum[0] && !PATH_H_PPARENT(tb->tb_path, 0), + "PAP-12010: tree can not be empty"); + + ih = item_head(tbS0, item_pos); + buffer_info_init_tbS0(tb, &bi); + + /* Delete or truncate the item */ + + BUG_ON(flag != M_DELETE && flag != M_CUT); + if (flag == M_DELETE) + balance_leaf_when_delete_del(tb); + else /* M_CUT */ + balance_leaf_when_delete_cut(tb); + + + /* + * the rule is that no shifting occurs unless by shifting + * a node can be freed + */ + n = B_NR_ITEMS(tbS0); + + + /* L[0] takes part in balancing */ + if (tb->lnum[0]) + return balance_leaf_when_delete_left(tb); + + if (tb->rnum[0] == 
-1) { + /* all contents of R[0] and S[0] will be in R[0] */ + leaf_shift_right(tb, n, -1); + reiserfs_invalidate_buffer(tb, tbS0); + return 0; + } + + RFALSE(tb->rnum[0], + "PAP-12065: bad rnum parameter must be 0 (%d)", tb->rnum[0]); + return 0; +} +/* M_INSERT towards L[0]: put the new item, or its first lbytes, into L[0]; the result is body_shift_bytes, set only when part of the item went left and lbytes exceeds zeroes_num */ +static unsigned int balance_leaf_insert_left(struct tree_balance *tb, + struct item_head *const ih, + const char * const body) +{ + int ret; + struct buffer_info bi; + int n = B_NR_ITEMS(tb->L[0]); + unsigned body_shift_bytes = 0; + + if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) { + /* part of new item falls into L[0] */ + int new_item_len, shift; + int version; + + ret = leaf_shift_left(tb, tb->lnum[0] - 1, -1); + + /* Calculate item length to insert to S[0] */ + new_item_len = ih_item_len(ih) - tb->lbytes; + + /* Calculate and check item length to insert to L[0] */ + put_ih_item_len(ih, ih_item_len(ih) - new_item_len); + + RFALSE(ih_item_len(ih) <= 0, + "PAP-12080: there is nothing to insert into L[0]: " + "ih_item_len=%d", ih_item_len(ih)); + + /* Insert new item into L[0] */ + buffer_info_init_left(tb, &bi); + leaf_insert_into_buf(&bi, n + tb->item_pos - ret, ih, body, + min_t(int, tb->zeroes_num, ih_item_len(ih))); + /* NOTE(review): version is assigned here but not read in this branch */ + version = ih_version(ih); + + /* + * Calculate key component, item length and body to + * insert into S[0] + */ + shift = 0; + if (is_indirect_le_ih(ih)) + shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT; + + add_le_ih_k_offset(ih, tb->lbytes << shift); + + put_ih_item_len(ih, new_item_len); + if (tb->lbytes > tb->zeroes_num) { + body_shift_bytes = tb->lbytes - tb->zeroes_num; + tb->zeroes_num = 0; + } else + tb->zeroes_num -= tb->lbytes; + + RFALSE(ih_item_len(ih) <= 0, + "PAP-12085: there is nothing to insert into S[0]: " + "ih_item_len=%d", ih_item_len(ih)); + } else { + /* new item in whole falls into L[0] */ + /* Shift lnum[0]-1 items to L[0] */ + ret = leaf_shift_left(tb, tb->lnum[0] - 1, tb->lbytes); + + /* Insert new item into L[0] */ + buffer_info_init_left(tb, &bi); + 
leaf_insert_into_buf(&bi, n + tb->item_pos - ret, ih, body, + tb->zeroes_num); + tb->insert_size[0] = 0; + tb->zeroes_num = 0; + } + return body_shift_bytes; +} +/* paste into a directory item that is split towards L[0]: shift entries left and, when the new entry lands in L[0], paste it there and clear insert_size[0] */ +static void balance_leaf_paste_left_shift_dirent(struct tree_balance *tb, + struct item_head * const ih, + const char * const body) +{ + int n = B_NR_ITEMS(tb->L[0]); + struct buffer_info bi; + + RFALSE(tb->zeroes_num, + "PAP-12090: invalid parameter in case of a directory"); + + /* directory item */ + if (tb->lbytes > tb->pos_in_item) { + /* new directory entry falls into L[0] */ + struct item_head *pasted; + int ret, l_pos_in_item = tb->pos_in_item; + + /* + * Shift lnum[0] - 1 items in whole. + * Shift lbytes - 1 entries from given directory item + */ + ret = leaf_shift_left(tb, tb->lnum[0], tb->lbytes - 1); + if (ret && !tb->item_pos) { + pasted = item_head(tb->L[0], B_NR_ITEMS(tb->L[0]) - 1); + l_pos_in_item += ih_entry_count(pasted) - + (tb->lbytes - 1); + } + + /* Append given directory entry to directory item */ + buffer_info_init_left(tb, &bi); + leaf_paste_in_buffer(&bi, n + tb->item_pos - ret, + l_pos_in_item, tb->insert_size[0], + body, tb->zeroes_num); + + /* + * the call above prepared space for pasting the new entry, + * the call below pastes this entry + */ + + /* + * when directory items have been merged, pos_in_item + * has been changed too + */ + + /* paste new directory entry. 1 is entry number */ + leaf_paste_entries(&bi, n + tb->item_pos - ret, + l_pos_in_item, 1, + (struct reiserfs_de_head *) body, + body + DEH_SIZE, tb->insert_size[0]); + tb->insert_size[0] = 0; + } else { + /* new directory entry doesn't fall into L[0] */ + /* + * Shift lnum[0]-1 items in whole. 
Shift lbytes + * directory entries from directory item number lnum[0] + */ + leaf_shift_left(tb, tb->lnum[0], tb->lbytes); + } + + /* Calculate new position to append in item body */ + tb->pos_in_item -= tb->lbytes; +} +/* paste when the pasted item is the one split towards L[0]; directory items go to the _dirent helper and yield 0, other items yield body_shift_bytes */ +static unsigned int balance_leaf_paste_left_shift(struct tree_balance *tb, + struct item_head * const ih, + const char * const body) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n = B_NR_ITEMS(tb->L[0]); + struct buffer_info bi; + int body_shift_bytes = 0; + + if (is_direntry_le_ih(item_head(tbS0, tb->item_pos))) { + balance_leaf_paste_left_shift_dirent(tb, ih, body); + return 0; + } + + RFALSE(tb->lbytes <= 0, + "PAP-12095: there is nothing to shift to L[0]. " + "lbytes=%d", tb->lbytes); + RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)), + "PAP-12100: incorrect position to paste: " + "item_len=%d, pos_in_item=%d", + ih_item_len(item_head(tbS0, tb->item_pos)), tb->pos_in_item); + + /* appended item will be in L[0] in whole */ + if (tb->lbytes >= tb->pos_in_item) { + struct item_head *tbS0_pos_ih, *tbL0_ih; + struct item_head *tbS0_0_ih; + struct reiserfs_key *left_delim_key; + int ret, l_n, version, temp_l; + + tbS0_pos_ih = item_head(tbS0, tb->item_pos); + tbS0_0_ih = item_head(tbS0, 0); + + /* + * this many bytes must be appended + * to the last item of L[0] + */ + l_n = tb->lbytes - tb->pos_in_item; + + /* Calculate new insert_size[0] */ + tb->insert_size[0] -= l_n; + + RFALSE(tb->insert_size[0] <= 0, + "PAP-12105: there is nothing to paste into " + "L[0]. 
insert_size=%d", tb->insert_size[0]); + + ret = leaf_shift_left(tb, tb->lnum[0], + ih_item_len(tbS0_pos_ih)); + + tbL0_ih = item_head(tb->L[0], n + tb->item_pos - ret); + + /* Append to body of item in L[0] */ + buffer_info_init_left(tb, &bi); + leaf_paste_in_buffer(&bi, n + tb->item_pos - ret, + ih_item_len(tbL0_ih), l_n, body, + min_t(int, l_n, tb->zeroes_num)); + + /* + * 0-th item in S0 can be only of DIRECT type + * when l_n != 0 + */ + temp_l = l_n; + + RFALSE(ih_item_len(tbS0_0_ih), + "PAP-12106: item length must be 0"); + RFALSE(comp_short_le_keys(&tbS0_0_ih->ih_key, + leaf_key(tb->L[0], n + tb->item_pos - ret)), + "PAP-12107: items must be of the same file"); + + if (is_indirect_le_ih(tbL0_ih)) { + int shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT; + temp_l = l_n << shift; + } + /* update key of first item in S0 */ + version = ih_version(tbS0_0_ih); + add_le_key_k_offset(version, &tbS0_0_ih->ih_key, temp_l); + + /* update left delimiting key */ + left_delim_key = internal_key(tb->CFL[0], tb->lkey[0]); + add_le_key_k_offset(version, left_delim_key, temp_l); + + /* + * Calculate new body, position in item and + * insert_size[0] + */ + if (l_n > tb->zeroes_num) { + body_shift_bytes = l_n - tb->zeroes_num; + tb->zeroes_num = 0; + } else + tb->zeroes_num -= l_n; + tb->pos_in_item = 0; + + RFALSE(comp_short_le_keys(&tbS0_0_ih->ih_key, + leaf_key(tb->L[0], + B_NR_ITEMS(tb->L[0]) - 1)) || + !op_is_left_mergeable(leaf_key(tbS0, 0), tbS0->b_size) || + !op_is_left_mergeable(left_delim_key, tbS0->b_size), + "PAP-12120: item must be merge-able with left " + "neighboring item"); + } else { + /* only part of the appended item will be in L[0] */ + + /* Calculate position in item for append in S[0] */ + tb->pos_in_item -= tb->lbytes; + + RFALSE(tb->pos_in_item <= 0, + "PAP-12125: no place for paste. pos_in_item=%d", + tb->pos_in_item); + + /* + * Shift lnum[0] - 1 items in whole. 
+ * Shift lbytes - 1 byte from item number lnum[0] + */ + leaf_shift_left(tb, tb->lnum[0], tb->lbytes); + } + return body_shift_bytes; +} + + +/* appended item will be in L[0] in whole: shift, paste there, then clear insert_size[0] and zeroes_num */ +static void balance_leaf_paste_left_whole(struct tree_balance *tb, + struct item_head * const ih, + const char * const body) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n = B_NR_ITEMS(tb->L[0]); + struct buffer_info bi; + struct item_head *pasted; + int ret; + + /* if we paste into first item of S[0] and it is left mergeable */ + if (!tb->item_pos && + op_is_left_mergeable(leaf_key(tbS0, 0), tbS0->b_size)) { + /* + * then increment pos_in_item by the size of the + * last item in L[0] (entry count for a directory item, + * item length otherwise) + */ + pasted = item_head(tb->L[0], n - 1); + if (is_direntry_le_ih(pasted)) + tb->pos_in_item += ih_entry_count(pasted); + else + tb->pos_in_item += ih_item_len(pasted); + } + + /* + * Shift lnum[0] - 1 items in whole. + * Shift lbytes - 1 byte from item number lnum[0] + */ + ret = leaf_shift_left(tb, tb->lnum[0], tb->lbytes); + + /* Append to body of item in L[0] */ + buffer_info_init_left(tb, &bi); + leaf_paste_in_buffer(&bi, n + tb->item_pos - ret, tb->pos_in_item, + tb->insert_size[0], body, tb->zeroes_num); + + /* if appended item is directory, paste entry */ + pasted = item_head(tb->L[0], n + tb->item_pos - ret); + if (is_direntry_le_ih(pasted)) + leaf_paste_entries(&bi, n + tb->item_pos - ret, + tb->pos_in_item, 1, + (struct reiserfs_de_head *)body, + body + DEH_SIZE, tb->insert_size[0]); + + /* + * if appended item is indirect item, reset its free space + * field to 0 + */ + if (is_indirect_le_ih(pasted)) + set_ih_free_space(pasted, 0); + + tb->insert_size[0] = 0; + tb->zeroes_num = 0; +} +/* M_PASTE towards L[0]: pick the split-item path or the whole-item path; only the split path can report a non-zero body shift */ +static unsigned int balance_leaf_paste_left(struct tree_balance *tb, + struct item_head * const ih, + const char * const body) +{ + /* we must shift the part of the appended item */ + if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) + return 
balance_leaf_paste_left_shift(tb, ih, body); + else + balance_leaf_paste_left_whole(tb, ih, body); + return 0; +} + +/* + * Shift lnum[0] items from S[0] to the left neighbor L[0]. + * Nothing is done when lnum[0] <= 0. The result is whatever the + * insert/paste helper reports, or 0 when the new item stays out of L[0]. + */ +static unsigned int balance_leaf_left(struct tree_balance *tb, + struct item_head * const ih, + const char * const body, int flag) +{ + if (tb->lnum[0] <= 0) + return 0; + + /* new item or its part falls to L[0], shift it too */ + if (tb->item_pos < tb->lnum[0]) { + BUG_ON(flag != M_INSERT && flag != M_PASTE); + + if (flag == M_INSERT) + return balance_leaf_insert_left(tb, ih, body); + else /* M_PASTE */ + return balance_leaf_paste_left(tb, ih, body); + } else + /* new item doesn't fall into L[0] */ + leaf_shift_left(tb, tb->lnum[0], tb->lbytes); + return 0; +} + +/* M_INSERT towards R[0]: shift items right and insert the new item, or its last rbytes, into R[0] */ +static void balance_leaf_insert_right(struct tree_balance *tb, + struct item_head * const ih, + const char * const body) +{ + + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n = B_NR_ITEMS(tbS0); + struct buffer_info bi; + int ret; + + /* new item or part of it doesn't fall into R[0] */ + if (n - tb->rnum[0] >= tb->item_pos) { + leaf_shift_right(tb, tb->rnum[0], tb->rbytes); + return; + } + + /* new item or its part falls to R[0] */ + + /* part of new item falls into R[0] */ + if (tb->item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1) { + loff_t old_key_comp, old_len, r_zeroes_number; + const char *r_body; + int version, shift; + loff_t offset; + + leaf_shift_right(tb, tb->rnum[0] - 1, -1); + + version = ih_version(ih); + + /* Remember key component and item length */ + old_key_comp = le_ih_k_offset(ih); + old_len = ih_item_len(ih); + + /* + * Calculate key component and item length to insert + * into R[0] + */ + shift = 0; + if (is_indirect_le_ih(ih)) + shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT; + offset = le_ih_k_offset(ih) + ((old_len - tb->rbytes) << shift); + set_le_ih_k_offset(ih, offset); + put_ih_item_len(ih, tb->rbytes); + + /* Insert part of the item into R[0] */ + 
buffer_info_init_right(tb, &bi); + if ((old_len - tb->rbytes) > tb->zeroes_num) { + r_zeroes_number = 0; + r_body = body + (old_len - tb->rbytes) - tb->zeroes_num; + } else { + r_body = body; + r_zeroes_number = tb->zeroes_num - + (old_len - tb->rbytes); + tb->zeroes_num -= r_zeroes_number; + } + + leaf_insert_into_buf(&bi, 0, ih, r_body, r_zeroes_number); + + /* Replace right delimiting key by first key in R[0] */ + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0); + + /* + * Restore the saved key component and set the item length + * of the part that is still to be inserted into S[0] + */ + set_le_ih_k_offset(ih, old_key_comp); + put_ih_item_len(ih, old_len - tb->rbytes); + + tb->insert_size[0] -= tb->rbytes; + + } else { + /* whole new item falls into R[0] */ + + /* Shift rnum[0]-1 items to R[0]; NOTE(review): ret is not read afterwards */ + ret = leaf_shift_right(tb, tb->rnum[0] - 1, tb->rbytes); + + /* Insert new item into R[0] */ + buffer_info_init_right(tb, &bi); + leaf_insert_into_buf(&bi, tb->item_pos - n + tb->rnum[0] - 1, + ih, body, tb->zeroes_num); + /* inserted at position 0 of R[0]: replace the right delimiting key too */ + if (tb->item_pos - n + tb->rnum[0] - 1 == 0) + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0); + + tb->zeroes_num = tb->insert_size[0] = 0; + } +} + +/* paste into a directory item that is split towards R[0]: shift entries right and, when the new entry lands in R[0], paste it there */ +static void balance_leaf_paste_right_shift_dirent(struct tree_balance *tb, + struct item_head * const ih, + const char * const body) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + struct buffer_info bi; + int entry_count; + + RFALSE(tb->zeroes_num, + "PAP-12145: invalid parameter in case of a directory"); + entry_count = ih_entry_count(item_head(tbS0, tb->item_pos)); + + /* new directory entry falls into R[0] */ + if (entry_count - tb->rbytes < tb->pos_in_item) { + int paste_entry_position; + + RFALSE(tb->rbytes - 1 >= entry_count || !tb->insert_size[0], + "PAP-12150: no enough of entries to shift to R[0]: " + "rbytes=%d, entry_count=%d", tb->rbytes, entry_count); + + /* + * Shift rnum[0]-1 items in whole. 
+ * Shift rbytes-1 directory entries from directory + * item number rnum[0] + */ + leaf_shift_right(tb, tb->rnum[0], tb->rbytes - 1); + + /* Paste given directory entry to directory item */ + paste_entry_position = tb->pos_in_item - entry_count + + tb->rbytes - 1; + buffer_info_init_right(tb, &bi); + leaf_paste_in_buffer(&bi, 0, paste_entry_position, + tb->insert_size[0], body, tb->zeroes_num); + + /* paste entry */ + leaf_paste_entries(&bi, 0, paste_entry_position, 1, + (struct reiserfs_de_head *) body, + body + DEH_SIZE, tb->insert_size[0]); + + /* change delimiting keys */ + if (paste_entry_position == 0) + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0); + + tb->insert_size[0] = 0; + tb->pos_in_item++; + } else { + /* new directory entry doesn't fall into R[0] */ + leaf_shift_right(tb, tb->rnum[0], tb->rbytes); + } +} + +/* + * Paste when the pasted item is the one split between S[0] and R[0]. + * Directory items are handed to balance_leaf_paste_right_shift_dirent(). + * For other items n_shift bytes of the old item are shifted to R[0], + * insert_size[0] - n_rem bytes of the new data are pasted there, and + * insert_size[0] is left at n_rem. + */ +static void balance_leaf_paste_right_shift(struct tree_balance *tb, + struct item_head * const ih, + const char * const body) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n_shift, n_rem, r_zeroes_number, version; + unsigned long temp_rem; + const char *r_body; + struct buffer_info bi; + + /* we append to directory item */ + if (is_direntry_le_ih(item_head(tbS0, tb->item_pos))) { + balance_leaf_paste_right_shift_dirent(tb, ih, body); + return; + } + + /* regular object */ + + /* + * Calculate number of bytes which must be shifted + * from appended item + */ + n_shift = tb->rbytes - tb->insert_size[0]; + if (n_shift < 0) + n_shift = 0; + + /* arguments follow the order of the labels in the message */ + RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)), + "PAP-12155: invalid position to paste. 
ih_item_len=%d, " + "pos_in_item=%d", + ih_item_len(item_head(tbS0, tb->item_pos)), + tb->pos_in_item); + + leaf_shift_right(tb, tb->rnum[0], n_shift); + + /* + * Calculate number of bytes which must remain in body + * after appending to R[0] + */ + n_rem = tb->insert_size[0] - tb->rbytes; + if (n_rem < 0) + n_rem = 0; + + temp_rem = n_rem; + + version = ih_version(item_head(tb->R[0], 0)); + + if (is_indirect_le_key(version, leaf_key(tb->R[0], 0))) { + int shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT; + temp_rem = n_rem << shift; + } + + /* advance both the first key of R[0] and the right delimiting key */ + add_le_key_k_offset(version, leaf_key(tb->R[0], 0), temp_rem); + add_le_key_k_offset(version, internal_key(tb->CFR[0], tb->rkey[0]), + temp_rem); + + do_balance_mark_internal_dirty(tb, tb->CFR[0], 0); + + /* Append part of body into R[0] */ + buffer_info_init_right(tb, &bi); + if (n_rem > tb->zeroes_num) { + r_zeroes_number = 0; + r_body = body + n_rem - tb->zeroes_num; + } else { + r_body = body; + r_zeroes_number = tb->zeroes_num - n_rem; + tb->zeroes_num -= r_zeroes_number; + } + + leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0] - n_rem, + r_body, r_zeroes_number); + + if (is_indirect_le_ih(item_head(tb->R[0], 0))) + set_ih_free_space(item_head(tb->R[0], 0), 0); + + tb->insert_size[0] = n_rem; + if (!n_rem) + tb->pos_in_item++; +} + +static void balance_leaf_paste_right_whole(struct tree_balance *tb, + struct item_head * const ih, + const char * const body) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n = B_NR_ITEMS(tbS0); + struct item_head *pasted; + struct buffer_info bi; + + buffer_info_init_right(tb, &bi); + leaf_shift_right(tb, tb->rnum[0], tb->rbytes); + + /* append item in R[0] */ + if (tb->pos_in_item >= 0) { + buffer_info_init_right(tb, &bi); + leaf_paste_in_buffer(&bi, tb->item_pos - n + tb->rnum[0], + tb->pos_in_item, tb->insert_size[0], body, + tb->zeroes_num); + } + + /* paste new entry, if item is directory item */ + pasted = item_head(tb->R[0], tb->item_pos - n + 
tb->rnum[0]); + if (is_direntry_le_ih(pasted) && tb->pos_in_item >= 0) { + leaf_paste_entries(&bi, tb->item_pos - n + tb->rnum[0], + tb->pos_in_item, 1, + (struct reiserfs_de_head *)body, + body + DEH_SIZE, tb->insert_size[0]); + + if (!tb->pos_in_item) { + + RFALSE(tb->item_pos - n + tb->rnum[0], + "PAP-12165: directory item must be first " + "item of node when pasting is in 0th position"); + + /* update delimiting keys */ + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0); + } + } + + if (is_indirect_le_ih(pasted)) + set_ih_free_space(pasted, 0); + tb->zeroes_num = tb->insert_size[0] = 0; +} +/* M_PASTE towards R[0]: plain shift when the pasted item stays in S[0], otherwise the split-item or the whole-item path */ +static void balance_leaf_paste_right(struct tree_balance *tb, + struct item_head * const ih, + const char * const body) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n = B_NR_ITEMS(tbS0); + + /* pasted item doesn't fall into R[0] */ + if (n - tb->rnum[0] > tb->item_pos) { + leaf_shift_right(tb, tb->rnum[0], tb->rbytes); + return; + } + + /* pasted item or part of it falls to R[0] */ + + if (tb->item_pos == n - tb->rnum[0] && tb->rbytes != -1) + /* we must shift the part of the appended item */ + balance_leaf_paste_right_shift(tb, ih, body); + else + /* pasted item in whole falls into R[0] */ + balance_leaf_paste_right_whole(tb, ih, body); +} + +/* shift rnum[0] items from S[0] to the right neighbor R[0]; nothing is done when rnum[0] <= 0 */ +static void balance_leaf_right(struct tree_balance *tb, + struct item_head * const ih, + const char * const body, int flag) +{ + if (tb->rnum[0] <= 0) + return; + + BUG_ON(flag != M_INSERT && flag != M_PASTE); + + if (flag == M_INSERT) + balance_leaf_insert_right(tb, ih, body); + else /* M_PASTE */ + balance_leaf_paste_right(tb, ih, body); +} +/* M_INSERT for new node S_new[i]: move items out of S[0] and insert the new item, or its last sbytes[i], into S_new[i] */ +static void balance_leaf_new_nodes_insert(struct tree_balance *tb, + struct item_head * const ih, + const char * const body, + struct item_head *insert_key, + struct buffer_head **insert_ptr, + int i) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n = B_NR_ITEMS(tbS0); + struct 
buffer_info bi; + int shift; + + /* new item or its part doesn't fall into S_new[i] */ + if (n - tb->snum[i] >= tb->item_pos) { + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, + tb->snum[i], tb->sbytes[i], tb->S_new[i]); + return; + } + + /* new item or its part falls to first new node S_new[i] */ + + /* part of new item falls into S_new[i] */ + if (tb->item_pos == n - tb->snum[i] + 1 && tb->sbytes[i] != -1) { + /* + * the key offset is loff_t wide, as in + * balance_leaf_insert_right(); an int would truncate + * large offsets before they are restored into ih + */ + loff_t old_key_comp; + int old_len, r_zeroes_number; + const char *r_body; + int version; + + /* Move snum[i]-1 items from S[0] to S_new[i] */ + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i] - 1, -1, + tb->S_new[i]); + + /* Remember key component and item length */ + version = ih_version(ih); + old_key_comp = le_ih_k_offset(ih); + old_len = ih_item_len(ih); + + /* + * Calculate key component and item length to insert + * into S_new[i] + */ + shift = 0; + if (is_indirect_le_ih(ih)) + shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT; + set_le_ih_k_offset(ih, + le_ih_k_offset(ih) + + ((old_len - tb->sbytes[i]) << shift)); + + put_ih_item_len(ih, tb->sbytes[i]); + + /* Insert part of the item into S_new[i] before 0-th item */ + buffer_info_init_bh(tb, &bi, tb->S_new[i]); + + if ((old_len - tb->sbytes[i]) > tb->zeroes_num) { + r_zeroes_number = 0; + r_body = body + (old_len - tb->sbytes[i]) - + tb->zeroes_num; + } else { + r_body = body; + r_zeroes_number = tb->zeroes_num - (old_len - + tb->sbytes[i]); + tb->zeroes_num -= r_zeroes_number; + } + + leaf_insert_into_buf(&bi, 0, ih, r_body, r_zeroes_number); + + /* + * Restore the saved key component and set the item length + * of the part that is still to be inserted into S[0] + */ + set_le_ih_k_offset(ih, old_key_comp); + put_ih_item_len(ih, old_len - tb->sbytes[i]); + tb->insert_size[0] -= tb->sbytes[i]; + } else { + /* whole new item falls into S_new[i] */ + + /* + * Shift snum[i] - 1 items to S_new[i] + * (sbytes[i] of split item) + */ + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, + tb->snum[i] - 1, tb->sbytes[i], tb->S_new[i]); + + /* Insert new item into S_new[i] 
*/ + buffer_info_init_bh(tb, &bi, tb->S_new[i]); + leaf_insert_into_buf(&bi, tb->item_pos - n + tb->snum[i] - 1, + ih, body, tb->zeroes_num); + + tb->zeroes_num = tb->insert_size[0] = 0; + } +} + +/* + * we append to directory item: move entries to S_new[i] and, when the + * new entry lands in S_new[i], paste it there and clear insert_size[0] + */ +static void balance_leaf_new_nodes_paste_dirent(struct tree_balance *tb, + struct item_head * const ih, + const char * const body, + struct item_head *insert_key, + struct buffer_head **insert_ptr, + int i) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + struct item_head *aux_ih = item_head(tbS0, tb->item_pos); + int entry_count = ih_entry_count(aux_ih); + struct buffer_info bi; + + if (entry_count - tb->sbytes[i] < tb->pos_in_item && + tb->pos_in_item <= entry_count) { + /* new directory entry falls into S_new[i] */ + + RFALSE(!tb->insert_size[0], + "PAP-12215: insert_size is already 0"); + RFALSE(tb->sbytes[i] - 1 >= entry_count, + "PAP-12220: there are no so much entries (%d), only %d", + tb->sbytes[i] - 1, entry_count); + + /* + * Shift snum[i]-1 items in whole. 
+ * Shift sbytes[i] directory entries + * from directory item number snum[i] + */ + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i], + tb->sbytes[i] - 1, tb->S_new[i]); + + /* + * Paste given directory entry to + * directory item + */ + buffer_info_init_bh(tb, &bi, tb->S_new[i]); + leaf_paste_in_buffer(&bi, 0, tb->pos_in_item - entry_count + + tb->sbytes[i] - 1, tb->insert_size[0], + body, tb->zeroes_num); + + /* paste new directory entry */ + leaf_paste_entries(&bi, 0, tb->pos_in_item - entry_count + + tb->sbytes[i] - 1, 1, + (struct reiserfs_de_head *) body, + body + DEH_SIZE, tb->insert_size[0]); + + tb->insert_size[0] = 0; + tb->pos_in_item++; + } else { + /* new directory entry doesn't fall into S_new[i] */ + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i], + tb->sbytes[i], tb->S_new[i]); + } + +} +/* paste when the pasted item is the one split between S[0] and S_new[i]; directory items go to the _dirent helper */ +static void balance_leaf_new_nodes_paste_shift(struct tree_balance *tb, + struct item_head * const ih, + const char * const body, + struct item_head *insert_key, + struct buffer_head **insert_ptr, + int i) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + struct item_head *aux_ih = item_head(tbS0, tb->item_pos); + int n_shift, n_rem, r_zeroes_number, shift; + const char *r_body; + struct item_head *tmp; + struct buffer_info bi; + + RFALSE(ih, "PAP-12210: ih must be 0"); + + if (is_direntry_le_ih(aux_ih)) { + balance_leaf_new_nodes_paste_dirent(tb, ih, body, insert_key, + insert_ptr, i); + return; + } + + /* regular object */ + + + RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)) || + tb->insert_size[0] <= 0, + "PAP-12225: item too short or insert_size <= 0"); + + /* + * Calculate number of bytes which must be shifted from appended item + */ + n_shift = tb->sbytes[i] - tb->insert_size[0]; + if (n_shift < 0) + n_shift = 0; + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i], n_shift, + tb->S_new[i]); + + /* + * Calculate number of bytes which must remain in body after + * append to S_new[i] + */ + n_rem = 
tb->insert_size[0] - tb->sbytes[i]; + if (n_rem < 0) + n_rem = 0; + + /* Append part of body into S_new[i] */ + buffer_info_init_bh(tb, &bi, tb->S_new[i]); + if (n_rem > tb->zeroes_num) { + r_zeroes_number = 0; + r_body = body + n_rem - tb->zeroes_num; + } else { + r_body = body; + r_zeroes_number = tb->zeroes_num - n_rem; + tb->zeroes_num -= r_zeroes_number; + } + + leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0] - n_rem, + r_body, r_zeroes_number); + /* advance the key of the first item of S_new[i] by n_rem, scaled for an indirect item */ + tmp = item_head(tb->S_new[i], 0); + shift = 0; + if (is_indirect_le_ih(tmp)) { + set_ih_free_space(tmp, 0); + shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT; + } + add_le_ih_k_offset(tmp, n_rem << shift); + + tb->insert_size[0] = n_rem; + if (!n_rem) + tb->pos_in_item++; +} +/* pasted item falls wholly into S_new[i]: move items there, paste, then clear insert_size[0] and zeroes_num */ +static void balance_leaf_new_nodes_paste_whole(struct tree_balance *tb, + struct item_head * const ih, + const char * const body, + struct item_head *insert_key, + struct buffer_head **insert_ptr, + int i) + +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n = B_NR_ITEMS(tbS0); + int leaf_mi; + struct item_head *pasted; + struct buffer_info bi; + +#ifdef CONFIG_REISERFS_CHECK + struct item_head *ih_check = item_head(tbS0, tb->item_pos); + + if (!is_direntry_le_ih(ih_check) && + (tb->pos_in_item != ih_item_len(ih_check) || + tb->insert_size[0] <= 0)) + reiserfs_panic(tb->tb_sb, + "PAP-12235", + "pos_in_item must be equal to ih_item_len"); +#endif + + leaf_mi = leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i], + tb->sbytes[i], tb->S_new[i]); + + RFALSE(leaf_mi, + "PAP-12240: unexpected value returned by leaf_move_items (%d)", + leaf_mi); + + /* paste into item */ + buffer_info_init_bh(tb, &bi, tb->S_new[i]); + leaf_paste_in_buffer(&bi, tb->item_pos - n + tb->snum[i], + tb->pos_in_item, tb->insert_size[0], + body, tb->zeroes_num); + + pasted = item_head(tb->S_new[i], tb->item_pos - n + + tb->snum[i]); + if (is_direntry_le_ih(pasted)) + leaf_paste_entries(&bi, tb->item_pos - n + tb->snum[i], + 
tb->pos_in_item, 1, + (struct reiserfs_de_head *)body, + body + DEH_SIZE, tb->insert_size[0]); + + /* if we paste to indirect item update ih_free_space */ + if (is_indirect_le_ih(pasted)) + set_ih_free_space(pasted, 0); + + tb->zeroes_num = tb->insert_size[0] = 0; + +} +static void balance_leaf_new_nodes_paste(struct tree_balance *tb, + struct item_head * const ih, + const char * const body, + struct item_head *insert_key, + struct buffer_head **insert_ptr, + int i) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n = B_NR_ITEMS(tbS0); + + /* pasted item doesn't fall into S_new[i] */ + if (n - tb->snum[i] > tb->item_pos) { + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, + tb->snum[i], tb->sbytes[i], tb->S_new[i]); + return; + } + + /* pasted item or part of it falls into S_new[i] */ + + if (tb->item_pos == n - tb->snum[i] && tb->sbytes[i] != -1) + /* we must shift part of the appended item */ + balance_leaf_new_nodes_paste_shift(tb, ih, body, insert_key, + insert_ptr, i); + else + /* item falls wholly into S_new[i] */ + balance_leaf_new_nodes_paste_whole(tb, ih, body, insert_key, + insert_ptr, i); +} + +/* Fill new nodes that appear in place of S[0] */ +static void balance_leaf_new_nodes(struct tree_balance *tb, + struct item_head * const ih, + const char * const body, + struct item_head *insert_key, + struct buffer_head **insert_ptr, + int flag) +{ + int i; + for (i = tb->blknum[0] - 2; i >= 0; i--) { + BUG_ON(flag != M_INSERT && flag != M_PASTE); + + RFALSE(!tb->snum[i], + "PAP-12200: snum[%d] == %d.
Must be > 0", i, + tb->snum[i]); + + /* here we shift from S to S_new nodes */ + + tb->S_new[i] = get_FEB(tb); + + /* initialized block type and tree level */ + set_blkh_level(B_BLK_HEAD(tb->S_new[i]), DISK_LEAF_NODE_LEVEL); + + if (flag == M_INSERT) + balance_leaf_new_nodes_insert(tb, ih, body, insert_key, + insert_ptr, i); + else /* M_PASTE */ + balance_leaf_new_nodes_paste(tb, ih, body, insert_key, + insert_ptr, i); + + memcpy(insert_key + i, leaf_key(tb->S_new[i], 0), KEY_SIZE); + insert_ptr[i] = tb->S_new[i]; + + RFALSE(!buffer_journaled(tb->S_new[i]) + || buffer_journal_dirty(tb->S_new[i]) + || buffer_dirty(tb->S_new[i]), + "PAP-12247: S_new[%d] : (%b)", + i, tb->S_new[i]); + } +} + +static void balance_leaf_finish_node_insert(struct tree_balance *tb, + struct item_head * const ih, + const char * const body) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + struct buffer_info bi; + buffer_info_init_tbS0(tb, &bi); + leaf_insert_into_buf(&bi, tb->item_pos, ih, body, tb->zeroes_num); + + /* If we insert the first key change the delimiting key */ + if (tb->item_pos == 0) { + if (tb->CFL[0]) /* can be 0 in reiserfsck */ + replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0); + + } +} + +static void balance_leaf_finish_node_paste_dirent(struct tree_balance *tb, + struct item_head * const ih, + const char * const body) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + struct item_head *pasted = item_head(tbS0, tb->item_pos); + struct buffer_info bi; + + if (tb->pos_in_item >= 0 && tb->pos_in_item <= ih_entry_count(pasted)) { + RFALSE(!tb->insert_size[0], + "PAP-12260: insert_size is 0 already"); + + /* prepare space */ + buffer_info_init_tbS0(tb, &bi); + leaf_paste_in_buffer(&bi, tb->item_pos, tb->pos_in_item, + tb->insert_size[0], body, tb->zeroes_num); + + /* paste entry */ + leaf_paste_entries(&bi, tb->item_pos, tb->pos_in_item, 1, + (struct reiserfs_de_head *)body, + body + DEH_SIZE, tb->insert_size[0]); + + if (!tb->item_pos && 
!tb->pos_in_item) { + RFALSE(!tb->CFL[0] || !tb->L[0], + "PAP-12270: CFL[0]/L[0] must be specified"); + if (tb->CFL[0]) + replace_key(tb, tb->CFL[0], tb->lkey[0], + tbS0, 0); + } + + tb->insert_size[0] = 0; + } +} + +static void balance_leaf_finish_node_paste(struct tree_balance *tb, + struct item_head * const ih, + const char * const body) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + struct buffer_info bi; + struct item_head *pasted = item_head(tbS0, tb->item_pos); + + /* when directory, may be new entry already pasted */ + if (is_direntry_le_ih(pasted)) { + balance_leaf_finish_node_paste_dirent(tb, ih, body); + return; + } + + /* regular object */ + + if (tb->pos_in_item == ih_item_len(pasted)) { + RFALSE(tb->insert_size[0] <= 0, + "PAP-12275: insert size must not be %d", + tb->insert_size[0]); + buffer_info_init_tbS0(tb, &bi); + leaf_paste_in_buffer(&bi, tb->item_pos, + tb->pos_in_item, tb->insert_size[0], body, + tb->zeroes_num); + + if (is_indirect_le_ih(pasted)) + set_ih_free_space(pasted, 0); + + tb->insert_size[0] = 0; + } +#ifdef CONFIG_REISERFS_CHECK + else if (tb->insert_size[0]) { + print_cur_tb("12285"); + reiserfs_panic(tb->tb_sb, "PAP-12285", + "insert_size must be 0 (%d)", tb->insert_size[0]); + } +#endif +} + +/* + * if the affected item was not wholly shifted then we + * perform all necessary operations on that part or whole + * of the affected item which remains in S + */ +static void balance_leaf_finish_node(struct tree_balance *tb, + struct item_head * const ih, + const char * const body, int flag) +{ + /* if we must insert or append into buffer S[0] */ + if (0 <= tb->item_pos && tb->item_pos < tb->s0num) { + if (flag == M_INSERT) + balance_leaf_finish_node_insert(tb, ih, body); + else /* M_PASTE */ + balance_leaf_finish_node_paste(tb, ih, body); + } +} + +/** + * balance_leaf - reiserfs tree balancing algorithm + * @tb: tree balance state + * @ih: item header of inserted item (little endian) + * @body: body of inserted 
item or bytes to paste + * @flag: i - insert, d - delete, c - cut, p - paste (see do_balance) + * passed back: + * @insert_key: key to insert new nodes + * @insert_ptr: array of nodes to insert at the next level + * + * In our processing of one level we sometimes determine what must be + * inserted into the next higher level. This insertion consists of a + * key or two keys and their corresponding pointers. + */ +static int balance_leaf(struct tree_balance *tb, struct item_head *ih, + const char *body, int flag, + struct item_head *insert_key, + struct buffer_head **insert_ptr) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + + PROC_INFO_INC(tb->tb_sb, balance_at[0]); + + /* Make balance in case insert_size[0] < 0 */ + if (tb->insert_size[0] < 0) + return balance_leaf_when_delete(tb, flag); + + tb->item_pos = PATH_LAST_POSITION(tb->tb_path), + tb->pos_in_item = tb->tb_path->pos_in_item, + tb->zeroes_num = 0; + if (flag == M_INSERT && !body) + tb->zeroes_num = ih_item_len(ih); + + /* + * for indirect item pos_in_item is measured in unformatted node + * pointers. Recalculate to bytes + */ + if (flag != M_INSERT + && is_indirect_le_ih(item_head(tbS0, tb->item_pos))) + tb->pos_in_item *= UNFM_P_SIZE; + + body += balance_leaf_left(tb, ih, body, flag); + + /* tb->lnum[0] > 0 */ + /* Calculate new item position */ + tb->item_pos -= (tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0)); + + balance_leaf_right(tb, ih, body, flag); + + /* tb->rnum[0] > 0 */ + RFALSE(tb->blknum[0] > 3, + "PAP-12180: blknum can not be %d. It must be <= 3", tb->blknum[0]); + RFALSE(tb->blknum[0] < 0, + "PAP-12185: blknum can not be %d. 
It must be >= 0", tb->blknum[0]); + + /* + * if while adding to a node we discover that it is possible to split + * it in two, and merge the left part into the left neighbor and the + * right part into the right neighbor, eliminating the node + */ + if (tb->blknum[0] == 0) { /* node S[0] is empty now */ + + RFALSE(!tb->lnum[0] || !tb->rnum[0], + "PAP-12190: lnum and rnum must not be zero"); + /* + * if insertion was done before 0-th position in R[0], right + * delimiting key of the tb->L[0]'s and left delimiting key are + * not set correctly + */ + if (tb->CFL[0]) { + if (!tb->CFR[0]) + reiserfs_panic(tb->tb_sb, "vs-12195", + "CFR not initialized"); + copy_key(internal_key(tb->CFL[0], tb->lkey[0]), + internal_key(tb->CFR[0], tb->rkey[0])); + do_balance_mark_internal_dirty(tb, tb->CFL[0], 0); + } + + reiserfs_invalidate_buffer(tb, tbS0); + return 0; + } + + balance_leaf_new_nodes(tb, ih, body, insert_key, insert_ptr, flag); + + balance_leaf_finish_node(tb, ih, body, flag); + +#ifdef CONFIG_REISERFS_CHECK + if (flag == M_PASTE && tb->insert_size[0]) { + print_cur_tb("12290"); + reiserfs_panic(tb->tb_sb, + "PAP-12290", "insert_size is still not 0 (%d)", + tb->insert_size[0]); + } +#endif + + /* Leaf level of the tree is balanced (end of balance_leaf) */ + return 0; +} + +/* Make empty node */ +void make_empty_node(struct buffer_info *bi) +{ + struct block_head *blkh; + + RFALSE(bi->bi_bh == NULL, "PAP-12295: pointer to the buffer is NULL"); + + blkh = B_BLK_HEAD(bi->bi_bh); + set_blkh_nr_item(blkh, 0); + set_blkh_free_space(blkh, MAX_CHILD_SIZE(bi->bi_bh)); + + if (bi->bi_parent) + B_N_CHILD(bi->bi_parent, bi->bi_position)->dc_size = 0; /* Endian safe if 0 */ +} + +/* Get first empty buffer */ +struct buffer_head *get_FEB(struct tree_balance *tb) +{ + int i; + struct buffer_info bi; + + for (i = 0; i < MAX_FEB_SIZE; i++) + if (tb->FEB[i] != NULL) + break; + + if (i == MAX_FEB_SIZE) + reiserfs_panic(tb->tb_sb, "vs-12300", "FEB list is empty"); + + 
buffer_info_init_bh(tb, &bi, tb->FEB[i]); + make_empty_node(&bi); + set_buffer_uptodate(tb->FEB[i]); + tb->used[i] = tb->FEB[i]; + tb->FEB[i] = NULL; + + return tb->used[i]; +} + +/* Queue bh for free_thrown(): reiserfs_free_block has to be able to schedule. */ +static void store_thrown(struct tree_balance *tb, struct buffer_head *bh) +{ + int i; + + if (buffer_dirty(bh)) + reiserfs_warning(tb->tb_sb, "reiserfs-12320", + "called with dirty buffer"); + for (i = 0; i < ARRAY_SIZE(tb->thrown); i++) + if (!tb->thrown[i]) { + tb->thrown[i] = bh; + get_bh(bh); /* free_thrown puts this */ + return; + } + reiserfs_warning(tb->tb_sb, "reiserfs-12321", + "too many thrown buffers"); +} +/* Release each buffer queued by store_thrown() and free its block */ +static void free_thrown(struct tree_balance *tb) +{ + int i; + b_blocknr_t blocknr; + for (i = 0; i < ARRAY_SIZE(tb->thrown); i++) { + if (tb->thrown[i]) { + blocknr = tb->thrown[i]->b_blocknr; + if (buffer_dirty(tb->thrown[i])) + reiserfs_warning(tb->tb_sb, "reiserfs-12322", + "called with dirty buffer %d", + blocknr); + brelse(tb->thrown[i]); /* incremented in store_thrown */ + reiserfs_free_block(tb->transaction_handle, NULL, + blocknr, 0); + } + } +} +/* Mark bh as a free, empty node and queue its block to be freed */ +void reiserfs_invalidate_buffer(struct tree_balance *tb, struct buffer_head *bh) +{ + struct block_head *blkh; + blkh = B_BLK_HEAD(bh); + set_blkh_level(blkh, FREE_LEVEL); + set_blkh_nr_item(blkh, 0); + + clear_buffer_dirty(bh); + store_thrown(tb, bh); +} + +/* Replace n_dest'th key in buffer dest by n_src'th key of buffer src. */ +void replace_key(struct tree_balance *tb, struct buffer_head *dest, int n_dest, + struct buffer_head *src, int n_src) +{ + + RFALSE(dest == NULL || src == NULL, + "vs-12305: source or destination buffer is 0 (src=%p, dest=%p)", + src, dest); + RFALSE(!B_IS_KEYS_LEVEL(dest), + "vs-12310: invalid level (%z) for destination buffer.
dest must be leaf", + dest); + RFALSE(n_dest < 0 || n_src < 0, + "vs-12315: src(%d) or dest(%d) key number < 0", n_src, n_dest); + RFALSE(n_dest >= B_NR_ITEMS(dest) || n_src >= B_NR_ITEMS(src), + "vs-12320: src(%d(%d)) or dest(%d(%d)) key number is too big", + n_src, B_NR_ITEMS(src), n_dest, B_NR_ITEMS(dest)); + + if (B_IS_ITEMS_LEVEL(src)) + /* source buffer contains leaf node */ + memcpy(internal_key(dest, n_dest), item_head(src, n_src), + KEY_SIZE); + else + memcpy(internal_key(dest, n_dest), internal_key(src, n_src), + KEY_SIZE); + + do_balance_mark_internal_dirty(tb, dest, 0); +} + +int get_left_neighbor_position(struct tree_balance *tb, int h) +{ + int Sh_position = PATH_H_POSITION(tb->tb_path, h + 1); + + RFALSE(PATH_H_PPARENT(tb->tb_path, h) == NULL || tb->FL[h] == NULL, + "vs-12325: FL[%d](%p) or F[%d](%p) does not exist", + h, tb->FL[h], h, PATH_H_PPARENT(tb->tb_path, h)); + + if (Sh_position == 0) + return B_NR_ITEMS(tb->FL[h]); + else + return Sh_position - 1; +} + +int get_right_neighbor_position(struct tree_balance *tb, int h) +{ + int Sh_position = PATH_H_POSITION(tb->tb_path, h + 1); + + RFALSE(PATH_H_PPARENT(tb->tb_path, h) == NULL || tb->FR[h] == NULL, + "vs-12330: F[%d](%p) or FR[%d](%p) does not exist", + h, PATH_H_PPARENT(tb->tb_path, h), h, tb->FR[h]); + + if (Sh_position == B_NR_ITEMS(PATH_H_PPARENT(tb->tb_path, h))) + return 0; + else + return Sh_position + 1; +} + +#ifdef CONFIG_REISERFS_CHECK + +int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value); +static void check_internal_node(struct super_block *s, struct buffer_head *bh, + char *mes) +{ + struct disk_child *dc; + int i; + + RFALSE(!bh, "PAP-12336: bh == 0"); + + if (!bh || !B_IS_IN_TREE(bh)) + return; + + RFALSE(!buffer_dirty(bh) && + !(buffer_journaled(bh) || buffer_journal_dirty(bh)), + "PAP-12337: buffer (%b) must be dirty", bh); + dc = B_N_CHILD(bh, 0); + + for (i = 0; i <= B_NR_ITEMS(bh); i++, dc++) { + if (!is_reusable(s, dc_block_number(dc), 1)) { + 
print_cur_tb(mes); + reiserfs_panic(s, "PAP-12338", + "invalid child pointer %y in %b", + dc, bh); + } + } +} + +static int locked_or_not_in_tree(struct tree_balance *tb, + struct buffer_head *bh, char *which) +{ + if ((!buffer_journal_prepared(bh) && buffer_locked(bh)) || + !B_IS_IN_TREE(bh)) { + reiserfs_warning(tb->tb_sb, "vs-12339", "%s (%b)", which, bh); + return 1; + } + return 0; +} + +static int check_before_balancing(struct tree_balance *tb) +{ + int retval = 0; + + if (REISERFS_SB(tb->tb_sb)->cur_tb) { + reiserfs_panic(tb->tb_sb, "vs-12335", "suspect that schedule " + "occurred based on cur_tb not being null at " + "this point in code. do_balance cannot properly " + "handle concurrent tree accesses on a same " + "mount point."); + } + + /* + * double check that buffers that we will modify are unlocked. + * (fix_nodes should already have prepped all of these for us). + */ + if (tb->lnum[0]) { + retval |= locked_or_not_in_tree(tb, tb->L[0], "L[0]"); + retval |= locked_or_not_in_tree(tb, tb->FL[0], "FL[0]"); + retval |= locked_or_not_in_tree(tb, tb->CFL[0], "CFL[0]"); + check_leaf(tb->L[0]); + } + if (tb->rnum[0]) { + retval |= locked_or_not_in_tree(tb, tb->R[0], "R[0]"); + retval |= locked_or_not_in_tree(tb, tb->FR[0], "FR[0]"); + retval |= locked_or_not_in_tree(tb, tb->CFR[0], "CFR[0]"); + check_leaf(tb->R[0]); + } + retval |= locked_or_not_in_tree(tb, PATH_PLAST_BUFFER(tb->tb_path), + "S[0]"); + check_leaf(PATH_PLAST_BUFFER(tb->tb_path)); + + return retval; +} + +static void check_after_balance_leaf(struct tree_balance *tb) +{ + if (tb->lnum[0]) { + if (B_FREE_SPACE(tb->L[0]) != + MAX_CHILD_SIZE(tb->L[0]) - + dc_size(B_N_CHILD + (tb->FL[0], get_left_neighbor_position(tb, 0)))) { + print_cur_tb("12221"); + reiserfs_panic(tb->tb_sb, "PAP-12355", + "shift to left was incorrect"); + } + } + if (tb->rnum[0]) { + if (B_FREE_SPACE(tb->R[0]) != + MAX_CHILD_SIZE(tb->R[0]) - + dc_size(B_N_CHILD + (tb->FR[0], get_right_neighbor_position(tb, 0)))) { + 
print_cur_tb("12222"); + reiserfs_panic(tb->tb_sb, "PAP-12360", + "shift to right was incorrect"); + } + } + if (PATH_H_PBUFFER(tb->tb_path, 1) && + (B_FREE_SPACE(PATH_H_PBUFFER(tb->tb_path, 0)) != + (MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)) - + dc_size(B_N_CHILD(PATH_H_PBUFFER(tb->tb_path, 1), + PATH_H_POSITION(tb->tb_path, 1)))))) { + int left = B_FREE_SPACE(PATH_H_PBUFFER(tb->tb_path, 0)); + int right = (MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)) - + dc_size(B_N_CHILD(PATH_H_PBUFFER(tb->tb_path, 1), + PATH_H_POSITION(tb->tb_path, + 1)))); + print_cur_tb("12223"); + reiserfs_warning(tb->tb_sb, "reiserfs-12363", + "B_FREE_SPACE (PATH_H_PBUFFER(tb->tb_path,0)) = %d; " + "MAX_CHILD_SIZE (%d) - dc_size( %y, %d ) [%d] = %d", + left, + MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)), + PATH_H_PBUFFER(tb->tb_path, 1), + PATH_H_POSITION(tb->tb_path, 1), + dc_size(B_N_CHILD + (PATH_H_PBUFFER(tb->tb_path, 1), + PATH_H_POSITION(tb->tb_path, 1))), + right); + reiserfs_panic(tb->tb_sb, "PAP-12365", "S is incorrect"); + } +} + +static void check_leaf_level(struct tree_balance *tb) +{ + check_leaf(tb->L[0]); + check_leaf(tb->R[0]); + check_leaf(PATH_PLAST_BUFFER(tb->tb_path)); +} + +static void check_internal_levels(struct tree_balance *tb) +{ + int h; + + /* check all internal nodes */ + for (h = 1; tb->insert_size[h]; h++) { + check_internal_node(tb->tb_sb, PATH_H_PBUFFER(tb->tb_path, h), + "BAD BUFFER ON PATH"); + if (tb->lnum[h]) + check_internal_node(tb->tb_sb, tb->L[h], "BAD L"); + if (tb->rnum[h]) + check_internal_node(tb->tb_sb, tb->R[h], "BAD R"); + } + +} + +#endif + +/* + * Now we have all of the buffers that must be used in balancing of + * the tree. We rely on the assumption that schedule() will not occur + * while do_balance works. ( Only interrupt handlers are acceptable.) + * We balance the tree according to the analysis made before this, + * using buffers already obtained. For SMP support it will someday be + * necessary to add ordered locking of tb. 
+ */ + +/* + * Some interesting rules of balancing: + * we delete a maximum of two nodes per level per balancing: we never + * delete R, when we delete two of three nodes L, S, R then we move + * them into R. + * + * we only delete L if we are deleting two nodes, if we delete only + * one node we delete S + * + * if we shift leaves then we shift as much as we can: this is a + * deliberate policy of extremism in node packing which results in + * higher average utilization after repeated random balance operations + * at the cost of more memory copies and more balancing as a result of + * small insertions to full nodes. + * + * if we shift internal nodes we try to evenly balance the node + * utilization, with consequent less balancing at the cost of lower + * utilization. + * + * one could argue that the policy for directories in leaves should be + * that of internal nodes, but we will wait until another day to + * evaluate this.... It would be nice to someday measure and prove + * these assumptions as to what is optimal.... + */ + +static inline void do_balance_starts(struct tree_balance *tb) +{ + /* use print_cur_tb() to see initial state of struct tree_balance */ + + /* store_print_tb (tb); */ + + /* do not delete, just comment it out */ + /* + print_tb(flag, PATH_LAST_POSITION(tb->tb_path), + tb->tb_path->pos_in_item, tb, "check"); + */ + RFALSE(check_before_balancing(tb), "PAP-12340: locked buffers in TB"); +#ifdef CONFIG_REISERFS_CHECK + REISERFS_SB(tb->tb_sb)->cur_tb = tb; +#endif +} + +static inline void do_balance_completed(struct tree_balance *tb) +{ + +#ifdef CONFIG_REISERFS_CHECK + check_leaf_level(tb); + check_internal_levels(tb); + REISERFS_SB(tb->tb_sb)->cur_tb = NULL; +#endif + + /* + * reiserfs_free_block is no longer schedule safe. 
So, we need to + * put the buffers we want freed on the thrown list during do_balance, + * and then free them now + */ + + REISERFS_SB(tb->tb_sb)->s_do_balance++; + + /* release all nodes held to perform the balancing */ + unfix_nodes(tb); + + free_thrown(tb); +} + +/* + * do_balance - balance the tree + * + * @tb: tree_balance structure + * @ih: item header of inserted item + * @body: body of inserted item or bytes to paste + * @flag: 'i' - insert, 'd' - delete, 'c' - cut, 'p' - paste + * + * Cut means delete part of an item (includes removing an entry from a + * directory). + * + * Delete means delete whole item. + * + * Insert means add a new item into the tree. + * + * Paste means to append to the end of an existing file or to + * insert a directory entry. + */ +void do_balance(struct tree_balance *tb, struct item_head *ih, + const char *body, int flag) +{ + int child_pos; /* position of a child node in its parent */ + int h; /* level of the tree being processed */ + + /* + * in our processing of one level we sometimes determine what + * must be inserted into the next higher level. This insertion + * consists of a key or two keys and their corresponding + * pointers + */ + struct item_head insert_key[2]; + + /* inserted node-ptrs for the next level */ + struct buffer_head *insert_ptr[2]; + + tb->tb_mode = flag; + tb->need_balance_dirty = 0; + + if (FILESYSTEM_CHANGED_TB(tb)) { + reiserfs_panic(tb->tb_sb, "clm-6000", "fs generation has " + "changed"); + } + /* if we have no real work to do */ + if (!tb->insert_size[0]) { + reiserfs_warning(tb->tb_sb, "PAP-12350", + "insert_size == 0, mode == %c", flag); + unfix_nodes(tb); + return; + } + + atomic_inc(&fs_generation(tb->tb_sb)); + do_balance_starts(tb); + + /* + * balance_leaf returns 0 except if combining L R and S into + * one node. see balance_internal() for explanation of this + * line of code.
+ */ + child_pos = PATH_H_B_ITEM_ORDER(tb->tb_path, 0) + + balance_leaf(tb, ih, body, flag, insert_key, insert_ptr); + +#ifdef CONFIG_REISERFS_CHECK + check_after_balance_leaf(tb); +#endif + + /* Balance internal level of the tree. */ + for (h = 1; h < MAX_HEIGHT && tb->insert_size[h]; h++) + child_pos = balance_internal(tb, h, child_pos, insert_key, + insert_ptr); + + do_balance_completed(tb); +} diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c new file mode 100644 index 000000000..96a1bcf33 --- /dev/null +++ b/fs/reiserfs/file.c @@ -0,0 +1,270 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include +#include "reiserfs.h" +#include "acl.h" +#include "xattr.h" +#include +#include +#include +#include +#include +#include +#include + +/* + * We pack the tails of files on file close, not at the time they are written. + * This implies an unnecessary copy of the tail and an unnecessary indirect item + * insertion/balancing, for files that are written in one write. + * It avoids unnecessary tail packings (balances) for files that are written in + * multiple writes and are small enough to have tails. + * + * file_release is called by the VFS layer when the file is closed. If + * this is the last open file descriptor, and the file + * small enough to have a tail, and the tail is currently in an + * unformatted node, the tail is converted back into a direct item. + * + * We use reiserfs_truncate_file to pack the tail, since it already has + * all the conditions coded. 
+ */ +static int reiserfs_file_release(struct inode *inode, struct file *filp) +{ + + struct reiserfs_transaction_handle th; + int err; + int jbegin_failure = 0; + + BUG_ON(!S_ISREG(inode->i_mode)); + + if (atomic_add_unless(&REISERFS_I(inode)->openers, -1, 1)) + return 0; + + mutex_lock(&REISERFS_I(inode)->tailpack); + + if (!atomic_dec_and_test(&REISERFS_I(inode)->openers)) { + mutex_unlock(&REISERFS_I(inode)->tailpack); + return 0; + } + + /* fast out for when nothing needs to be done */ + if ((!(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) || + !tail_has_to_be_packed(inode)) && + REISERFS_I(inode)->i_prealloc_count <= 0) { + mutex_unlock(&REISERFS_I(inode)->tailpack); + return 0; + } + + reiserfs_write_lock(inode->i_sb); + /* + * freeing preallocation only involves relogging blocks that + * are already in the current transaction. preallocation gets + * freed at the end of each transaction, so it is impossible for + * us to log any additional blocks (including quota blocks) + */ + err = journal_begin(&th, inode->i_sb, 1); + if (err) { + /* + * uh oh, we can't allow the inode to go away while there + * are still preallocation blocks pending. Try to join the + * aborted transaction + */ + jbegin_failure = err; + err = journal_join_abort(&th, inode->i_sb); + + if (err) { + /* + * hmpf, our choices here aren't good. We can pin + * the inode which will disallow unmount from ever + * happening, we can do nothing, which will corrupt + * random memory on unmount, or we can forcibly + * remove the file from the preallocation list, which + * will leak blocks on disk. Lets pin the inode + * and let the admin know what is going on.
+ */ + igrab(inode); + reiserfs_warning(inode->i_sb, "clm-9001", + "pinning inode %lu because the " + "preallocation can't be freed", + inode->i_ino); + goto out; + } + } + reiserfs_update_inode_transaction(inode); + +#ifdef REISERFS_PREALLOCATE + reiserfs_discard_prealloc(&th, inode); +#endif + err = journal_end(&th); + + /* copy back the error code from journal_begin */ + if (!err) + err = jbegin_failure; + + if (!err && + (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) && + tail_has_to_be_packed(inode)) { + + /* + * if regular file is released by last holder and it has been + * appended (we append by unformatted node only) or its direct + * item(s) had to be converted, then it may have to be + * indirect2direct converted + */ + err = reiserfs_truncate_file(inode, 0); + } +out: + reiserfs_write_unlock(inode->i_sb); + mutex_unlock(&REISERFS_I(inode)->tailpack); + return err; +} + +static int reiserfs_file_open(struct inode *inode, struct file *file) +{ + int err = dquot_file_open(inode, file); + + /* somebody might be tailpacking on final close; wait for it */ + if (!atomic_inc_not_zero(&REISERFS_I(inode)->openers)) { + mutex_lock(&REISERFS_I(inode)->tailpack); + atomic_inc(&REISERFS_I(inode)->openers); + mutex_unlock(&REISERFS_I(inode)->tailpack); + } + return err; +} + +void reiserfs_vfs_truncate_file(struct inode *inode) +{ + mutex_lock(&REISERFS_I(inode)->tailpack); + reiserfs_truncate_file(inode, 1); + mutex_unlock(&REISERFS_I(inode)->tailpack); +} + +/* Sync a reiserfs file. */ + +/* + * FIXME: sync_mapping_buffers() never has anything to sync. Can + * be removed... 
+ */ + +static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end, + int datasync) +{ + struct inode *inode = filp->f_mapping->host; + int err; + int barrier_done; + + err = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (err) + return err; + + mutex_lock(&inode->i_mutex); + BUG_ON(!S_ISREG(inode->i_mode)); + err = sync_mapping_buffers(inode->i_mapping); + reiserfs_write_lock(inode->i_sb); + barrier_done = reiserfs_commit_for_inode(inode); + reiserfs_write_unlock(inode->i_sb); + if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + mutex_unlock(&inode->i_mutex); + if (barrier_done < 0) + return barrier_done; + return (err < 0) ? -EIO : 0; +} + +/* taken from fs/buffer.c:__block_commit_write */ +int reiserfs_commit_page(struct inode *inode, struct page *page, + unsigned from, unsigned to) +{ + unsigned block_start, block_end; + int partial = 0; + unsigned blocksize; + struct buffer_head *bh, *head; + unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT; + int new; + int logit = reiserfs_file_data_log(inode); + struct super_block *s = inode->i_sb; + int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize; + struct reiserfs_transaction_handle th; + int ret = 0; + + th.t_trans_id = 0; + blocksize = 1 << inode->i_blkbits; + + if (logit) { + reiserfs_write_lock(s); + ret = journal_begin(&th, s, bh_per_page + 1); + if (ret) + goto drop_write_lock; + reiserfs_update_inode_transaction(inode); + } + for (bh = head = page_buffers(page), block_start = 0; + bh != head || !block_start; + block_start = block_end, bh = bh->b_this_page) { + + new = buffer_new(bh); + clear_buffer_new(bh); + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (!buffer_uptodate(bh)) + partial = 1; + } else { + set_buffer_uptodate(bh); + if (logit) { + reiserfs_prepare_for_journal(s, bh, 1); + journal_mark_dirty(&th, bh); + } else if (!buffer_dirty(bh)) { +
mark_buffer_dirty(bh); + /* + * do data=ordered on any page past the end + * of file and any buffer marked BH_New. + */ + if (reiserfs_data_ordered(inode->i_sb) && + (new || page->index >= i_size_index)) { + reiserfs_add_ordered_list(inode, bh); + } + } + } + } + if (logit) { + ret = journal_end(&th); +drop_write_lock: + reiserfs_write_unlock(s); + } + /* + * If this is a partial write which happened to make all buffers + * uptodate then we can optimize away a bogus readpage() for + * the next read(). Here we 'discover' whether the page went + * uptodate as a result of this (potentially partial) write. + */ + if (!partial) + SetPageUptodate(page); + return ret; +} + +const struct file_operations reiserfs_file_operations = { + .unlocked_ioctl = reiserfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = reiserfs_compat_ioctl, +#endif + .mmap = generic_file_mmap, + .open = reiserfs_file_open, + .release = reiserfs_file_release, + .fsync = reiserfs_sync_file, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, + .llseek = generic_file_llseek, +}; + +const struct inode_operations reiserfs_file_inode_operations = { + .setattr = reiserfs_setattr, + .setxattr = reiserfs_setxattr, + .getxattr = reiserfs_getxattr, + .listxattr = reiserfs_listxattr, + .removexattr = reiserfs_removexattr, + .permission = reiserfs_permission, + .get_acl = reiserfs_get_acl, + .set_acl = reiserfs_set_acl, +}; diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c new file mode 100644 index 000000000..6b0ddb2a9 --- /dev/null +++ b/fs/reiserfs/fix_node.c @@ -0,0 +1,2825 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include +#include +#include +#include "reiserfs.h" +#include + +/* + * To make any changes in the tree we find a node that contains item + * to be changed/deleted or position in the node we insert a new item + * to. 
We call this node S. To do balancing we need to decide what we + * will shift to left/right neighbor, or to a new node, where new item + * will be etc. To make this analysis simpler we build virtual + * node. Virtual node is an array of items, that will replace items of + * node S. (For instance if we are going to delete an item, virtual + * node does not contain it). Virtual node keeps information about + * item sizes and types, mergeability of first and last items, sizes + * of all entries in directory item. We use this array of items when + * calculating what we can shift to neighbors and how many nodes we + * have to have if we do not any shiftings, if we shift to left/right + * neighbor or to both. + */ + +/* + * Takes item number in virtual node, returns number of item + * that it has in source buffer + */ +static inline int old_item_num(int new_num, int affected_item_num, int mode) +{ + if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num) + return new_num; + + if (mode == M_INSERT) { + + RFALSE(new_num == 0, + "vs-8005: for INSERT mode and item number of inserted item"); + + return new_num - 1; + } + + RFALSE(mode != M_DELETE, + "vs-8010: old_item_num: mode must be M_DELETE (mode = \'%c\'", + mode); + /* delete mode */ + return new_num + 1; +} +/* Fill in tb->tb_vn, the virtual node for the node on level h */ +static void create_virtual_node(struct tree_balance *tb, int h) +{ + struct item_head *ih; + struct virtual_node *vn = tb->tb_vn; + int new_num; + struct buffer_head *Sh; /* this comes from tb->S[h] */ + + Sh = PATH_H_PBUFFER(tb->tb_path, h); + + /* size of changed node */ + vn->vn_size = + MAX_CHILD_SIZE(Sh) - B_FREE_SPACE(Sh) + tb->insert_size[h]; + + /* for internal nodes the array of virtual items is not created */ + if (h) { + vn->vn_nr_item = (vn->vn_size - DC_SIZE) / (DC_SIZE + KEY_SIZE); + return; + } + + /* number of items in virtual node */ + vn->vn_nr_item = + B_NR_ITEMS(Sh) + ((vn->vn_mode == M_INSERT) ? 1 : 0) - + ((vn->vn_mode == M_DELETE) ?
1 : 0); + + /* first virtual item */ + vn->vn_vi = (struct virtual_item *)(tb->tb_vn + 1); + memset(vn->vn_vi, 0, vn->vn_nr_item * sizeof(struct virtual_item)); + vn->vn_free_ptr += vn->vn_nr_item * sizeof(struct virtual_item); + + /* first item in the node */ + ih = item_head(Sh, 0); + + /* define the mergeability for 0-th item (if it is not being deleted) */ + if (op_is_left_mergeable(&ih->ih_key, Sh->b_size) + && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num)) + vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE; + + /* + * go through all items that remain in the virtual + * node (except for the new (inserted) one) + */ + for (new_num = 0; new_num < vn->vn_nr_item; new_num++) { + int j; + struct virtual_item *vi = vn->vn_vi + new_num; + int is_affected = + ((new_num != vn->vn_affected_item_num) ? 0 : 1); + + if (is_affected && vn->vn_mode == M_INSERT) + continue; + + /* get item number in source node */ + j = old_item_num(new_num, vn->vn_affected_item_num, + vn->vn_mode); + + vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE; + vi->vi_ih = ih + j; + vi->vi_item = ih_item_body(Sh, ih + j); + vi->vi_uarea = vn->vn_free_ptr; + + /* + * FIXME: there is no check that item operation did not + * consume too much memory + */ + vn->vn_free_ptr += + op_create_vi(vn, vi, is_affected, tb->insert_size[0]); + if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr) + reiserfs_panic(tb->tb_sb, "vs-8030", + "virtual node space consumed"); + + if (!is_affected) + /* this is not being changed */ + continue; + + if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) { + vn->vn_vi[new_num].vi_item_len += tb->insert_size[0]; + /* pointer to data which is going to be pasted */ + vi->vi_new_data = vn->vn_data; + } + } + + /* virtual inserted item is not defined yet */ + if (vn->vn_mode == M_INSERT) { + struct virtual_item *vi = vn->vn_vi + vn->vn_affected_item_num; + + RFALSE(vn->vn_ins_ih == NULL, + "vs-8040: item header of inserted item is not specified"); + vi->vi_item_len = 
tb->insert_size[0]; + vi->vi_ih = vn->vn_ins_ih; + vi->vi_item = vn->vn_data; + vi->vi_uarea = vn->vn_free_ptr; + + op_create_vi(vn, vi, 0 /*not pasted or cut */ , + tb->insert_size[0]); + } + + /* + * set right merge flag we take right delimiting key and + * check whether it is a mergeable item + */ + if (tb->CFR[0]) { + struct reiserfs_key *key; + + key = internal_key(tb->CFR[0], tb->rkey[0]); + if (op_is_left_mergeable(key, Sh->b_size) + && (vn->vn_mode != M_DELETE + || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1)) + vn->vn_vi[vn->vn_nr_item - 1].vi_type |= + VI_TYPE_RIGHT_MERGEABLE; + +#ifdef CONFIG_REISERFS_CHECK + if (op_is_left_mergeable(key, Sh->b_size) && + !(vn->vn_mode != M_DELETE + || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1)) { + /* + * we delete last item and it could be merged + * with right neighbor's first item + */ + if (! + (B_NR_ITEMS(Sh) == 1 + && is_direntry_le_ih(item_head(Sh, 0)) + && ih_entry_count(item_head(Sh, 0)) == 1)) { + /* + * node contains more than 1 item, or item + * is not directory item, or this item + * contains more than 1 entry + */ + print_block(Sh, 0, -1, -1); + reiserfs_panic(tb->tb_sb, "vs-8045", + "rdkey %k, affected item==%d " + "(mode==%c) Must be %c", + key, vn->vn_affected_item_num, + vn->vn_mode, M_DELETE); + } + } +#endif + + } +} + +/* + * Using virtual node check, how many items can be + * shifted to left neighbor + */ +static void check_left(struct tree_balance *tb, int h, int cur_free) +{ + int i; + struct virtual_node *vn = tb->tb_vn; + struct virtual_item *vi; + int d_size, ih_size; + + RFALSE(cur_free < 0, "vs-8050: cur_free (%d) < 0", cur_free); + + /* internal level */ + if (h > 0) { + tb->lnum[h] = cur_free / (DC_SIZE + KEY_SIZE); + return; + } + + /* leaf level */ + + if (!cur_free || !vn->vn_nr_item) { + /* no free space or nothing to move */ + tb->lnum[h] = 0; + tb->lbytes = -1; + return; + } + + RFALSE(!PATH_H_PPARENT(tb->tb_path, 0), + "vs-8055: parent does not exist or invalid"); + + vi = 
vn->vn_vi; + if ((unsigned int)cur_free >= + (vn->vn_size - + ((vi->vi_type & VI_TYPE_LEFT_MERGEABLE) ? IH_SIZE : 0))) { + /* all contents of S[0] fits into L[0] */ + + RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE, + "vs-8055: invalid mode or balance condition failed"); + + tb->lnum[0] = vn->vn_nr_item; + tb->lbytes = -1; + return; + } + + d_size = 0, ih_size = IH_SIZE; + + /* first item may be merge with last item in left neighbor */ + if (vi->vi_type & VI_TYPE_LEFT_MERGEABLE) + d_size = -((int)IH_SIZE), ih_size = 0; + + tb->lnum[0] = 0; + for (i = 0; i < vn->vn_nr_item; + i++, ih_size = IH_SIZE, d_size = 0, vi++) { + d_size += vi->vi_item_len; + if (cur_free >= d_size) { + /* the item can be shifted entirely */ + cur_free -= d_size; + tb->lnum[0]++; + continue; + } + + /* the item cannot be shifted entirely, try to split it */ + /* + * check whether L[0] can hold ih and at least one byte + * of the item body + */ + + /* cannot shift even a part of the current item */ + if (cur_free <= ih_size) { + tb->lbytes = -1; + return; + } + cur_free -= ih_size; + + tb->lbytes = op_check_left(vi, cur_free, 0, 0); + if (tb->lbytes != -1) + /* count partially shifted item */ + tb->lnum[0]++; + + break; + } + + return; +} + +/* + * Using virtual node check, how many items can be + * shifted to right neighbor + */ +static void check_right(struct tree_balance *tb, int h, int cur_free) +{ + int i; + struct virtual_node *vn = tb->tb_vn; + struct virtual_item *vi; + int d_size, ih_size; + + RFALSE(cur_free < 0, "vs-8070: cur_free < 0"); + + /* internal level */ + if (h > 0) { + tb->rnum[h] = cur_free / (DC_SIZE + KEY_SIZE); + return; + } + + /* leaf level */ + + if (!cur_free || !vn->vn_nr_item) { + /* no free space */ + tb->rnum[h] = 0; + tb->rbytes = -1; + return; + } + + RFALSE(!PATH_H_PPARENT(tb->tb_path, 0), + "vs-8075: parent does not exist or invalid"); + + vi = vn->vn_vi + vn->vn_nr_item - 1; + if ((unsigned int)cur_free >= + (vn->vn_size - + ((vi->vi_type & 
VI_TYPE_RIGHT_MERGEABLE) ? IH_SIZE : 0))) { + /* all contents of S[0] fits into R[0] */ + + RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE, + "vs-8080: invalid mode or balance condition failed"); + + tb->rnum[h] = vn->vn_nr_item; + tb->rbytes = -1; + return; + } + + d_size = 0, ih_size = IH_SIZE; + + /* last item may be merge with first item in right neighbor */ + if (vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) + d_size = -(int)IH_SIZE, ih_size = 0; + + tb->rnum[0] = 0; + for (i = vn->vn_nr_item - 1; i >= 0; + i--, d_size = 0, ih_size = IH_SIZE, vi--) { + d_size += vi->vi_item_len; + if (cur_free >= d_size) { + /* the item can be shifted entirely */ + cur_free -= d_size; + tb->rnum[0]++; + continue; + } + + /* + * check whether R[0] can hold ih and at least one + * byte of the item body + */ + + /* cannot shift even a part of the current item */ + if (cur_free <= ih_size) { + tb->rbytes = -1; + return; + } + + /* + * R[0] can hold the header of the item and at least + * one byte of its body + */ + cur_free -= ih_size; /* cur_free is still > 0 */ + + tb->rbytes = op_check_right(vi, cur_free); + if (tb->rbytes != -1) + /* count partially shifted item */ + tb->rnum[0]++; + + break; + } + + return; +} + +/* + * from - number of items, which are shifted to left neighbor entirely + * to - number of item, which are shifted to right neighbor entirely + * from_bytes - number of bytes of boundary item (or directory entries) + * which are shifted to left neighbor + * to_bytes - number of bytes of boundary item (or directory entries) + * which are shifted to right neighbor + */ +static int get_num_ver(int mode, struct tree_balance *tb, int h, + int from, int from_bytes, + int to, int to_bytes, short *snum012, int flow) +{ + int i; + int cur_free; + int units; + struct virtual_node *vn = tb->tb_vn; + int total_node_size, max_node_size, current_item_size; + int needed_nodes; + + /* position of item we start filling node from */ + int start_item; + + /* position of item we 
finish filling node by */ + int end_item; + + /* + * number of first bytes (entries for directory) of start_item-th item + * we do not include into node that is being filled + */ + int start_bytes; + + /* + * number of last bytes (entries for directory) of end_item-th item + * we do node include into node that is being filled + */ + int end_bytes; + + /* + * these are positions in virtual item of items, that are split + * between S[0] and S1new and S1new and S2new + */ + int split_item_positions[2]; + + split_item_positions[0] = -1; + split_item_positions[1] = -1; + + /* + * We only create additional nodes if we are in insert or paste mode + * or we are in replace mode at the internal level. If h is 0 and + * the mode is M_REPLACE then in fix_nodes we change the mode to + * paste or insert before we get here in the code. + */ + RFALSE(tb->insert_size[h] < 0 || (mode != M_INSERT && mode != M_PASTE), + "vs-8100: insert_size < 0 in overflow"); + + max_node_size = MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, h)); + + /* + * snum012 [0-2] - number of items, that lay + * to S[0], first new node and second new node + */ + snum012[3] = -1; /* s1bytes */ + snum012[4] = -1; /* s2bytes */ + + /* internal level */ + if (h > 0) { + i = ((to - from) * (KEY_SIZE + DC_SIZE) + DC_SIZE); + if (i == max_node_size) + return 1; + return (i / max_node_size + 1); + } + + /* leaf level */ + needed_nodes = 1; + total_node_size = 0; + cur_free = max_node_size; + + /* start from 'from'-th item */ + start_item = from; + /* skip its first 'start_bytes' units */ + start_bytes = ((from_bytes != -1) ? from_bytes : 0); + + /* last included item is the 'end_item'-th one */ + end_item = vn->vn_nr_item - to - 1; + /* do not count last 'end_bytes' units of 'end_item'-th item */ + end_bytes = (to_bytes != -1) ? to_bytes : 0; + + /* + * go through all item beginning from the start_item-th item + * and ending by the end_item-th item. 
Do not count first + * 'start_bytes' units of 'start_item'-th item and last + * 'end_bytes' of 'end_item'-th item + */ + for (i = start_item; i <= end_item; i++) { + struct virtual_item *vi = vn->vn_vi + i; + int skip_from_end = ((i == end_item) ? end_bytes : 0); + + RFALSE(needed_nodes > 3, "vs-8105: too many nodes are needed"); + + /* get size of current item */ + current_item_size = vi->vi_item_len; + + /* + * do not take in calculation head part (from_bytes) + * of from-th item + */ + current_item_size -= + op_part_size(vi, 0 /*from start */ , start_bytes); + + /* do not take in calculation tail part of last item */ + current_item_size -= + op_part_size(vi, 1 /*from end */ , skip_from_end); + + /* if item fits into current node entierly */ + if (total_node_size + current_item_size <= max_node_size) { + snum012[needed_nodes - 1]++; + total_node_size += current_item_size; + start_bytes = 0; + continue; + } + + /* + * virtual item length is longer, than max size of item in + * a node. It is impossible for direct item + */ + if (current_item_size > max_node_size) { + RFALSE(is_direct_le_ih(vi->vi_ih), + "vs-8110: " + "direct item length is %d. 
It can not be longer than %d", + current_item_size, max_node_size); + /* we will try to split it */ + flow = 1; + } + + /* as we do not split items, take new node and continue */ + if (!flow) { + needed_nodes++; + i--; + total_node_size = 0; + continue; + } + + /* + * calculate number of item units which fit into node being + * filled + */ + { + int free_space; + + free_space = max_node_size - total_node_size - IH_SIZE; + units = + op_check_left(vi, free_space, start_bytes, + skip_from_end); + /* + * nothing fits into current node, take new + * node and continue + */ + if (units == -1) { + needed_nodes++, i--, total_node_size = 0; + continue; + } + } + + /* something fits into the current node */ + start_bytes += units; + snum012[needed_nodes - 1 + 3] = units; + + if (needed_nodes > 2) + reiserfs_warning(tb->tb_sb, "vs-8111", + "split_item_position is out of range"); + snum012[needed_nodes - 1]++; + split_item_positions[needed_nodes - 1] = i; + needed_nodes++; + /* continue from the same item with start_bytes != -1 */ + start_item = i; + i--; + total_node_size = 0; + } + + /* + * sum012[4] (if it is not -1) contains number of units of which + * are to be in S1new, snum012[3] - to be in S0. They are supposed + * to be S1bytes and S2bytes correspondingly, so recalculate + */ + if (snum012[4] > 0) { + int split_item_num; + int bytes_to_r, bytes_to_l; + int bytes_to_S1new; + + split_item_num = split_item_positions[1]; + bytes_to_l = + ((from == split_item_num + && from_bytes != -1) ? from_bytes : 0); + bytes_to_r = + ((end_item == split_item_num + && end_bytes != -1) ? end_bytes : 0); + bytes_to_S1new = + ((split_item_positions[0] == + split_item_positions[1]) ? 
snum012[3] : 0); + + /* s2bytes */ + snum012[4] = + op_unit_num(&vn->vn_vi[split_item_num]) - snum012[4] - + bytes_to_r - bytes_to_l - bytes_to_S1new; + + if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY && + vn->vn_vi[split_item_num].vi_index != TYPE_INDIRECT) + reiserfs_warning(tb->tb_sb, "vs-8115", + "not directory or indirect item"); + } + + /* now we know S2bytes, calculate S1bytes */ + if (snum012[3] > 0) { + int split_item_num; + int bytes_to_r, bytes_to_l; + int bytes_to_S2new; + + split_item_num = split_item_positions[0]; + bytes_to_l = + ((from == split_item_num + && from_bytes != -1) ? from_bytes : 0); + bytes_to_r = + ((end_item == split_item_num + && end_bytes != -1) ? end_bytes : 0); + bytes_to_S2new = + ((split_item_positions[0] == split_item_positions[1] + && snum012[4] != -1) ? snum012[4] : 0); + + /* s1bytes */ + snum012[3] = + op_unit_num(&vn->vn_vi[split_item_num]) - snum012[3] - + bytes_to_r - bytes_to_l - bytes_to_S2new; + } + + return needed_nodes; +} + + +/* + * Set parameters for balancing. + * Performs write of results of analysis of balancing into structure tb, + * where it will later be used by the functions that actually do the balancing. + * Parameters: + * tb tree_balance structure; + * h current level of the node; + * lnum number of items from S[h] that must be shifted to L[h]; + * rnum number of items from S[h] that must be shifted to R[h]; + * blk_num number of blocks that S[h] will be splitted into; + * s012 number of items that fall into splitted nodes. 
+ * lbytes number of bytes which flow to the left neighbor from the + * item that is not not shifted entirely + * rbytes number of bytes which flow to the right neighbor from the + * item that is not not shifted entirely + * s1bytes number of bytes which flow to the first new node when + * S[0] splits (this number is contained in s012 array) + */ + +static void set_parameters(struct tree_balance *tb, int h, int lnum, + int rnum, int blk_num, short *s012, int lb, int rb) +{ + + tb->lnum[h] = lnum; + tb->rnum[h] = rnum; + tb->blknum[h] = blk_num; + + /* only for leaf level */ + if (h == 0) { + if (s012 != NULL) { + tb->s0num = *s012++; + tb->snum[0] = *s012++; + tb->snum[1] = *s012++; + tb->sbytes[0] = *s012++; + tb->sbytes[1] = *s012; + } + tb->lbytes = lb; + tb->rbytes = rb; + } + PROC_INFO_ADD(tb->tb_sb, lnum[h], lnum); + PROC_INFO_ADD(tb->tb_sb, rnum[h], rnum); + + PROC_INFO_ADD(tb->tb_sb, lbytes[h], lb); + PROC_INFO_ADD(tb->tb_sb, rbytes[h], rb); +} + +/* + * check if node disappears if we shift tb->lnum[0] items to left + * neighbor and tb->rnum[0] to the right one. + */ +static int is_leaf_removable(struct tree_balance *tb) +{ + struct virtual_node *vn = tb->tb_vn; + int to_left, to_right; + int size; + int remain_items; + + /* + * number of items that will be shifted to left (right) neighbor + * entirely + */ + to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0); + to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 
1 : 0); + remain_items = vn->vn_nr_item; + + /* how many items remain in S[0] after shiftings to neighbors */ + remain_items -= (to_left + to_right); + + /* all content of node can be shifted to neighbors */ + if (remain_items < 1) { + set_parameters(tb, 0, to_left, vn->vn_nr_item - to_left, 0, + NULL, -1, -1); + return 1; + } + + /* S[0] is not removable */ + if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1) + return 0; + + /* check whether we can divide 1 remaining item between neighbors */ + + /* get size of remaining item (in item units) */ + size = op_unit_num(&vn->vn_vi[to_left]); + + if (tb->lbytes + tb->rbytes >= size) { + set_parameters(tb, 0, to_left + 1, to_right + 1, 0, NULL, + tb->lbytes, -1); + return 1; + } + + return 0; +} + +/* check whether L, S, R can be joined in one node */ +static int are_leaves_removable(struct tree_balance *tb, int lfree, int rfree) +{ + struct virtual_node *vn = tb->tb_vn; + int ih_size; + struct buffer_head *S0; + + S0 = PATH_H_PBUFFER(tb->tb_path, 0); + + ih_size = 0; + if (vn->vn_nr_item) { + if (vn->vn_vi[0].vi_type & VI_TYPE_LEFT_MERGEABLE) + ih_size += IH_SIZE; + + if (vn->vn_vi[vn->vn_nr_item - 1]. + vi_type & VI_TYPE_RIGHT_MERGEABLE) + ih_size += IH_SIZE; + } else { + /* there was only one item and it will be deleted */ + struct item_head *ih; + + RFALSE(B_NR_ITEMS(S0) != 1, + "vs-8125: item number must be 1: it is %d", + B_NR_ITEMS(S0)); + + ih = item_head(S0, 0); + if (tb->CFR[0] + && !comp_short_le_keys(&ih->ih_key, + internal_key(tb->CFR[0], + tb->rkey[0]))) + /* + * Directory must be in correct state here: that is + * somewhere at the left side should exist first + * directory item. But the item being deleted can + * not be that first one because its right neighbor + * is item of the same directory. (But first item + * always gets deleted in last turn). 
So, neighbors + * of deleted item can be merged, so we can save + * ih_size + */ + if (is_direntry_le_ih(ih)) { + ih_size = IH_SIZE; + + /* + * we might check that left neighbor exists + * and is of the same directory + */ + RFALSE(le_ih_k_offset(ih) == DOT_OFFSET, + "vs-8130: first directory item can not be removed until directory is not empty"); + } + + } + + if (MAX_CHILD_SIZE(S0) + vn->vn_size <= rfree + lfree + ih_size) { + set_parameters(tb, 0, -1, -1, -1, NULL, -1, -1); + PROC_INFO_INC(tb->tb_sb, leaves_removable); + return 1; + } + return 0; + +} + +/* when we do not split item, lnum and rnum are numbers of entire items */ +#define SET_PAR_SHIFT_LEFT \ +if (h)\ +{\ + int to_l;\ + \ + to_l = (MAX_NR_KEY(Sh)+1 - lpar + vn->vn_nr_item + 1) / 2 -\ + (MAX_NR_KEY(Sh) + 1 - lpar);\ + \ + set_parameters (tb, h, to_l, 0, lnver, NULL, -1, -1);\ +}\ +else \ +{\ + if (lset==LEFT_SHIFT_FLOW)\ + set_parameters (tb, h, lpar, 0, lnver, snum012+lset,\ + tb->lbytes, -1);\ + else\ + set_parameters (tb, h, lpar - (tb->lbytes!=-1), 0, lnver, snum012+lset,\ + -1, -1);\ +} + +#define SET_PAR_SHIFT_RIGHT \ +if (h)\ +{\ + int to_r;\ + \ + to_r = (MAX_NR_KEY(Sh)+1 - rpar + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - rpar);\ + \ + set_parameters (tb, h, 0, to_r, rnver, NULL, -1, -1);\ +}\ +else \ +{\ + if (rset==RIGHT_SHIFT_FLOW)\ + set_parameters (tb, h, 0, rpar, rnver, snum012+rset,\ + -1, tb->rbytes);\ + else\ + set_parameters (tb, h, 0, rpar - (tb->rbytes!=-1), rnver, snum012+rset,\ + -1, -1);\ +} + +static void free_buffers_in_tb(struct tree_balance *tb) +{ + int i; + + pathrelse(tb->tb_path); + + for (i = 0; i < MAX_HEIGHT; i++) { + brelse(tb->L[i]); + brelse(tb->R[i]); + brelse(tb->FL[i]); + brelse(tb->FR[i]); + brelse(tb->CFL[i]); + brelse(tb->CFR[i]); + + tb->L[i] = NULL; + tb->R[i] = NULL; + tb->FL[i] = NULL; + tb->FR[i] = NULL; + tb->CFL[i] = NULL; + tb->CFR[i] = NULL; + } +} + +/* + * Get new buffers for storing new nodes that are created while balancing. 
+ * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked; + * CARRY_ON - schedule didn't occur while the function worked; + * NO_DISK_SPACE - no disk space. + */ +/* The function is NOT SCHEDULE-SAFE! */ +static int get_empty_nodes(struct tree_balance *tb, int h) +{ + struct buffer_head *new_bh, *Sh = PATH_H_PBUFFER(tb->tb_path, h); + b_blocknr_t *blocknr, blocknrs[MAX_AMOUNT_NEEDED] = { 0, }; + int counter, number_of_freeblk; + int amount_needed; /* number of needed empty blocks */ + int retval = CARRY_ON; + struct super_block *sb = tb->tb_sb; + + /* + * number_of_freeblk is the number of empty blocks which have been + * acquired for use by the balancing algorithm minus the number of + * empty blocks used in the previous levels of the analysis, + * number_of_freeblk = tb->cur_blknum can be non-zero if a schedule + * occurs after empty blocks are acquired, and the balancing analysis + * is then restarted, amount_needed is the number needed by this + * level (h) of the balancing analysis. + * + * Note that for systems with many processes writing, it would be + * more layout optimal to calculate the total number needed by all + * levels and then to run reiserfs_new_blocks to get all of them at + * once. + */ + + /* + * Initiate number_of_freeblk to the amount acquired prior to the + * restart of the analysis or 0 if not restarted, then subtract the + * amount needed by all of the levels of the tree below h. + */ + /* blknum includes S[h], so we subtract 1 in this calculation */ + for (counter = 0, number_of_freeblk = tb->cur_blknum; + counter < h; counter++) + number_of_freeblk -= + (tb->blknum[counter]) ? (tb->blknum[counter] - + 1) : 0; + + /* Allocate missing empty blocks. */ + /* if Sh == 0 then we are getting a new root */ + amount_needed = (Sh) ? (tb->blknum[h] - 1) : 1; + /* + * Amount_needed = the amount that we need more than the + * amount that we have. 
+ */ + if (amount_needed > number_of_freeblk) + amount_needed -= number_of_freeblk; + else /* If we have enough already then there is nothing to do. */ + return CARRY_ON; + + /* + * No need to check quota - is not allocated for blocks used + * for formatted nodes + */ + if (reiserfs_new_form_blocknrs(tb, blocknrs, + amount_needed) == NO_DISK_SPACE) + return NO_DISK_SPACE; + + /* for each blocknumber we just got, get a buffer and stick it on FEB */ + for (blocknr = blocknrs, counter = 0; + counter < amount_needed; blocknr++, counter++) { + + RFALSE(!*blocknr, + "PAP-8135: reiserfs_new_blocknrs failed when got new blocks"); + + new_bh = sb_getblk(sb, *blocknr); + RFALSE(buffer_dirty(new_bh) || + buffer_journaled(new_bh) || + buffer_journal_dirty(new_bh), + "PAP-8140: journaled or dirty buffer %b for the new block", + new_bh); + + /* Put empty buffers into the array. */ + RFALSE(tb->FEB[tb->cur_blknum], + "PAP-8141: busy slot for new buffer"); + + set_buffer_journal_new(new_bh); + tb->FEB[tb->cur_blknum++] = new_bh; + } + + if (retval == CARRY_ON && FILESYSTEM_CHANGED_TB(tb)) + retval = REPEAT_SEARCH; + + return retval; +} + +/* + * Get free space of the left neighbor, which is stored in the parent + * node of the left neighbor. + */ +static int get_lfree(struct tree_balance *tb, int h) +{ + struct buffer_head *l, *f; + int order; + + if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL || + (l = tb->FL[h]) == NULL) + return 0; + + if (f == l) + order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) - 1; + else { + order = B_NR_ITEMS(l); + f = l; + } + + return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order))); +} + +/* + * Get free space of the right neighbor, + * which is stored in the parent node of the right neighbor. 
+ */ +static int get_rfree(struct tree_balance *tb, int h) +{ + struct buffer_head *r, *f; + int order; + + if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL || + (r = tb->FR[h]) == NULL) + return 0; + + if (f == r) + order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) + 1; + else { + order = 0; + f = r; + } + + return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order))); + +} + +/* Check whether left neighbor is in memory. */ +static int is_left_neighbor_in_cache(struct tree_balance *tb, int h) +{ + struct buffer_head *father, *left; + struct super_block *sb = tb->tb_sb; + b_blocknr_t left_neighbor_blocknr; + int left_neighbor_position; + + /* Father of the left neighbor does not exist. */ + if (!tb->FL[h]) + return 0; + + /* Calculate father of the node to be balanced. */ + father = PATH_H_PBUFFER(tb->tb_path, h + 1); + + RFALSE(!father || + !B_IS_IN_TREE(father) || + !B_IS_IN_TREE(tb->FL[h]) || + !buffer_uptodate(father) || + !buffer_uptodate(tb->FL[h]), + "vs-8165: F[h] (%b) or FL[h] (%b) is invalid", + father, tb->FL[h]); + + /* + * Get position of the pointer to the left neighbor + * into the left father. + */ + left_neighbor_position = (father == tb->FL[h]) ? + tb->lkey[h] : B_NR_ITEMS(tb->FL[h]); + /* Get left neighbor block number. */ + left_neighbor_blocknr = + B_N_CHILD_NUM(tb->FL[h], left_neighbor_position); + /* Look for the left neighbor in the cache. 
*/ + if ((left = sb_find_get_block(sb, left_neighbor_blocknr))) { + + RFALSE(buffer_uptodate(left) && !B_IS_IN_TREE(left), + "vs-8170: left neighbor (%b %z) is not in the tree", + left, left); + put_bh(left); + return 1; + } + + return 0; +} + +#define LEFT_PARENTS 'l' +#define RIGHT_PARENTS 'r' + +static void decrement_key(struct cpu_key *key) +{ + /* call item specific function for this key */ + item_ops[cpu_key_k_type(key)]->decrement_key(key); +} + +/* + * Calculate far left/right parent of the left/right neighbor of the + * current node, that is calculate the left/right (FL[h]/FR[h]) neighbor + * of the parent F[h]. + * Calculate left/right common parent of the current node and L[h]/R[h]. + * Calculate left/right delimiting key position. + * Returns: PATH_INCORRECT - path in the tree is not correct + * SCHEDULE_OCCURRED - schedule occurred while the function worked + * CARRY_ON - schedule didn't occur while the function + * worked + */ +static int get_far_parent(struct tree_balance *tb, + int h, + struct buffer_head **pfather, + struct buffer_head **pcom_father, char c_lr_par) +{ + struct buffer_head *parent; + INITIALIZE_PATH(s_path_to_neighbor_father); + struct treepath *path = tb->tb_path; + struct cpu_key s_lr_father_key; + int counter, + position = INT_MAX, + first_last_position = 0, + path_offset = PATH_H_PATH_OFFSET(path, h); + + /* + * Starting from F[h] go upwards in the tree, and look for the common + * ancestor of F[h], and its neighbor l/r, that should be obtained. + */ + + counter = path_offset; + + RFALSE(counter < FIRST_PATH_ELEMENT_OFFSET, + "PAP-8180: invalid path length"); + + for (; counter > FIRST_PATH_ELEMENT_OFFSET; counter--) { + /* + * Check whether parent of the current buffer in the path + * is really parent in the tree. + */ + if (!B_IS_IN_TREE + (parent = PATH_OFFSET_PBUFFER(path, counter - 1))) + return REPEAT_SEARCH; + + /* Check whether position in the parent is correct. 
*/ + if ((position = + PATH_OFFSET_POSITION(path, + counter - 1)) > + B_NR_ITEMS(parent)) + return REPEAT_SEARCH; + + /* + * Check whether parent at the path really points + * to the child. + */ + if (B_N_CHILD_NUM(parent, position) != + PATH_OFFSET_PBUFFER(path, counter)->b_blocknr) + return REPEAT_SEARCH; + + /* + * Return delimiting key if position in the parent is not + * equal to first/last one. + */ + if (c_lr_par == RIGHT_PARENTS) + first_last_position = B_NR_ITEMS(parent); + if (position != first_last_position) { + *pcom_father = parent; + get_bh(*pcom_father); + /*(*pcom_father = parent)->b_count++; */ + break; + } + } + + /* if we are in the root of the tree, then there is no common father */ + if (counter == FIRST_PATH_ELEMENT_OFFSET) { + /* + * Check whether first buffer in the path is the + * root of the tree. + */ + if (PATH_OFFSET_PBUFFER + (tb->tb_path, + FIRST_PATH_ELEMENT_OFFSET)->b_blocknr == + SB_ROOT_BLOCK(tb->tb_sb)) { + *pfather = *pcom_father = NULL; + return CARRY_ON; + } + return REPEAT_SEARCH; + } + + RFALSE(B_LEVEL(*pcom_father) <= DISK_LEAF_NODE_LEVEL, + "PAP-8185: (%b %z) level too small", + *pcom_father, *pcom_father); + + /* Check whether the common parent is locked. */ + + if (buffer_locked(*pcom_father)) { + + /* Release the write lock while the buffer is busy */ + int depth = reiserfs_write_unlock_nested(tb->tb_sb); + __wait_on_buffer(*pcom_father); + reiserfs_write_lock_nested(tb->tb_sb, depth); + if (FILESYSTEM_CHANGED_TB(tb)) { + brelse(*pcom_father); + return REPEAT_SEARCH; + } + } + + /* + * So, we got common parent of the current node and its + * left/right neighbor. Now we are getting the parent of the + * left/right neighbor. + */ + + /* Form key to get parent of the left/right neighbor. */ + le_key2cpu_key(&s_lr_father_key, + internal_key(*pcom_father, + (c_lr_par == + LEFT_PARENTS) ? 
(tb->lkey[h - 1] = + position - + 1) : (tb->rkey[h - + 1] = + position))); + + if (c_lr_par == LEFT_PARENTS) + decrement_key(&s_lr_father_key); + + if (search_by_key + (tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father, + h + 1) == IO_ERROR) + /* path is released */ + return IO_ERROR; + + if (FILESYSTEM_CHANGED_TB(tb)) { + pathrelse(&s_path_to_neighbor_father); + brelse(*pcom_father); + return REPEAT_SEARCH; + } + + *pfather = PATH_PLAST_BUFFER(&s_path_to_neighbor_father); + + RFALSE(B_LEVEL(*pfather) != h + 1, + "PAP-8190: (%b %z) level too small", *pfather, *pfather); + RFALSE(s_path_to_neighbor_father.path_length < + FIRST_PATH_ELEMENT_OFFSET, "PAP-8192: path length is too small"); + + s_path_to_neighbor_father.path_length--; + pathrelse(&s_path_to_neighbor_father); + return CARRY_ON; +} + +/* + * Get parents of neighbors of node in the path(S[path_offset]) and + * common parents of S[path_offset] and L[path_offset]/R[path_offset]: + * F[path_offset], FL[path_offset], FR[path_offset], CFL[path_offset], + * CFR[path_offset]. + * Calculate numbers of left and right delimiting keys position: + * lkey[path_offset], rkey[path_offset]. + * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked + * CARRY_ON - schedule didn't occur while the function worked + */ +static int get_parents(struct tree_balance *tb, int h) +{ + struct treepath *path = tb->tb_path; + int position, + ret, + path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h); + struct buffer_head *curf, *curcf; + + /* Current node is the root of the tree or will be root of the tree */ + if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) { + /* + * The root can not have parents. + * Release nodes which previously were obtained as + * parents of the current node neighbors. 
+ */ + brelse(tb->FL[h]); + brelse(tb->CFL[h]); + brelse(tb->FR[h]); + brelse(tb->CFR[h]); + tb->FL[h] = NULL; + tb->CFL[h] = NULL; + tb->FR[h] = NULL; + tb->CFR[h] = NULL; + return CARRY_ON; + } + + /* Get parent FL[path_offset] of L[path_offset]. */ + position = PATH_OFFSET_POSITION(path, path_offset - 1); + if (position) { + /* Current node is not the first child of its parent. */ + curf = PATH_OFFSET_PBUFFER(path, path_offset - 1); + curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1); + get_bh(curf); + get_bh(curf); + tb->lkey[h] = position - 1; + } else { + /* + * Calculate current parent of L[path_offset], which is the + * left neighbor of the current node. Calculate current + * common parent of L[path_offset] and the current node. + * Note that CFL[path_offset] not equal FL[path_offset] and + * CFL[path_offset] not equal F[path_offset]. + * Calculate lkey[path_offset]. + */ + if ((ret = get_far_parent(tb, h + 1, &curf, + &curcf, + LEFT_PARENTS)) != CARRY_ON) + return ret; + } + + brelse(tb->FL[h]); + tb->FL[h] = curf; /* New initialization of FL[h]. */ + brelse(tb->CFL[h]); + tb->CFL[h] = curcf; /* New initialization of CFL[h]. */ + + RFALSE((curf && !B_IS_IN_TREE(curf)) || + (curcf && !B_IS_IN_TREE(curcf)), + "PAP-8195: FL (%b) or CFL (%b) is invalid", curf, curcf); + + /* Get parent FR[h] of R[h]. */ + + /* Current node is the last child of F[h]. FR[h] != F[h]. */ + if (position == B_NR_ITEMS(PATH_H_PBUFFER(path, h + 1))) { + /* + * Calculate current parent of R[h], which is the right + * neighbor of F[h]. Calculate current common parent of + * R[h] and current node. Note that CFR[h] not equal + * FR[path_offset] and CFR[h] not equal F[h]. + */ + if ((ret = + get_far_parent(tb, h + 1, &curf, &curcf, + RIGHT_PARENTS)) != CARRY_ON) + return ret; + } else { + /* Current node is not the last child of its parent F[h]. 
*/ + curf = PATH_OFFSET_PBUFFER(path, path_offset - 1); + curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1); + get_bh(curf); + get_bh(curf); + tb->rkey[h] = position; + } + + brelse(tb->FR[h]); + /* New initialization of FR[path_offset]. */ + tb->FR[h] = curf; + + brelse(tb->CFR[h]); + /* New initialization of CFR[path_offset]. */ + tb->CFR[h] = curcf; + + RFALSE((curf && !B_IS_IN_TREE(curf)) || + (curcf && !B_IS_IN_TREE(curcf)), + "PAP-8205: FR (%b) or CFR (%b) is invalid", curf, curcf); + + return CARRY_ON; +} + +/* + * it is possible to remove node as result of shiftings to + * neighbors even when we insert or paste item. + */ +static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree, + struct tree_balance *tb, int h) +{ + struct buffer_head *Sh = PATH_H_PBUFFER(tb->tb_path, h); + int levbytes = tb->insert_size[h]; + struct item_head *ih; + struct reiserfs_key *r_key = NULL; + + ih = item_head(Sh, 0); + if (tb->CFR[h]) + r_key = internal_key(tb->CFR[h], tb->rkey[h]); + + if (lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes + /* shifting may merge items which might save space */ + - + ((!h + && op_is_left_mergeable(&ih->ih_key, Sh->b_size)) ? IH_SIZE : 0) + - + ((!h && r_key + && op_is_left_mergeable(r_key, Sh->b_size)) ? IH_SIZE : 0) + + ((h) ? KEY_SIZE : 0)) { + /* node can not be removed */ + if (sfree >= levbytes) { + /* new item fits into node S[h] without any shifting */ + if (!h) + tb->s0num = + B_NR_ITEMS(Sh) + + ((mode == M_INSERT) ? 1 : 0); + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; + } + } + PROC_INFO_INC(tb->tb_sb, can_node_be_removed[h]); + return !NO_BALANCING_NEEDED; +} + +/* + * Check whether current node S[h] is balanced when increasing its size by + * Inserting or Pasting. + * Calculate parameters for balancing for current level h. 
+ * Parameters: + * tb tree_balance structure; + * h current level of the node; + * inum item number in S[h]; + * mode i - insert, p - paste; + * Returns: 1 - schedule occurred; + * 0 - balancing for higher levels needed; + * -1 - no balancing for higher levels needed; + * -2 - no disk space. + */ +/* ip means Inserting or Pasting */ +static int ip_check_balance(struct tree_balance *tb, int h) +{ + struct virtual_node *vn = tb->tb_vn; + /* + * Number of bytes that must be inserted into (value is negative + * if bytes are deleted) buffer which contains node being balanced. + * The mnemonic is that the attempted change in node space used + * level is levbytes bytes. + */ + int levbytes; + int ret; + + int lfree, sfree, rfree /* free space in L, S and R */ ; + + /* + * nver is short for number of vertixes, and lnver is the number if + * we shift to the left, rnver is the number if we shift to the + * right, and lrnver is the number if we shift in both directions. + * The goal is to minimize first the number of vertixes, and second, + * the number of vertixes whose contents are changed by shifting, + * and third the number of uncached vertixes whose contents are + * changed by shifting and must be read from disk. + */ + int nver, lnver, rnver, lrnver; + + /* + * used at leaf level only, S0 = S[0] is the node being balanced, + * sInum [ I = 0,1,2 ] is the number of items that will + * remain in node SI after balancing. S1 and S2 are new + * nodes that might be created. + */ + + /* + * we perform 8 calls to get_num_ver(). For each call we + * calculate five parameters. 
where 4th parameter is s1bytes + * and 5th - s2bytes + * + * s0num, s1num, s2num for 8 cases + * 0,1 - do not shift and do not shift but bottle + * 2 - shift only whole item to left + * 3 - shift to left and bottle as much as possible + * 4,5 - shift to right (whole items and as much as possible + * 6,7 - shift to both directions (whole items and as much as possible) + */ + short snum012[40] = { 0, }; + + /* Sh is the node whose balance is currently being checked */ + struct buffer_head *Sh; + + Sh = PATH_H_PBUFFER(tb->tb_path, h); + levbytes = tb->insert_size[h]; + + /* Calculate balance parameters for creating new root. */ + if (!Sh) { + if (!h) + reiserfs_panic(tb->tb_sb, "vs-8210", + "S[0] can not be 0"); + switch (ret = get_empty_nodes(tb, h)) { + /* no balancing for higher levels needed */ + case CARRY_ON: + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; + + case NO_DISK_SPACE: + case REPEAT_SEARCH: + return ret; + default: + reiserfs_panic(tb->tb_sb, "vs-8215", "incorrect " + "return value of get_empty_nodes"); + } + } + + /* get parents of S[h] neighbors. 
*/ + ret = get_parents(tb, h); + if (ret != CARRY_ON) + return ret; + + sfree = B_FREE_SPACE(Sh); + + /* get free space of neighbors */ + rfree = get_rfree(tb, h); + lfree = get_lfree(tb, h); + + /* and new item fits into node S[h] without any shifting */ + if (can_node_be_removed(vn->vn_mode, lfree, sfree, rfree, tb, h) == + NO_BALANCING_NEEDED) + return NO_BALANCING_NEEDED; + + create_virtual_node(tb, h); + + /* + * determine maximal number of items we can shift to the left + * neighbor (in tb structure) and the maximal number of bytes + * that can flow to the left neighbor from the left most liquid + * item that cannot be shifted from S[0] entirely (returned value) + */ + check_left(tb, h, lfree); + + /* + * determine maximal number of items we can shift to the right + * neighbor (in tb structure) and the maximal number of bytes + * that can flow to the right neighbor from the right most liquid + * item that cannot be shifted from S[0] entirely (returned value) + */ + check_right(tb, h, rfree); + + /* + * all contents of internal node S[h] can be moved into its + * neighbors, S[h] will be removed after balancing + */ + if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) { + int to_r; + + /* + * Since we are working on internal nodes, and our internal + * nodes have fixed size entries, then we can balance by the + * number of items rather than the space they consume. In this + * routine we set the left node equal to the right node, + * allowing a difference of less than or equal to 1 child + * pointer. 
+ */ + to_r = + ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] + + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - + tb->rnum[h]); + set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, + -1, -1); + return CARRY_ON; + } + + /* + * this checks balance condition, that any two neighboring nodes + * can not fit in one node + */ + RFALSE(h && + (tb->lnum[h] >= vn->vn_nr_item + 1 || + tb->rnum[h] >= vn->vn_nr_item + 1), + "vs-8220: tree is not balanced on internal level"); + RFALSE(!h && ((tb->lnum[h] >= vn->vn_nr_item && (tb->lbytes == -1)) || + (tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1))), + "vs-8225: tree is not balanced on leaf level"); + + /* + * all contents of S[0] can be moved into its neighbors + * S[0] will be removed after balancing. + */ + if (!h && is_leaf_removable(tb)) + return CARRY_ON; + + /* + * why do we perform this check here rather than earlier?? + * Answer: we can win 1 node in some cases above. Moreover we + * checked it above, when we checked, that S[0] is not removable + * in principle + */ + + /* new item fits into node S[h] without any shifting */ + if (sfree >= levbytes) { + if (!h) + tb->s0num = vn->vn_nr_item; + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; + } + + { + int lpar, rpar, nset, lset, rset, lrset; + /* regular overflowing of the node */ + + /* + * get_num_ver works in 2 modes (FLOW & NO_FLOW) + * lpar, rpar - number of items we can shift to left/right + * neighbor (including splitting item) + * nset, lset, rset, lrset - shows, whether flowing items + * give better packing + */ +#define FLOW 1 +#define NO_FLOW 0 /* do not any splitting */ + + /* we choose one of the following */ +#define NOTHING_SHIFT_NO_FLOW 0 +#define NOTHING_SHIFT_FLOW 5 +#define LEFT_SHIFT_NO_FLOW 10 +#define LEFT_SHIFT_FLOW 15 +#define RIGHT_SHIFT_NO_FLOW 20 +#define RIGHT_SHIFT_FLOW 25 +#define LR_SHIFT_NO_FLOW 30 +#define LR_SHIFT_FLOW 35 + + lpar = tb->lnum[h]; + rpar = tb->rnum[h]; + + /* + * 
calculate number of blocks S[h] must be split into when + * nothing is shifted to the neighbors, as well as number of + * items in each part of the split node (s012 numbers), + * and number of bytes (s1bytes) of the shared drop which + * flow to S1 if any + */ + nset = NOTHING_SHIFT_NO_FLOW; + nver = get_num_ver(vn->vn_mode, tb, h, + 0, -1, h ? vn->vn_nr_item : 0, -1, + snum012, NO_FLOW); + + if (!h) { + int nver1; + + /* + * note, that in this case we try to bottle + * between S[0] and S1 (S1 - the first new node) + */ + nver1 = get_num_ver(vn->vn_mode, tb, h, + 0, -1, 0, -1, + snum012 + NOTHING_SHIFT_FLOW, FLOW); + if (nver > nver1) + nset = NOTHING_SHIFT_FLOW, nver = nver1; + } + + /* + * calculate number of blocks S[h] must be split into when + * l_shift_num first items and l_shift_bytes of the right + * most liquid item to be shifted are shifted to the left + * neighbor, as well as number of items in each part of the + * splitted node (s012 numbers), and number of bytes + * (s1bytes) of the shared drop which flow to S1 if any + */ + lset = LEFT_SHIFT_NO_FLOW; + lnver = get_num_ver(vn->vn_mode, tb, h, + lpar - ((h || tb->lbytes == -1) ? 0 : 1), + -1, h ? vn->vn_nr_item : 0, -1, + snum012 + LEFT_SHIFT_NO_FLOW, NO_FLOW); + if (!h) { + int lnver1; + + lnver1 = get_num_ver(vn->vn_mode, tb, h, + lpar - + ((tb->lbytes != -1) ? 1 : 0), + tb->lbytes, 0, -1, + snum012 + LEFT_SHIFT_FLOW, FLOW); + if (lnver > lnver1) + lset = LEFT_SHIFT_FLOW, lnver = lnver1; + } + + /* + * calculate number of blocks S[h] must be split into when + * r_shift_num first items and r_shift_bytes of the left most + * liquid item to be shifted are shifted to the right neighbor, + * as well as number of items in each part of the splitted + * node (s012 numbers), and number of bytes (s1bytes) of the + * shared drop which flow to S1 if any + */ + rset = RIGHT_SHIFT_NO_FLOW; + rnver = get_num_ver(vn->vn_mode, tb, h, + 0, -1, + h ? (vn->vn_nr_item - rpar) : (rpar - + ((tb-> + rbytes != + -1) ? 
1 : + 0)), -1, + snum012 + RIGHT_SHIFT_NO_FLOW, NO_FLOW); + if (!h) { + int rnver1; + + rnver1 = get_num_ver(vn->vn_mode, tb, h, + 0, -1, + (rpar - + ((tb->rbytes != -1) ? 1 : 0)), + tb->rbytes, + snum012 + RIGHT_SHIFT_FLOW, FLOW); + + if (rnver > rnver1) + rset = RIGHT_SHIFT_FLOW, rnver = rnver1; + } + + /* + * calculate number of blocks S[h] must be split into when + * items are shifted in both directions, as well as number + * of items in each part of the splitted node (s012 numbers), + * and number of bytes (s1bytes) of the shared drop which + * flow to S1 if any + */ + lrset = LR_SHIFT_NO_FLOW; + lrnver = get_num_ver(vn->vn_mode, tb, h, + lpar - ((h || tb->lbytes == -1) ? 0 : 1), + -1, + h ? (vn->vn_nr_item - rpar) : (rpar - + ((tb-> + rbytes != + -1) ? 1 : + 0)), -1, + snum012 + LR_SHIFT_NO_FLOW, NO_FLOW); + if (!h) { + int lrnver1; + + lrnver1 = get_num_ver(vn->vn_mode, tb, h, + lpar - + ((tb->lbytes != -1) ? 1 : 0), + tb->lbytes, + (rpar - + ((tb->rbytes != -1) ? 1 : 0)), + tb->rbytes, + snum012 + LR_SHIFT_FLOW, FLOW); + if (lrnver > lrnver1) + lrset = LR_SHIFT_FLOW, lrnver = lrnver1; + } + + /* + * Our general shifting strategy is: + * 1) to minimized number of new nodes; + * 2) to minimized number of neighbors involved in shifting; + * 3) to minimized number of disk reads; + */ + + /* we can win TWO or ONE nodes by shifting in both directions */ + if (lrnver < lnver && lrnver < rnver) { + RFALSE(h && + (tb->lnum[h] != 1 || + tb->rnum[h] != 1 || + lrnver != 1 || rnver != 2 || lnver != 2 + || h != 1), "vs-8230: bad h"); + if (lrset == LR_SHIFT_FLOW) + set_parameters(tb, h, tb->lnum[h], tb->rnum[h], + lrnver, snum012 + lrset, + tb->lbytes, tb->rbytes); + else + set_parameters(tb, h, + tb->lnum[h] - + ((tb->lbytes == -1) ? 0 : 1), + tb->rnum[h] - + ((tb->rbytes == -1) ? 
0 : 1),
+				       lrnver, snum012 + lrset, -1, -1);
+
+			return CARRY_ON;
+		}
+
+		/*
+		 * if shifting doesn't lead to better packing
+		 * then don't shift
+		 */
+		if (nver == lrnver) {
+			set_parameters(tb, h, 0, 0, nver, snum012 + nset, -1,
+				       -1);
+			return CARRY_ON;
+		}
+
+		/*
+		 * now we know that for better packing shifting in only one
+		 * direction either to the left or to the right is required
+		 */
+
+		/*
+		 * if shifting to the left is better than
+		 * shifting to the right
+		 */
+		if (lnver < rnver) {
+			SET_PAR_SHIFT_LEFT;
+			return CARRY_ON;
+		}
+
+		/*
+		 * if shifting to the right is better than
+		 * shifting to the left
+		 */
+		if (lnver > rnver) {
+			SET_PAR_SHIFT_RIGHT;
+			return CARRY_ON;
+		}
+
+		/*
+		 * now shifting in either direction gives the same number
+		 * of nodes and we can make use of the cached neighbors
+		 */
+		if (is_left_neighbor_in_cache(tb, h)) {
+			SET_PAR_SHIFT_LEFT;
+			return CARRY_ON;
+		}
+
+		/*
+		 * shift to the right regardless of whether the
+		 * right neighbor is in cache or not
+		 */
+		SET_PAR_SHIFT_RIGHT;
+		return CARRY_ON;
+	}
+}
+
+/*
+ * Check whether current node S[h] is balanced when Decreasing its size by
+ * Deleting or Cutting for INTERNAL node of S+tree.
+ * Calculate parameters for balancing for current level h.
+ * Parameters:
+ *	tb	tree_balance structure;
+ *	h	current level of the node.
+ * (The item number and the operation mode are not arguments here.)
+ *
+ * Order of the decisions taken in the body:
+ *  - S[h] has no parent in the path, i.e. it is the root: while it still
+ *    holds items nothing is balanced (NO_BALANCING_NEEDED); once it is
+ *    empty CARRY_ON is returned so that the root can be deleted;
+ *  - a result other than CARRY_ON from get_parents() is returned as is;
+ *  - S[h] keeps at least MIN_NR_KEY(Sh) items: it is moved into a neighbor,
+ *    or split between both, only if it fits there entirely; otherwise
+ *    NO_BALANCING_NEEDED;
+ *  - S[h] has fewer than MIN_NR_KEY(Sh) items: merge it with L[h] or R[h],
+ *    split it between both, or borrow items from one neighbor; the result
+ *    is CARRY_ON in each of these cases.
+ *
+ * Returns:	1 - schedule occurred;
+ *	        0 - balancing for higher levels needed;
+ *	       -1 - no balancing for higher levels needed;
+ *	       -2 - no disk space.
+ *
+ * Note: Items of internal nodes have fixed size, so the balance condition for
+ * the internal part of S+tree is as for the B-trees.
+ */
+static int dc_check_balance_internal(struct tree_balance *tb, int h)
+{
+	struct virtual_node *vn = tb->tb_vn;
+
+	/*
+	 * Sh is the node whose balance is currently being checked,
+	 * and Fh is its father.
+ */ + struct buffer_head *Sh, *Fh; + int maxsize, ret; + int lfree, rfree /* free space in L and R */ ; + + Sh = PATH_H_PBUFFER(tb->tb_path, h); + Fh = PATH_H_PPARENT(tb->tb_path, h); + + maxsize = MAX_CHILD_SIZE(Sh); + + /* + * using tb->insert_size[h], which is negative in this case, + * create_virtual_node calculates: + * new_nr_item = number of items node would have if operation is + * performed without balancing (new_nr_item); + */ + create_virtual_node(tb, h); + + if (!Fh) { /* S[h] is the root. */ + /* no balancing for higher levels needed */ + if (vn->vn_nr_item > 0) { + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; + } + /* + * new_nr_item == 0. + * Current root will be deleted resulting in + * decrementing the tree height. + */ + set_parameters(tb, h, 0, 0, 0, NULL, -1, -1); + return CARRY_ON; + } + + if ((ret = get_parents(tb, h)) != CARRY_ON) + return ret; + + /* get free space of neighbors */ + rfree = get_rfree(tb, h); + lfree = get_lfree(tb, h); + + /* determine maximal number of items we can fit into neighbors */ + check_left(tb, h, lfree); + check_right(tb, h, rfree); + + /* + * Balance condition for the internal node is valid. + * In this case we balance only if it leads to better packing. + */ + if (vn->vn_nr_item >= MIN_NR_KEY(Sh)) { + /* + * Here we join S[h] with one of its neighbors, + * which is impossible with greater values of new_nr_item. + */ + if (vn->vn_nr_item == MIN_NR_KEY(Sh)) { + /* All contents of S[h] can be moved to L[h]. */ + if (tb->lnum[h] >= vn->vn_nr_item + 1) { + int n; + int order_L; + + order_L = + ((n = + PATH_H_B_ITEM_ORDER(tb->tb_path, + h)) == + 0) ? B_NR_ITEMS(tb->FL[h]) : n - 1; + n = dc_size(B_N_CHILD(tb->FL[h], order_L)) / + (DC_SIZE + KEY_SIZE); + set_parameters(tb, h, -n - 1, 0, 0, NULL, -1, + -1); + return CARRY_ON; + } + + /* All contents of S[h] can be moved to R[h]. 
*/ + if (tb->rnum[h] >= vn->vn_nr_item + 1) { + int n; + int order_R; + + order_R = + ((n = + PATH_H_B_ITEM_ORDER(tb->tb_path, + h)) == + B_NR_ITEMS(Fh)) ? 0 : n + 1; + n = dc_size(B_N_CHILD(tb->FR[h], order_R)) / + (DC_SIZE + KEY_SIZE); + set_parameters(tb, h, 0, -n - 1, 0, NULL, -1, + -1); + return CARRY_ON; + } + } + + /* + * All contents of S[h] can be moved to the neighbors + * (L[h] & R[h]). + */ + if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) { + int to_r; + + to_r = + ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - + tb->rnum[h] + vn->vn_nr_item + 1) / 2 - + (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]); + set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, + 0, NULL, -1, -1); + return CARRY_ON; + } + + /* Balancing does not lead to better packing. */ + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; + } + + /* + * Current node contain insufficient number of items. + * Balancing is required. + */ + /* Check whether we can merge S[h] with left neighbor. */ + if (tb->lnum[h] >= vn->vn_nr_item + 1) + if (is_left_neighbor_in_cache(tb, h) + || tb->rnum[h] < vn->vn_nr_item + 1 || !tb->FR[h]) { + int n; + int order_L; + + order_L = + ((n = + PATH_H_B_ITEM_ORDER(tb->tb_path, + h)) == + 0) ? B_NR_ITEMS(tb->FL[h]) : n - 1; + n = dc_size(B_N_CHILD(tb->FL[h], order_L)) / (DC_SIZE + + KEY_SIZE); + set_parameters(tb, h, -n - 1, 0, 0, NULL, -1, -1); + return CARRY_ON; + } + + /* Check whether we can merge S[h] with right neighbor. */ + if (tb->rnum[h] >= vn->vn_nr_item + 1) { + int n; + int order_R; + + order_R = + ((n = + PATH_H_B_ITEM_ORDER(tb->tb_path, + h)) == B_NR_ITEMS(Fh)) ? 0 : (n + 1); + n = dc_size(B_N_CHILD(tb->FR[h], order_R)) / (DC_SIZE + + KEY_SIZE); + set_parameters(tb, h, 0, -n - 1, 0, NULL, -1, -1); + return CARRY_ON; + } + + /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). 
*/
+	if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
+		int to_r;
+
+		to_r =
+		    ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] +
+		     vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 -
+						tb->rnum[h]);
+		set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL,
+			       -1, -1);
+		return CARRY_ON;
+	}
+
+	/* For internal nodes try to borrow item from a neighbor */
+	RFALSE(!tb->FL[h] && !tb->FR[h], "vs-8235: trying to borrow for root");
+
+	/*
+	 * Borrow one or two items from a neighbor: the left one when it is
+	 * in cache or when there is no FR[h], the right one otherwise
+	 */
+	if (is_left_neighbor_in_cache(tb, h) || !tb->FR[h]) {
+		int from_l;
+
+		from_l =
+		    (MAX_NR_KEY(Sh) + 1 - tb->lnum[h] + vn->vn_nr_item +
+		     1) / 2 - (vn->vn_nr_item + 1);
+		set_parameters(tb, h, -from_l, 0, 1, NULL, -1, -1);
+		return CARRY_ON;
+	}
+
+	set_parameters(tb, h, 0,
+		       -((MAX_NR_KEY(Sh) + 1 - tb->rnum[h] + vn->vn_nr_item +
+			  1) / 2 - (vn->vn_nr_item + 1)), 1, NULL, -1, -1);
+	return CARRY_ON;
+}
+
+/*
+ * Check whether current node S[h] is balanced when Decreasing its size by
+ * Deleting or Truncating for LEAF node of S+tree.
+ * Calculate parameters for balancing for current level h.
+ * Parameters:
+ *	tb	tree_balance structure;
+ *	h	current level of the node.  The node and its parent are
+ *		always taken from level 0 of the path; h indexes
+ *		insert_size[], FL[] and FR[] and is handed to the helpers.
+ * (The item number and the operation mode are not arguments here.)
+ *
+ * Order of the checks: a leaf that is the root is never balanced; then try
+ * to merge three leaves into one, to merge S[0] with its left or with its
+ * right neighbor, and to move all of S[0] into both neighbors.  If none of
+ * these applies S[0] is left as it is.
+ *
+ * Returns:	1 - schedule occurred;
+ *	        0 - balancing for higher levels needed;
+ *	       -1 - no balancing for higher levels needed;
+ *	       -2 - no disk space.
+ */
+static int dc_check_balance_leaf(struct tree_balance *tb, int h)
+{
+	struct virtual_node *vn = tb->tb_vn;
+
+	/*
+	 * Number of bytes that must be deleted from
+	 * (value is negative if bytes are deleted) buffer which
+	 * contains node being balanced.  The mnemonic is that the
+	 * attempted change in node space used level is levbytes bytes.
+	 */
+	int levbytes;
+
+	/* the maximal item size */
+	int maxsize, ret;
+
+	/*
+	 * S0 is the node whose balance is currently being checked,
+	 * and F0 is its father.
+ */ + struct buffer_head *S0, *F0; + int lfree, rfree /* free space in L and R */ ; + + S0 = PATH_H_PBUFFER(tb->tb_path, 0); + F0 = PATH_H_PPARENT(tb->tb_path, 0); + + levbytes = tb->insert_size[h]; + + maxsize = MAX_CHILD_SIZE(S0); /* maximal possible size of an item */ + + if (!F0) { /* S[0] is the root now. */ + + RFALSE(-levbytes >= maxsize - B_FREE_SPACE(S0), + "vs-8240: attempt to create empty buffer tree"); + + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; + } + + if ((ret = get_parents(tb, h)) != CARRY_ON) + return ret; + + /* get free space of neighbors */ + rfree = get_rfree(tb, h); + lfree = get_lfree(tb, h); + + create_virtual_node(tb, h); + + /* if 3 leaves can be merge to one, set parameters and return */ + if (are_leaves_removable(tb, lfree, rfree)) + return CARRY_ON; + + /* + * determine maximal number of items we can shift to the left/right + * neighbor and the maximal number of bytes that can flow to the + * left/right neighbor from the left/right most liquid item that + * cannot be shifted from S[0] entirely + */ + check_left(tb, h, lfree); + check_right(tb, h, rfree); + + /* check whether we can merge S with left neighbor. */ + if (tb->lnum[0] >= vn->vn_nr_item && tb->lbytes == -1) + if (is_left_neighbor_in_cache(tb, h) || ((tb->rnum[0] - ((tb->rbytes == -1) ? 0 : 1)) < vn->vn_nr_item) || /* S can not be merged with R */ + !tb->FR[h]) { + + RFALSE(!tb->FL[h], + "vs-8245: dc_check_balance_leaf: FL[h] must exist"); + + /* set parameter to merge S[0] with its left neighbor */ + set_parameters(tb, h, -1, 0, 0, NULL, -1, -1); + return CARRY_ON; + } + + /* check whether we can merge S[0] with right neighbor. */ + if (tb->rnum[0] >= vn->vn_nr_item && tb->rbytes == -1) { + set_parameters(tb, h, 0, -1, 0, NULL, -1, -1); + return CARRY_ON; + } + + /* + * All contents of S[0] can be moved to the neighbors (L[0] & R[0]). 
+	 * Set parameters and return
+	 */
+	if (is_leaf_removable(tb))
+		return CARRY_ON;
+
+	/* Balancing is not required. */
+	tb->s0num = vn->vn_nr_item;
+	set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
+	return NO_BALANCING_NEEDED;
+}
+
+/*
+ * Check whether current node S[h] is balanced when Decreasing its size by
+ * Deleting or Cutting.
+ * Calculate parameters for balancing for current level h.
+ *
+ * This is only a dispatcher: level 0 is handled by
+ * dc_check_balance_leaf(), every other level by
+ * dc_check_balance_internal(), and their result is returned unchanged.
+ * S[h] must already be present in the path (checked by RFALSE).
+ *
+ * Parameters:
+ *	tb	tree_balance structure;
+ *	h	current level of the node.
+ * (The item number and the operation mode are not arguments here.)
+ * Returns:	1 - schedule occurred;
+ *	        0 - balancing for higher levels needed;
+ *	       -1 - no balancing for higher levels needed;
+ *	       -2 - no disk space.
+ */
+static int dc_check_balance(struct tree_balance *tb, int h)
+{
+	RFALSE(!(PATH_H_PBUFFER(tb->tb_path, h)),
+	       "vs-8250: S is not initialized");
+
+	if (h)
+		return dc_check_balance_internal(tb, h);
+	else
+		return dc_check_balance_leaf(tb, h);
+}
+
+/*
+ * Check whether current node S[h] is balanced.
+ * Calculate parameters for balancing for current level h.
+ * Parameters:
+ *
+ *	tb	tree_balance structure:
+ *
+ *              tb is a large structure that must be read about in the header
+ *		file at the same time as this procedure if the reader is
+ *		to successfully understand this procedure
+ *
+ *	h	current level of the node;
+ *	inum	item number in S[h];
+ *	mode	i - insert, p - paste, d - delete, c - cut.
+ * Returns:	1 - schedule occurred;
+ *	        0 - balancing for higher levels needed;
+ *	       -1 - no balancing for higher levels needed;
+ *	       -2 - no disk space.
+ */
+static int check_balance(int mode,
+			 struct tree_balance *tb,
+			 int h,
+			 int inum,
+			 int pos_in_item,
+			 struct item_head *ins_ih, const void *data)
+{
+	struct virtual_node *vn;
+	/*
+	 * The virtual node header is placed at the start of tb->vn_buf and
+	 * vn_free_ptr is pointed just past it.  The description of the
+	 * operation (mode, affected item, position in the item, new item
+	 * head and data) is stored there for the ip_/dc_ checkers, which
+	 * take only tb and h.
+	 */
+	vn = tb->tb_vn = (struct virtual_node *)(tb->vn_buf);
+	vn->vn_free_ptr = (char *)(tb->tb_vn + 1);
+	vn->vn_mode = mode;
+	vn->vn_affected_item_num = inum;
+	vn->vn_pos_in_item = pos_in_item;
+	vn->vn_ins_ih = ins_ih;
+	vn->vn_data = data;
+
+	RFALSE(mode == M_INSERT && !vn->vn_ins_ih,
+	       "vs-8255: ins_ih can not be 0 in insert mode");
+
+	/* Calculate balance parameters when size of node is increasing. */
+	if (tb->insert_size[h] > 0)
+		return ip_check_balance(tb, h);
+
+	/* Calculate balance parameters when size of node is decreasing. */
+	return dc_check_balance(tb, h);
+}
+
+/*
+ * Check whether the parent recorded in the path really is the parent of
+ * the current node S[h].
+ *
+ * Returns CARRY_ON when the path entry can be trusted and REPEAT_SEARCH
+ * when the path is stale: the root block changed, the recorded parent is
+ * no longer in the tree, the recorded position is beyond the parent's item
+ * count, the parent's child pointer at that position names another block,
+ * or the filesystem changed while waiting for a locked parent buffer.
+ * For the root (or a root about to be created) the parent slot of the path
+ * is cleared instead.
+ */
+static int get_direct_parent(struct tree_balance *tb, int h)
+{
+	struct buffer_head *bh;
+	struct treepath *path = tb->tb_path;
+	int position,
+	    path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h);
+
+	/* We are in the root or in the new root. */
+	if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
+
+		RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET - 1,
+		       "PAP-8260: invalid offset in the path");
+
+		if (PATH_OFFSET_PBUFFER(path, FIRST_PATH_ELEMENT_OFFSET)->
+		    b_blocknr == SB_ROOT_BLOCK(tb->tb_sb)) {
+			/* Root is not changed. */
+			PATH_OFFSET_PBUFFER(path, path_offset - 1) = NULL;
+			PATH_OFFSET_POSITION(path, path_offset - 1) = 0;
+			return CARRY_ON;
+		}
+		/* Root is changed and we must recalculate the path. */
+		return REPEAT_SEARCH;
+	}
+
+	/* Parent in the path is not in the tree. */
+	if (!B_IS_IN_TREE
+	    (bh = PATH_OFFSET_PBUFFER(path, path_offset - 1)))
+		return REPEAT_SEARCH;
+
+	if ((position =
+	     PATH_OFFSET_POSITION(path,
+				  path_offset - 1)) > B_NR_ITEMS(bh))
+		return REPEAT_SEARCH;
+
+	/* Parent in the path is not parent of the current node in the tree.
*/
+	if (B_N_CHILD_NUM(bh, position) !=
+	    PATH_OFFSET_PBUFFER(path, path_offset)->b_blocknr)
+		return REPEAT_SEARCH;
+
+	if (buffer_locked(bh)) {
+		/* the write lock is dropped for the duration of the wait */
+		int depth = reiserfs_write_unlock_nested(tb->tb_sb);
+		__wait_on_buffer(bh);
+		reiserfs_write_lock_nested(tb->tb_sb, depth);
+		if (FILESYSTEM_CHANGED_TB(tb))
+			return REPEAT_SEARCH;
+	}
+
+	/*
+	 * Parent in the path is unlocked and really parent
+	 * of the current node.
+	 */
+	return CARRY_ON;
+}
+
+/*
+ * Using lnum[h] and rnum[h] we determine which neighbors of S[h] are
+ * needed in order to balance S[h], and read them in if necessary.
+ *
+ * A non-zero lnum[h] makes the left neighbor be read through FL[h] and
+ * stored in tb->L[h]; a non-zero rnum[h] does the same for the right
+ * neighbor through FR[h] and tb->R[h].  The buffer previously held in
+ * L[h]/R[h] is released with brelse().  The write lock is dropped around
+ * each sb_bread() call.
+ *
+ * Returns:	CARRY_ON - the needed neighbors are in tb->L[h] / tb->R[h];
+ *		IO_ERROR - sb_bread() failed to return a buffer;
+ *		REPEAT_SEARCH - the filesystem changed while a neighbor was
+ *		being read, so the analysis must be redone.
+ */
+static int get_neighbors(struct tree_balance *tb, int h)
+{
+	int child_position,
+	    path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h + 1);
+	unsigned long son_number;
+	struct super_block *sb = tb->tb_sb;
+	struct buffer_head *bh;
+	int depth;
+
+	PROC_INFO_INC(sb, get_neighbors[h]);
+
+	if (tb->lnum[h]) {
+		/* We need left neighbor to balance S[h]. */
+		PROC_INFO_INC(sb, need_l_neighbor[h]);
+		bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset);
+
+		RFALSE(bh == tb->FL[h] &&
+		       !PATH_OFFSET_POSITION(tb->tb_path, path_offset),
+		       "PAP-8270: invalid position in the parent");
+
+		child_position =
+		    (bh ==
+		     tb->FL[h]) ?
tb->lkey[h] : B_NR_ITEMS(tb-> + FL[h]); + son_number = B_N_CHILD_NUM(tb->FL[h], child_position); + depth = reiserfs_write_unlock_nested(tb->tb_sb); + bh = sb_bread(sb, son_number); + reiserfs_write_lock_nested(tb->tb_sb, depth); + if (!bh) + return IO_ERROR; + if (FILESYSTEM_CHANGED_TB(tb)) { + brelse(bh); + PROC_INFO_INC(sb, get_neighbors_restart[h]); + return REPEAT_SEARCH; + } + + RFALSE(!B_IS_IN_TREE(tb->FL[h]) || + child_position > B_NR_ITEMS(tb->FL[h]) || + B_N_CHILD_NUM(tb->FL[h], child_position) != + bh->b_blocknr, "PAP-8275: invalid parent"); + RFALSE(!B_IS_IN_TREE(bh), "PAP-8280: invalid child"); + RFALSE(!h && + B_FREE_SPACE(bh) != + MAX_CHILD_SIZE(bh) - + dc_size(B_N_CHILD(tb->FL[0], child_position)), + "PAP-8290: invalid child size of left neighbor"); + + brelse(tb->L[h]); + tb->L[h] = bh; + } + + /* We need right neighbor to balance S[path_offset]. */ + if (tb->rnum[h]) { + PROC_INFO_INC(sb, need_r_neighbor[h]); + bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset); + + RFALSE(bh == tb->FR[h] && + PATH_OFFSET_POSITION(tb->tb_path, + path_offset) >= + B_NR_ITEMS(bh), + "PAP-8295: invalid position in the parent"); + + child_position = + (bh == tb->FR[h]) ? 
tb->rkey[h] + 1 : 0;
+		son_number = B_N_CHILD_NUM(tb->FR[h], child_position);
+		depth = reiserfs_write_unlock_nested(tb->tb_sb);
+		bh = sb_bread(sb, son_number);
+		reiserfs_write_lock_nested(tb->tb_sb, depth);
+		if (!bh)
+			return IO_ERROR;
+		if (FILESYSTEM_CHANGED_TB(tb)) {
+			brelse(bh);
+			PROC_INFO_INC(sb, get_neighbors_restart[h]);
+			return REPEAT_SEARCH;
+		}
+		brelse(tb->R[h]);
+		tb->R[h] = bh;
+
+		RFALSE(!h
+		       && B_FREE_SPACE(bh) !=
+		       MAX_CHILD_SIZE(bh) -
+		       dc_size(B_N_CHILD(tb->FR[0], child_position)),
+		       "PAP-8300: invalid child size of right neighbor (%d != %d - %d)",
+		       B_FREE_SPACE(bh), MAX_CHILD_SIZE(bh),
+		       dc_size(B_N_CHILD(tb->FR[0], child_position)));
+
+	}
+	return CARRY_ON;
+}
+/*
+ * Upper bound, in bytes, of the memory a virtual node can need for a
+ * block of sb->s_blocksize bytes: the virtual_node header plus the larger
+ * of
+ *  - one virtual_item for every item a block can hold when each item
+ *    takes IH_SIZE + MIN_ITEM_LEN bytes, and
+ *  - a single virtual_item describing a directory item with as many
+ *    entries as fit when each takes DEH_SIZE + MIN_NAME_LEN bytes (a
+ *    direntry_uarea plus one __u16 per entry beyond the first).
+ *
+ * bh is not used by the computation.
+ */
+static int get_virtual_node_size(struct super_block *sb, struct buffer_head *bh)
+{
+	int max_num_of_items;
+	int max_num_of_entries;
+	unsigned long blocksize = sb->s_blocksize;
+
+#define MIN_NAME_LEN 1
+
+	max_num_of_items = (blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN);
+	max_num_of_entries = (blocksize - BLKH_SIZE - IH_SIZE) /
+	    (DEH_SIZE + MIN_NAME_LEN);
+
+	return sizeof(struct virtual_node) +
+	    max(max_num_of_items * sizeof(struct virtual_item),
+		sizeof(struct virtual_item) + sizeof(struct direntry_uarea) +
+		(max_num_of_entries - 1) * sizeof(__u16));
+}
+
+/*
+ * maybe we should fail balancing we are going to perform when kmalloc
+ * fails several times.
But now it will loop until kmalloc gets
+ * required memory
+ */
+static int get_mem_for_virtual_node(struct tree_balance *tb)
+{
+	int check_fs = 0;
+	int size;
+	char *buf;
+	/*
+	 * Make sure tb->vn_buf can hold the virtual node.  The existing
+	 * buffer is kept when it is already large enough.
+	 *
+	 * Returns CARRY_ON when the buffer is usable and the analysis may
+	 * continue.  Returns REPEAT_SEARCH when the old buffer was freed and
+	 * the filesystem changed meanwhile, and always after the fallback
+	 * allocation: the buffers collected in tb were released for it, so
+	 * the search must be redone even if that allocation succeeded.  If
+	 * the fallback fails as well, tb->vn_buf is NULL and
+	 * tb->vn_buf_size is 0 on return.
+	 */
+	size = get_virtual_node_size(tb->tb_sb, PATH_PLAST_BUFFER(tb->tb_path));
+
+	/* we have to allocate more memory for virtual node */
+	if (size > tb->vn_buf_size) {
+		if (tb->vn_buf) {
+			/* free memory allocated before */
+			kfree(tb->vn_buf);
+			/* this is not needed if kfree is atomic */
+			check_fs = 1;
+		}
+
+		/* virtual node requires now more memory */
+		tb->vn_buf_size = size;
+
+		/* get memory for virtual item; first try without sleeping */
+		buf = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN);
+		if (!buf) {
+			/*
+			 * getting memory with GFP_KERNEL priority may involve
+			 * balancing now (due to indirect_to_direct conversion
+			 * on dcache shrinking). So, release path and collected
+			 * resources here.  (The retry below passes GFP_NOFS.)
+			 */
+			free_buffers_in_tb(tb);
+			buf = kmalloc(size, GFP_NOFS);
+			if (!buf) {
+				tb->vn_buf_size = 0;
+			}
+			tb->vn_buf = buf;
+			schedule();
+			return REPEAT_SEARCH;
+		}
+
+		tb->vn_buf = buf;
+	}
+
+	if (check_fs && FILESYSTEM_CHANGED_TB(tb))
+		return REPEAT_SEARCH;
+
+	return CARRY_ON;
+}
+/*
+ * Consistency check of one buffer collected for balancing; the checking
+ * version is built only with CONFIG_REISERFS_CHECK.  A NULL bh is accepted
+ * silently.  Otherwise reiserfs_panic() is called when the buffer has a
+ * reference count of zero or less, is not up to date, is not in the tree,
+ * belongs to a device other than sb->s_bdev, has a size other than
+ * sb->s_blocksize, or has a block number above SB_BLOCK_COUNT(sb).
+ * descr and level only label the buffer in the message.
+ */
+#ifdef CONFIG_REISERFS_CHECK
+static void tb_buffer_sanity_check(struct super_block *sb,
+				   struct buffer_head *bh,
+				   const char *descr, int level)
+{
+	if (bh) {
+		if (atomic_read(&(bh->b_count)) <= 0)
+
+			reiserfs_panic(sb, "jmacd-1", "negative or zero "
+				       "reference counter for buffer %s[%d] "
+				       "(%b)", descr, level, bh);
+
+		if (!buffer_uptodate(bh))
+			reiserfs_panic(sb, "jmacd-2", "buffer is not up "
+				       "to date %s[%d] (%b)",
+				       descr, level, bh);
+
+		if (!B_IS_IN_TREE(bh))
+			reiserfs_panic(sb, "jmacd-3", "buffer is not "
+				       "in tree %s[%d] (%b)",
+				       descr, level, bh);
+
+		if (bh->b_bdev != sb->s_bdev)
+			reiserfs_panic(sb, "jmacd-4", "buffer has wrong "
+				       "device %s[%d] (%b)",
+				       descr, level, bh);
+
+		if (bh->b_size != sb->s_blocksize)
+			reiserfs_panic(sb, "jmacd-5", "buffer has wrong "
+				       "blocksize %s[%d] (%b)",
+				       descr, level, bh);
+
+		if
(bh->b_blocknr > SB_BLOCK_COUNT(sb))
+			reiserfs_panic(sb, "jmacd-6", "buffer block "
+				       "number too high %s[%d] (%b)",
+				       descr, level, bh);
+	}
+}
+#else
+static void tb_buffer_sanity_check(struct super_block *sb,
+				   struct buffer_head *bh,
+				   const char *descr, int level)
+{;				/* checks are compiled out */
+}
+#endif
+/*
+ * Thin wrapper: hands bh to reiserfs_prepare_for_journal() with a last
+ * argument of 0 and returns its result.  The caller below treats a zero
+ * result as "this buffer is locked, wait for it".
+ */
+static int clear_all_dirty_bits(struct super_block *s, struct buffer_head *bh)
+{
+	return reiserfs_prepare_for_journal(s, bh, 0);
+}
+/*
+ * Run clear_all_dirty_bits() over every buffer gathered in tb and wait
+ * until none of them is reported as locked.
+ *
+ * Each pass looks, in this order, at the buffers of the path, then for
+ * every level with a non-zero insert_size[] at L/FL/CFL (when lnum[] is
+ * set) and R/FR/CFR (when rnum[] is set), and finally at the FEB[] list.
+ * The pass stops at the first locked buffer; the write lock is dropped
+ * while waiting for it and the scan then starts over.
+ *
+ * Returns CARRY_ON once a pass finds no locked buffer, and REPEAT_SEARCH
+ * if the filesystem changed during a wait.
+ */
+static int wait_tb_buffers_until_unlocked(struct tree_balance *tb)
+{
+	struct buffer_head *locked;
+#ifdef CONFIG_REISERFS_CHECK
+	int repeat_counter = 0;
+#endif
+	int i;
+
+	do {
+
+		locked = NULL;
+
+		for (i = tb->tb_path->path_length;
+		     !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i--) {
+			if (PATH_OFFSET_PBUFFER(tb->tb_path, i)) {
+				/*
+				 * if I understand correctly, we can only
+				 * be sure the last buffer in the path is
+				 * in the tree --clm
+				 */
+#ifdef CONFIG_REISERFS_CHECK
+				if (PATH_PLAST_BUFFER(tb->tb_path) ==
+				    PATH_OFFSET_PBUFFER(tb->tb_path, i))
+					tb_buffer_sanity_check(tb->tb_sb,
+							       PATH_OFFSET_PBUFFER
+							       (tb->tb_path,
+								i), "S",
+							       tb->tb_path->
+							       path_length - i);
+#endif
+				if (!clear_all_dirty_bits(tb->tb_sb,
+							  PATH_OFFSET_PBUFFER
+							  (tb->tb_path,
+							   i))) {
+					locked =
+					    PATH_OFFSET_PBUFFER(tb->tb_path,
+								i);
+				}
+			}
+		}
+
+		for (i = 0; !locked && i < MAX_HEIGHT && tb->insert_size[i];
+		     i++) {
+
+			if (tb->lnum[i]) {
+
+				if (tb->L[i]) {
+					tb_buffer_sanity_check(tb->tb_sb,
+							       tb->L[i],
+							       "L", i);
+					if (!clear_all_dirty_bits
+					    (tb->tb_sb, tb->L[i]))
+						locked = tb->L[i];
+				}
+
+				if (!locked && tb->FL[i]) {
+					tb_buffer_sanity_check(tb->tb_sb,
+							       tb->FL[i],
+							       "FL", i);
+					if (!clear_all_dirty_bits
+					    (tb->tb_sb, tb->FL[i]))
+						locked = tb->FL[i];
+				}
+
+				if (!locked && tb->CFL[i]) {
+					tb_buffer_sanity_check(tb->tb_sb,
+							       tb->CFL[i],
+							       "CFL", i);
+					if (!clear_all_dirty_bits
+					    (tb->tb_sb, tb->CFL[i]))
+						locked = tb->CFL[i];
+				}
+
+			}
+
+			if (!locked && (tb->rnum[i])) {
+
+				if (tb->R[i]) {
+					tb_buffer_sanity_check(tb->tb_sb,
+							       tb->R[i],
+							       "R", i);
+
if (!clear_all_dirty_bits + (tb->tb_sb, tb->R[i])) + locked = tb->R[i]; + } + + if (!locked && tb->FR[i]) { + tb_buffer_sanity_check(tb->tb_sb, + tb->FR[i], + "FR", i); + if (!clear_all_dirty_bits + (tb->tb_sb, tb->FR[i])) + locked = tb->FR[i]; + } + + if (!locked && tb->CFR[i]) { + tb_buffer_sanity_check(tb->tb_sb, + tb->CFR[i], + "CFR", i); + if (!clear_all_dirty_bits + (tb->tb_sb, tb->CFR[i])) + locked = tb->CFR[i]; + } + } + } + + /* + * as far as I can tell, this is not required. The FEB list + * seems to be full of newly allocated nodes, which will + * never be locked, dirty, or anything else. + * To be safe, I'm putting in the checks and waits in. + * For the moment, they are needed to keep the code in + * journal.c from complaining about the buffer. + * That code is inside CONFIG_REISERFS_CHECK as well. --clm + */ + for (i = 0; !locked && i < MAX_FEB_SIZE; i++) { + if (tb->FEB[i]) { + if (!clear_all_dirty_bits + (tb->tb_sb, tb->FEB[i])) + locked = tb->FEB[i]; + } + } + + if (locked) { + int depth; +#ifdef CONFIG_REISERFS_CHECK + repeat_counter++; + if ((repeat_counter % 10000) == 0) { + reiserfs_warning(tb->tb_sb, "reiserfs-8200", + "too many iterations waiting " + "for buffer to unlock " + "(%b)", locked); + + /* Don't loop forever. Try to recover from possible error. */ + + return (FILESYSTEM_CHANGED_TB(tb)) ? + REPEAT_SEARCH : CARRY_ON; + } +#endif + depth = reiserfs_write_unlock_nested(tb->tb_sb); + __wait_on_buffer(locked); + reiserfs_write_lock_nested(tb->tb_sb, depth); + if (FILESYSTEM_CHANGED_TB(tb)) + return REPEAT_SEARCH; + } + + } while (locked); + + return CARRY_ON; +} + +/* + * Prepare for balancing, that is + * get all necessary parents, and neighbors; + * analyze what and where should be moved; + * get sufficient number of new nodes; + * Balancing will start only after all resources have been collected.
+ * + * When ported to SMP kernels, only at the last moment after all needed nodes + * are collected in cache, will the resources be locked using the usual + * textbook ordered lock acquisition algorithms. Note that ensuring that + * this code neither write locks what it does not need to write lock nor locks + * out of order will be a pain in the butt that could have been avoided. + * Grumble grumble. -Hans + * + * fix is meant in the sense of render unchanging + * + * Latency might be improved by first gathering a list of what buffers + * are needed and then getting as many of them in parallel as possible? -Hans + * + * Parameters: + * op_mode i - insert, d - delete, c - cut (truncate), p - paste (append) + * tb tree_balance structure; + * ins_ih item head of item being inserted + * data inserted item or data to be pasted + * The item number in S[0] and the position in the item are not passed in; + * they are read from tb->tb_path (PATH_LAST_POSITION() and pos_in_item). + * Returns: CARRY_ON - analysis and resource gathering are complete; + * REPEAT_SEARCH - the filesystem changed while the function worked, + * the search has to be repeated; + * otherwise the non-CARRY_ON value reported by a helper (the + * error path names lack of free disk space and i/o failure), + * after the resources gathered so far have been released. + */ + +int fix_nodes(int op_mode, struct tree_balance *tb, + struct item_head *ins_ih, const void *data) +{ + int ret, h, item_num = PATH_LAST_POSITION(tb->tb_path); + int pos_in_item; + + /* + * we set wait_tb_buffers_run once wait_tb_buffers_until_unlocked() + * has run, so that the error path knows it has to restore the + * prepared buffers (their dirty bits may have been cleared) + */ + int wait_tb_buffers_run = 0; + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + + ++REISERFS_SB(tb->tb_sb)->s_fix_nodes; + + pos_in_item = tb->tb_path->pos_in_item; + + tb->fs_gen = get_generation(tb->tb_sb); + + /* + * we prepare and log the super here so it will already be in the + * transaction when do_balance needs to change it.
+ * This way do_balance won't have to schedule when trying to prepare + * the super for logging + */ + reiserfs_prepare_for_journal(tb->tb_sb, + SB_BUFFER_WITH_SB(tb->tb_sb), 1); + journal_mark_dirty(tb->transaction_handle, + SB_BUFFER_WITH_SB(tb->tb_sb)); + if (FILESYSTEM_CHANGED_TB(tb)) + return REPEAT_SEARCH; + + /* + * S[0] may still be locked (possibly during indirect_to_direct + * conversion); wait for it and repeat the search if the + * filesystem changed meanwhile + */ + if (buffer_locked(tbS0)) { + int depth = reiserfs_write_unlock_nested(tb->tb_sb); + __wait_on_buffer(tbS0); + reiserfs_write_lock_nested(tb->tb_sb, depth); + if (FILESYSTEM_CHANGED_TB(tb)) + return REPEAT_SEARCH; + } +#ifdef CONFIG_REISERFS_CHECK + if (REISERFS_SB(tb->tb_sb)->cur_tb) { + print_cur_tb("fix_nodes"); + reiserfs_panic(tb->tb_sb, "PAP-8305", + "there is pending do_balance"); + } + + if (!buffer_uptodate(tbS0) || !B_IS_IN_TREE(tbS0)) + reiserfs_panic(tb->tb_sb, "PAP-8320", "S[0] (%b %z) is " + "not uptodate at the beginning of fix_nodes " + "or not in tree (mode %c)", + tbS0, tbS0, op_mode); + + /* Check parameters. */ + switch (op_mode) { + case M_INSERT: + if (item_num <= 0 || item_num > B_NR_ITEMS(tbS0)) + reiserfs_panic(tb->tb_sb, "PAP-8330", "Incorrect " + "item number %d (in S0 - %d) in case " + "of insert", item_num, + B_NR_ITEMS(tbS0)); + break; + case M_PASTE: + case M_DELETE: + case M_CUT: + if (item_num < 0 || item_num >= B_NR_ITEMS(tbS0)) { + print_block(tbS0, 0, -1, -1); + reiserfs_panic(tb->tb_sb, "PAP-8335", "Incorrect " + "item number(%d); mode = %c " + "insert_size = %d", + item_num, op_mode, + tb->insert_size[0]); + } + break; + default: + reiserfs_panic(tb->tb_sb, "PAP-8340", "Incorrect mode " + "of operation"); + } +#endif + + if (get_mem_for_virtual_node(tb) == REPEAT_SEARCH) + /* FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat */ + return REPEAT_SEARCH; + + /* Starting from the leaf level; for all levels h of the tree.
*/ + for (h = 0; h < MAX_HEIGHT && tb->insert_size[h]; h++) { + ret = get_direct_parent(tb, h); + if (ret != CARRY_ON) + goto repeat; + + ret = check_balance(op_mode, tb, h, item_num, + pos_in_item, ins_ih, data); + if (ret != CARRY_ON) { + if (ret == NO_BALANCING_NEEDED) { + /* No balancing for higher levels needed. */ + ret = get_neighbors(tb, h); + if (ret != CARRY_ON) + goto repeat; + if (h != MAX_HEIGHT - 1) + tb->insert_size[h + 1] = 0; + /* + * ok, analysis and resource gathering + * are complete + */ + break; + } + goto repeat; + } + + ret = get_neighbors(tb, h); + if (ret != CARRY_ON) + goto repeat; + + /* + * A non-CARRY_ON result here means: no disk space, or + * schedule occurred and analysis may be invalid and needs + * to be redone. + */ + ret = get_empty_nodes(tb, h); + if (ret != CARRY_ON) + goto repeat; + + /* + * We have a positive insert size but no nodes exist on this + * level, this means that we are creating a new root. + */ + if (!PATH_H_PBUFFER(tb->tb_path, h)) { + + RFALSE(tb->blknum[h] != 1, + "PAP-8350: creating new empty root"); + + if (h < MAX_HEIGHT - 1) + tb->insert_size[h + 1] = 0; + } else if (!PATH_H_PBUFFER(tb->tb_path, h + 1)) { + /* + * The tree needs to be grown, so this node S[h] + * which is the root node is split into two nodes, + * and a new node (S[h+1]) will be created to + * become the root node.
+ */ + if (tb->blknum[h] > 1) { + + RFALSE(h == MAX_HEIGHT - 1, + "PAP-8355: attempt to create too high of a tree"); + + tb->insert_size[h + 1] = + (DC_SIZE + + KEY_SIZE) * (tb->blknum[h] - 1) + + DC_SIZE; + } else if (h < MAX_HEIGHT - 1) + tb->insert_size[h + 1] = 0; + } else + tb->insert_size[h + 1] = + (DC_SIZE + KEY_SIZE) * (tb->blknum[h] - 1); + } + + ret = wait_tb_buffers_until_unlocked(tb); + if (ret == CARRY_ON) { + if (FILESYSTEM_CHANGED_TB(tb)) { + wait_tb_buffers_run = 1; + ret = REPEAT_SEARCH; + goto repeat; + } else { + return CARRY_ON; + } + } else { + wait_tb_buffers_run = 1; + goto repeat; + } + +repeat: + /* + * fix_nodes was unable to perform its calculation because the + * filesystem got changed under us, or because of lack of free disk + * space or an i/o failure. If the first is the case - the search + * will be repeated. For now - free all resources acquired so far + * except for the newly allocated nodes (the FEB buffers are only + * restored below, not released) + */ + { + int i; + + /* Release path buffers. */ + if (wait_tb_buffers_run) { + pathrelse_and_restore(tb->tb_sb, tb->tb_path); + } else { + pathrelse(tb->tb_path); + } + /* brelse all resources collected for balancing */ + for (i = 0; i < MAX_HEIGHT; i++) { + if (wait_tb_buffers_run) { + reiserfs_restore_prepared_buffer(tb->tb_sb, + tb->L[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, + tb->R[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, + tb->FL[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, + tb->FR[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, + tb-> + CFL[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, + tb-> + CFR[i]); + } + + brelse(tb->L[i]); + brelse(tb->R[i]); + brelse(tb->FL[i]); + brelse(tb->FR[i]); + brelse(tb->CFL[i]); + brelse(tb->CFR[i]); + + tb->L[i] = NULL; + tb->R[i] = NULL; + tb->FL[i] = NULL; + tb->FR[i] = NULL; + tb->CFL[i] = NULL; + tb->CFR[i] = NULL; + } + + if (wait_tb_buffers_run) { + for (i = 0; i < MAX_FEB_SIZE; i++) { + if (tb->FEB[i]) + reiserfs_restore_prepared_buffer + (tb->tb_sb, tb->FEB[i]); + } + } + return
ret; + } + +} + +void unfix_nodes(struct tree_balance *tb) +{ + int i; + + /* + * Release everything that is still referenced by tb. + * Path buffers first. + */ + pathrelse_and_restore(tb->tb_sb, tb->tb_path); + + /* restore, then brelse, all resources collected for balancing */ + for (i = 0; i < MAX_HEIGHT; i++) { + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->L[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->R[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FL[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FR[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFL[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFR[i]); + + brelse(tb->L[i]); + brelse(tb->R[i]); + brelse(tb->FL[i]); + brelse(tb->FR[i]); + brelse(tb->CFL[i]); + brelse(tb->CFR[i]); + } + + /* deal with list of allocated (used and unused) nodes */ + for (i = 0; i < MAX_FEB_SIZE; i++) { + if (tb->FEB[i]) { + b_blocknr_t blocknr = tb->FEB[i]->b_blocknr; + /* + * free the block which was not used by balancing + * and release (brelse) the buffer for it + */ + brelse(tb->FEB[i]); + reiserfs_free_block(tb->transaction_handle, NULL, + blocknr, 0); + } + if (tb->used[i]) { + /* release used as new nodes including a new root */ + brelse(tb->used[i]); + } + } + + kfree(tb->vn_buf); + +} diff --git a/fs/reiserfs/hashes.c b/fs/reiserfs/hashes.c new file mode 100644 index 000000000..7a26c4fe6 --- /dev/null +++ b/fs/reiserfs/hashes.c @@ -0,0 +1,177 @@ + +/* + * Keyed 32-bit hash function using TEA in a Davies-Meyer function + * H0 = Key + * Hi = E Mi(Hi-1) + Hi-1 + * + * (see Applied Cryptography, 2nd edition, p448). + * + * Jeremy Fitzhardinge 1998 + * + * Jeremy has agreed to the contents of reiserfs/README.
-Hans + * Yura's function is added (04/07/2000) + */ + +#include +#include "reiserfs.h" +#include + +#define DELTA 0x9E3779B9 +#define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */ +#define PARTROUNDS 6 /* 6 gets complete mixing */ + +/* + * a, b, c, d - data; h0, h1 - accumulated hash. + * The macro takes these names from the enclosing scope; rounds must be + * at least 1 because the loop is do { } while (--n). + */ +#define TEACORE(rounds) \ + do { \ + u32 sum = 0; \ + int n = rounds; \ + u32 b0, b1; \ + \ + b0 = h0; \ + b1 = h1; \ + \ + do \ + { \ + sum += DELTA; \ + b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); \ + b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); \ + } while(--n); \ + \ + h0 += b0; \ + h1 += b1; \ + } while(0) + +u32 keyed_hash(const signed char *msg, int len) +{ + u32 k[] = { 0x9464a485, 0x542e1a94, 0x3e846bff, 0xb75bcfc3 }; + + u32 h0 = k[0], h1 = k[1]; + u32 a, b, c, d; + u32 pad; + int i; + + /* + * assert(len >= 0 && len < 256); + * + * With len < 256 the two statements below copy len into each of + * the four bytes of pad. + * + * msg is signed char: a negative byte is sign-extended by the + * (u32) casts and by the |= in the tail loops. + * NOTE(review): presumably this has to stay as it is, because + * hashes of existing names depend on it - confirm before changing. + */ + + pad = (u32) len | ((u32) len << 8); + pad |= pad << 16; + + while (len >= 16) { + a = (u32) msg[0] | + (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24; + b = (u32) msg[4] | + (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24; + c = (u32) msg[8] | + (u32) msg[9] << 8 | + (u32) msg[10] << 16 | (u32) msg[11] << 24; + d = (u32) msg[12] | + (u32) msg[13] << 8 | + (u32) msg[14] << 16 | (u32) msg[15] << 24; + + TEACORE(PARTROUNDS); + + len -= 16; + msg += 16; + } + + if (len >= 12) { + a = (u32) msg[0] | + (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24; + b = (u32) msg[4] | + (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24; + c = (u32) msg[8] | + (u32) msg[9] << 8 | + (u32) msg[10] << 16 | (u32) msg[11] << 24; + + d = pad; + for (i = 12; i < len; i++) { + d <<= 8; + d |= msg[i]; + } + } else if (len >= 8) { + a = (u32) msg[0] | + (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24; + b = (u32) msg[4] | + (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24; + + c = d = pad; + for (i = 8; i < len; i++) { + c <<= 8; + c |= msg[i]; + } + } else if (len >= 4) { + a = (u32) msg[0] | +
(u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24; + + b = c = d = pad; + for (i = 4; i < len; i++) { + b <<= 8; + b |= msg[i]; + } + } else { + a = b = c = d = pad; + for (i = 0; i < len; i++) { + a <<= 8; + a |= msg[i]; + } + } + + TEACORE(FULLROUNDS); + +/* return 0;*/ + return h0 ^ h1; +} + +/* + * What follows in this file is copyright 2000 by Hans Reiser, and the + * licensing of what follows is governed by reiserfs/README + */ +/* + * yura_hash: msg is read as a decimal number, every byte minus 48 ('0') + * being weighted by a power of ten according to its position; then i is + * added for each i from max(len, 40) to 255 and the sum is shifted left + * by 7. + * NOTE(review): pow is a signed int, so pow * 10 overflows for len > 10 + * if int is 32 bits - presumably left alone for compatibility; confirm. + */ u32 yura_hash(const signed char *msg, int len) +{ + int j, pow; + u32 a, c; + int i; + + for (pow = 1, i = 1; i < len; i++) + pow = pow * 10; + + if (len == 1) + a = msg[0] - 48; + else + a = (msg[0] - 48) * pow; + + for (i = 1; i < len; i++) { + c = msg[i] - 48; + for (pow = 1, j = i; j < len - 1; j++) + pow = pow * 10; + a = a + c * pow; + } + + for (; i < 40; i++) { + c = '0' - 48; + for (pow = 1, j = i; j < len - 1; j++) + pow = pow * 10; + a = a + c * pow; + } + + for (; i < 256; i++) { + c = i; + for (pow = 1, j = i; j < len - 1; j++) + pow = pow * 10; + a = a + c * pow; + } + + a = a << 7; + return a; +} + +/* + * r5_hash: len is not used, msg is consumed up to its terminating NUL. + * *msg is a signed char, so a negative byte enters both shifts as a + * negative int. + */ u32 r5_hash(const signed char *msg, int len) +{ + u32 a = 0; + while (*msg) { + a += *msg << 4; + a += *msg >> 4; + a *= 11; + msg++; + } + return a; +} diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c new file mode 100644 index 000000000..b751eea32 --- /dev/null +++ b/fs/reiserfs/ibalance.c @@ -0,0 +1,1160 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include +#include +#include +#include "reiserfs.h" +#include + +/* this is the one and only function that is used outside (do_balance.c) */ +int balance_internal(struct tree_balance *, + int, int, struct item_head *, struct buffer_head **); + +/* + * modes of internal_shift_left, internal_shift_right and + * internal_insert_childs + */ +#define INTERNAL_SHIFT_FROM_S_TO_L 0 +#define INTERNAL_SHIFT_FROM_R_TO_S 1 +#define INTERNAL_SHIFT_FROM_L_TO_S 2 +#define INTERNAL_SHIFT_FROM_S_TO_R 3 +#define INTERNAL_INSERT_TO_S 4
+#define INTERNAL_INSERT_TO_L 5 +#define INTERNAL_INSERT_TO_R 6 + +static void internal_define_dest_src_infos(int shift_mode, + struct tree_balance *tb, + int h, + struct buffer_info *dest_bi, + struct buffer_info *src_bi, + int *d_key, struct buffer_head **cf) +{ + memset(dest_bi, 0, sizeof(struct buffer_info)); + memset(src_bi, 0, sizeof(struct buffer_info)); + /* + * define dest, src, their parents and positions for shift_mode. + * The SHIFT modes also set *d_key (position of the delimiting key) + * and *cf (the buffer that holds it: CFL[h] or CFR[h]); the INSERT + * modes fill in dest_bi only: src_bi stays zeroed and *d_key and + * *cf are not written. + */ + switch (shift_mode) { + + /* used in internal_shift_left (S_TO_L) and internal_shift_right (L_TO_S) */ + case INTERNAL_SHIFT_FROM_S_TO_L: + src_bi->tb = tb; + src_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h); + src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h); + src_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1); + dest_bi->tb = tb; + dest_bi->bi_bh = tb->L[h]; + dest_bi->bi_parent = tb->FL[h]; + dest_bi->bi_position = get_left_neighbor_position(tb, h); + *d_key = tb->lkey[h]; + *cf = tb->CFL[h]; + break; + case INTERNAL_SHIFT_FROM_L_TO_S: + src_bi->tb = tb; + src_bi->bi_bh = tb->L[h]; + src_bi->bi_parent = tb->FL[h]; + src_bi->bi_position = get_left_neighbor_position(tb, h); + dest_bi->tb = tb; + dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h); + dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h); + /* dest position is the analog of dest->b_item_order */ + dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1); + *d_key = tb->lkey[h]; + *cf = tb->CFL[h]; + break; + + /* used in internal_shift_left (R_TO_S) and internal_shift_right (S_TO_R) */ + case INTERNAL_SHIFT_FROM_R_TO_S: + src_bi->tb = tb; + src_bi->bi_bh = tb->R[h]; + src_bi->bi_parent = tb->FR[h]; + src_bi->bi_position = get_right_neighbor_position(tb, h); + dest_bi->tb = tb; + dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h); + dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h); + dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1); + *d_key = tb->rkey[h]; + *cf = tb->CFR[h]; + break; + + case INTERNAL_SHIFT_FROM_S_TO_R: + src_bi->tb = tb; + src_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h); + src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h); + src_bi->bi_position
= PATH_H_POSITION(tb->tb_path, h + 1); + dest_bi->tb = tb; + dest_bi->bi_bh = tb->R[h]; + dest_bi->bi_parent = tb->FR[h]; + dest_bi->bi_position = get_right_neighbor_position(tb, h); + *d_key = tb->rkey[h]; + *cf = tb->CFR[h]; + break; + + case INTERNAL_INSERT_TO_L: + dest_bi->tb = tb; + dest_bi->bi_bh = tb->L[h]; + dest_bi->bi_parent = tb->FL[h]; + dest_bi->bi_position = get_left_neighbor_position(tb, h); + break; + + case INTERNAL_INSERT_TO_S: + dest_bi->tb = tb; + dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h); + dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h); + dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1); + break; + + case INTERNAL_INSERT_TO_R: + dest_bi->tb = tb; + dest_bi->bi_bh = tb->R[h]; + dest_bi->bi_parent = tb->FR[h]; + dest_bi->bi_position = get_right_neighbor_position(tb, h); + break; + + default: + reiserfs_panic(tb->tb_sb, "ibalance-1", + "shift type is unknown (%d)", + shift_mode); + } +} + +/* + * Insert count node pointers into buffer cur before position to + 1. + * Insert count items into buffer cur before position to. + * Items and node pointers are specified by inserted and bh respectively.
+ */ +static void internal_insert_childs(struct buffer_info *cur_bi, + int to, int count, + struct item_head *inserted, + struct buffer_head **bh) +{ + struct buffer_head *cur = cur_bi->bi_bh; + struct block_head *blkh; + int nr; + struct reiserfs_key *ih; + struct disk_child new_dc[2]; + struct disk_child *dc; + int i; + + if (count <= 0) + return; + + blkh = B_BLK_HEAD(cur); + nr = blkh_nr_item(blkh); + + RFALSE(count > 2, "too many children (%d) are to be inserted", count); + RFALSE(B_FREE_SPACE(cur) < count * (KEY_SIZE + DC_SIZE), + "no enough free space (%d), needed %d bytes", + B_FREE_SPACE(cur), count * (KEY_SIZE + DC_SIZE)); + + /* prepare space for count disk_child */ + dc = B_N_CHILD(cur, to + 1); + + memmove(dc + count, dc, (nr + 1 - (to + 1)) * DC_SIZE); + + /* + * build the disk_child entries to be inserted (new_dc holds at + * most 2 of them) and copy them into the freed space + */ + for (i = 0; i < count; i++) { + put_dc_size(&new_dc[i], + MAX_CHILD_SIZE(bh[i]) - B_FREE_SPACE(bh[i])); + put_dc_block_number(&new_dc[i], bh[i]->b_blocknr); + } + memcpy(dc, new_dc, DC_SIZE * count); + + /* prepare space for count items */ + ih = internal_key(cur, ((to == -1) ?
0 : to)); + + memmove(ih + count, ih, + (nr - to) * KEY_SIZE + (nr + 1 + count) * DC_SIZE); + + /* + * copy the keys: only the first KEY_SIZE bytes of each item head + * in inserted are taken + */ + memcpy(ih, inserted, KEY_SIZE); + if (count > 1) + memcpy(ih + 1, inserted + 1, KEY_SIZE); + + /* update item number and free space of cur */ + set_blkh_nr_item(blkh, blkh_nr_item(blkh) + count); + set_blkh_free_space(blkh, + blkh_free_space(blkh) - count * (DC_SIZE + + KEY_SIZE)); + + do_balance_mark_internal_dirty(cur_bi->tb, cur, 0); + + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + check_internal(cur); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + + if (cur_bi->bi_parent) { + struct disk_child *t_dc = + B_N_CHILD(cur_bi->bi_parent, cur_bi->bi_position); + put_dc_size(t_dc, + dc_size(t_dc) + (count * (DC_SIZE + KEY_SIZE))); + do_balance_mark_internal_dirty(cur_bi->tb, cur_bi->bi_parent, + 0); + + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + check_internal(cur_bi->bi_parent); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + } + +} + +/* + * Delete del_num items and node pointers from buffer cur starting from + * the first_i'th item and first_p'th pointers respectively.
+ */ +static void internal_delete_pointers_items(struct buffer_info *cur_bi, + int first_p, + int first_i, int del_num) +{ + struct buffer_head *cur = cur_bi->bi_bh; + int nr; + struct block_head *blkh; + struct reiserfs_key *key; + struct disk_child *dc; + + RFALSE(cur == NULL, "buffer is 0"); + RFALSE(del_num < 0, + "negative number of items (%d) can not be deleted", del_num); + RFALSE(first_p < 0 || first_p + del_num > B_NR_ITEMS(cur) + 1 + || first_i < 0, + "first pointer order (%d) < 0 or " + "no so many pointers (%d), only (%d) or " + "first key order %d < 0", first_p, first_p + del_num, + B_NR_ITEMS(cur) + 1, first_i); + if (del_num == 0) + return; + + blkh = B_BLK_HEAD(cur); + nr = blkh_nr_item(blkh); + + if (first_p == 0 && del_num == nr + 1) { + RFALSE(first_i != 0, + "1st deleted key must have order 0, not %d", first_i); + make_empty_node(cur_bi); + return; + } + + RFALSE(first_i + del_num > B_NR_ITEMS(cur), + "first_i = %d del_num = %d " + "no so many keys (%d) in the node (%b)(%z)", + first_i, del_num, first_i + del_num, cur, cur); + + /* + * deleting: close the gap in the pointer array first, then in the + * key array (the second memmove also moves the pointers that + * follow the keys) + */ + dc = B_N_CHILD(cur, first_p); + + memmove(dc, dc + del_num, (nr + 1 - first_p - del_num) * DC_SIZE); + key = internal_key(cur, first_i); + memmove(key, key + del_num, + (nr - first_i - del_num) * KEY_SIZE + (nr + 1 - + del_num) * DC_SIZE); + + /* update item number and free space of cur */ + set_blkh_nr_item(blkh, blkh_nr_item(blkh) - del_num); + set_blkh_free_space(blkh, + blkh_free_space(blkh) + + (del_num * (KEY_SIZE + DC_SIZE))); + + do_balance_mark_internal_dirty(cur_bi->tb, cur, 0); + /*&&&&&&&&&&&&&&&&&&&&&&& */ + check_internal(cur); + /*&&&&&&&&&&&&&&&&&&&&&&& */ + + if (cur_bi->bi_parent) { + struct disk_child *t_dc; + t_dc = B_N_CHILD(cur_bi->bi_parent, cur_bi->bi_position); + put_dc_size(t_dc, + dc_size(t_dc) - (del_num * (KEY_SIZE + DC_SIZE))); + + do_balance_mark_internal_dirty(cur_bi->tb, cur_bi->bi_parent, + 0); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + check_internal(cur_bi->bi_parent); + /*&&&&&&&&&&&&&&&&&&&&&&&&
*/ + } +} + +/* delete n node pointers and items starting from given position */ +static void internal_delete_childs(struct buffer_info *cur_bi, int from, int n) +{ + int i_from; + + i_from = (from == 0) ? from : from - 1; + + /* + * delete n pointers starting from `from' position in CUR; + * delete n keys starting from 'i_from' position in CUR; + * (i_from is the key to the left of pointer `from', or key 0 + * when from == 0) + */ + internal_delete_pointers_items(cur_bi, from, i_from, n); +} + +/* + * copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer + * dest + * last_first == FIRST_TO_LAST means that we copy first items + * from src to tail of dest + * last_first == LAST_TO_FIRST means that we copy last items + * from src to head of dest + */ +static void internal_copy_pointers_items(struct buffer_info *dest_bi, + struct buffer_head *src, + int last_first, int cpy_num) +{ + /* + * ATTENTION! Number of node pointers in DEST is equal to number + * of items in DEST as the delimiting key has already been inserted + * into buffer dest. + */ + struct buffer_head *dest = dest_bi->bi_bh; + int nr_dest, nr_src; + int dest_order, src_order; + struct block_head *blkh; + struct reiserfs_key *key; + struct disk_child *dc; + + nr_src = B_NR_ITEMS(src); + + RFALSE(dest == NULL || src == NULL, + "src (%p) or dest (%p) buffer is 0", src, dest); + RFALSE(last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST, + "invalid last_first parameter (%d)", last_first); + RFALSE(nr_src < cpy_num - 1, + "no so many items (%d) in src (%d)", cpy_num, nr_src); + RFALSE(cpy_num < 0, "cpy_num less than 0 (%d)", cpy_num); + RFALSE(cpy_num - 1 + B_NR_ITEMS(dest) > (int)MAX_NR_KEY(dest), + "cpy_num (%d) + item number in dest (%d) can not be > MAX_NR_KEY(%d)", + cpy_num, B_NR_ITEMS(dest), MAX_NR_KEY(dest)); + + if (cpy_num == 0) + return; + + /* copying */ + blkh = B_BLK_HEAD(dest); + nr_dest = blkh_nr_item(blkh); + + /*dest_order = (last_first == LAST_TO_FIRST) ? 0 : nr_dest; */ + /*src_order = (last_first == LAST_TO_FIRST) ?
(nr_src - cpy_num + 1) : 0; */ + (last_first == LAST_TO_FIRST) ? (dest_order = 0, src_order = + nr_src - cpy_num + 1) : (dest_order = + nr_dest, + src_order = + 0); + + /* + * prepare space for cpy_num pointers: at the head of dest for + * LAST_TO_FIRST (existing pointers are moved up), at the tail for + * FIRST_TO_LAST (dest_order == nr_dest, so nothing is moved) + */ + dc = B_N_CHILD(dest, dest_order); + + memmove(dc + cpy_num, dc, (nr_dest - dest_order) * DC_SIZE); + + /* insert pointers */ + memcpy(dc, B_N_CHILD(src, src_order), DC_SIZE * cpy_num); + + /* prepare space for cpy_num - 1 item headers */ + key = internal_key(dest, dest_order); + memmove(key + cpy_num - 1, key, + KEY_SIZE * (nr_dest - dest_order) + DC_SIZE * (nr_dest + + cpy_num)); + + /* insert headers (keys) */ + memcpy(key, internal_key(src, src_order), KEY_SIZE * (cpy_num - 1)); + + /* update item number and free space of dest */ + set_blkh_nr_item(blkh, blkh_nr_item(blkh) + (cpy_num - 1)); + set_blkh_free_space(blkh, + blkh_free_space(blkh) - (KEY_SIZE * (cpy_num - 1) + + DC_SIZE * cpy_num)); + + do_balance_mark_internal_dirty(dest_bi->tb, dest, 0); + + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + check_internal(dest); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + + if (dest_bi->bi_parent) { + struct disk_child *t_dc; + t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position); + put_dc_size(t_dc, + dc_size(t_dc) + (KEY_SIZE * (cpy_num - 1) + + DC_SIZE * cpy_num)); + + do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent, + 0); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + check_internal(dest_bi->bi_parent); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + } + +} + +/* + * Copy cpy_num node pointers and cpy_num - 1 items from buffer src to + * buffer dest. + * Delete cpy_num - del_par items and node pointers from buffer src. + * last_first == FIRST_TO_LAST means, that we copy/delete first items from src. + * last_first == LAST_TO_FIRST means, that we copy/delete last items from src.
+ */ +static void internal_move_pointers_items(struct buffer_info *dest_bi, + struct buffer_info *src_bi, + int last_first, int cpy_num, + int del_par) +{ + int first_pointer; + int first_item; + + internal_copy_pointers_items(dest_bi, src_bi->bi_bh, last_first, + cpy_num); + + if (last_first == FIRST_TO_LAST) { /* shift_left occurs */ + first_pointer = 0; + first_item = 0; + /* + * delete cpy_num - del_par pointers and keys, starting with + * first_pointer for pointers and with first_item for keys + */ + internal_delete_pointers_items(src_bi, first_pointer, + first_item, cpy_num - del_par); + } else { /* + * shift_right occurs. j is the number of items in src; the + * last cpy_num - del_par pointers are deleted. The first + * deleted key is key 0 when all j + 1 pointers go away, + * key j - cpy_num + del_par otherwise. + */ + int i, j; + + i = (cpy_num - del_par == + (j = + B_NR_ITEMS(src_bi->bi_bh)) + 1) ? 0 : j - cpy_num + + del_par; + + internal_delete_pointers_items(src_bi, + j + 1 - cpy_num + del_par, i, + cpy_num - del_par); + } +} + +/* + * Insert the src_position'th key of buffer src before the + * dest_position_before'th key of buffer dest. + */ +static void internal_insert_key(struct buffer_info *dest_bi, + /* insert key before the key with this number */ + int dest_position_before, + struct buffer_head *src, int src_position) +{ + struct buffer_head *dest = dest_bi->bi_bh; + int nr; + struct block_head *blkh; + struct reiserfs_key *key; + + RFALSE(dest == NULL || src == NULL, + "source(%p) or dest(%p) buffer is 0", src, dest); + RFALSE(dest_position_before < 0 || src_position < 0, + "source(%d) or dest(%d) key number less than 0", + src_position, dest_position_before); + RFALSE(dest_position_before > B_NR_ITEMS(dest) || + src_position >= B_NR_ITEMS(src), + "invalid position in dest (%d (key number %d)) or in src (%d (key number %d))", + dest_position_before, B_NR_ITEMS(dest), + src_position, B_NR_ITEMS(src)); + RFALSE(B_FREE_SPACE(dest) < KEY_SIZE, + "no enough free space (%d) in dest buffer", B_FREE_SPACE(dest)); + + blkh = B_BLK_HEAD(dest); + nr = blkh_nr_item(blkh); + + /* prepare space for inserting key */ + key = internal_key(dest, dest_position_before); + memmove(key + 1, key, + (nr -
dest_position_before) * KEY_SIZE + (nr + 1) * DC_SIZE); + + /* insert key */ + memcpy(key, internal_key(src, src_position), KEY_SIZE); + + /* Update item number and free space fields, mark the buffer dirty. */ + + set_blkh_nr_item(blkh, blkh_nr_item(blkh) + 1); + set_blkh_free_space(blkh, blkh_free_space(blkh) - KEY_SIZE); + + do_balance_mark_internal_dirty(dest_bi->tb, dest, 0); + + if (dest_bi->bi_parent) { + struct disk_child *t_dc; + t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position); + put_dc_size(t_dc, dc_size(t_dc) + KEY_SIZE); + + do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent, + 0); + } +} + +/* + * Insert d_key'th (delimiting) key from buffer cfl to tail of dest. + * Copy pointer_amount node pointers and pointer_amount - 1 items from + * buffer src to buffer dest. + * Replace d_key'th key in buffer cfl. + * Delete pointer_amount items and node pointers from buffer src. + * Here cfl is the buffer returned in cf by internal_define_dest_src_infos + * (CFL[h] when shifting from S to L, CFR[h] when shifting from R to S). + */ +/* this can be invoked both to shift from S to L and from R to S */ +static void internal_shift_left( + /* + * INTERNAL_SHIFT_FROM_S_TO_L | INTERNAL_SHIFT_FROM_R_TO_S + */ + int mode, + struct tree_balance *tb, + int h, int pointer_amount) +{ + struct buffer_info dest_bi, src_bi; + struct buffer_head *cf; + int d_key_position; + + internal_define_dest_src_infos(mode, tb, h, &dest_bi, &src_bi, + &d_key_position, &cf); + + /*printk("pointer_amount = %d\n",pointer_amount); */ + + if (pointer_amount) { + /* + * insert delimiting key from common father of dest and + * src to node dest into position B_NR_ITEMS(dest) + */ + internal_insert_key(&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf, + d_key_position); + + if (B_NR_ITEMS(src_bi.bi_bh) == pointer_amount - 1) { + if (src_bi.bi_position /*src->b_item_order */ == 0) + replace_key(tb, cf, d_key_position, + src_bi.
+ bi_parent /*src->b_parent */ , 0); + } else + replace_key(tb, cf, d_key_position, src_bi.bi_bh, + pointer_amount - 1); + } + /* last parameter is del_par: as many pointers are deleted as copied */ + internal_move_pointers_items(&dest_bi, &src_bi, FIRST_TO_LAST, + pointer_amount, 0); + +} + +/* + * Insert delimiting key to L[h]. + * Copy n node pointers and n - 1 items from buffer S[h] to L[h]. + * Delete n - 1 items and node pointers from buffer S[h]. + * Here n is pointer_amount. Unlike internal_shift_left, the delimiting + * key in CFL[h] is not replaced. + */ +/* it always shifts from S[h] to L[h] */ +static void internal_shift1_left(struct tree_balance *tb, + int h, int pointer_amount) +{ + struct buffer_info dest_bi, src_bi; + struct buffer_head *cf; + int d_key_position; + + internal_define_dest_src_infos(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, + &dest_bi, &src_bi, &d_key_position, &cf); + + /* insert lkey[h]-th key from CFL[h] to left neighbor L[h] */ + if (pointer_amount > 0) + internal_insert_key(&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf, + d_key_position); + + /* last parameter is del_par: one pointer fewer is deleted than copied */ + internal_move_pointers_items(&dest_bi, &src_bi, FIRST_TO_LAST, + pointer_amount, 1); +} + +/* + * Insert d_key'th (delimiting) key from buffer cfr to head of dest. + * Copy n node pointers and n - 1 items from buffer src to buffer dest. + * Replace d_key'th key in buffer cfr. + * Delete n items and node pointers from buffer src.
+ */ +static void internal_shift_right( + /* + * INTERNAL_SHIFT_FROM_S_TO_R | INTERNAL_SHIFT_FROM_L_TO_S + */ + int mode, + struct tree_balance *tb, + int h, int pointer_amount) +{ + struct buffer_info dest_bi, src_bi; + struct buffer_head *cf; + int d_key_position; + int nr; + + internal_define_dest_src_infos(mode, tb, h, &dest_bi, &src_bi, + &d_key_position, &cf); + + nr = B_NR_ITEMS(src_bi.bi_bh); + + if (pointer_amount > 0) { + /* + * insert delimiting key from common father of dest + * and src to dest node into position 0 + */ + internal_insert_key(&dest_bi, 0, cf, d_key_position); + if (nr == pointer_amount - 1) { + RFALSE(src_bi.bi_bh != PATH_H_PBUFFER(tb->tb_path, h) /*tb->S[h] */ || + dest_bi.bi_bh != tb->R[h], + "src (%p) must be == tb->S[h](%p) when it disappears", + src_bi.bi_bh, PATH_H_PBUFFER(tb->tb_path, h)); + /* when S[h] disappears replace left delimiting key as well */ + if (tb->CFL[h]) + replace_key(tb, cf, d_key_position, tb->CFL[h], + tb->lkey[h]); + } else + replace_key(tb, cf, d_key_position, src_bi.bi_bh, + nr - pointer_amount); + } + + /* last parameter is del_par: as many pointers are deleted as copied */ + internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST, + pointer_amount, 0); +} + +/* + * Insert delimiting key to R[h]. + * Copy n node pointers and n - 1 items from buffer S[h] to R[h]. + * Delete n - 1 items and node pointers from buffer S[h].
+ */ +/* it always shifts from S[h] to R[h] */ +static void internal_shift1_right(struct tree_balance *tb, + int h, int pointer_amount) +{ + struct buffer_info dest_bi, src_bi; + struct buffer_head *cf; + int d_key_position; + + internal_define_dest_src_infos(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, + &dest_bi, &src_bi, &d_key_position, &cf); + + /* insert rkey from CFR[h] to right neighbor R[h] */ + if (pointer_amount > 0) + internal_insert_key(&dest_bi, 0, cf, d_key_position); + + /* last parameter is del_par: one pointer fewer is deleted than copied */ + internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST, + pointer_amount, 1); +} + +/* + * Delete -insert_num node pointers (insert_size[h], and so insert_num, is + * negative here) together with their left items and balance current node. + * child_pos is the position of the first pointer to delete. + */ +static void balance_internal_when_delete(struct tree_balance *tb, + int h, int child_pos) +{ + int insert_num; + int n; + struct buffer_head *tbSh = PATH_H_PBUFFER(tb->tb_path, h); + struct buffer_info bi; + + insert_num = tb->insert_size[h] / ((int)(DC_SIZE + KEY_SIZE)); + + /* delete child-node-pointer(s) together with their left item(s) */ + bi.tb = tb; + bi.bi_bh = tbSh; + bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h); + bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1); + + internal_delete_childs(&bi, child_pos, -insert_num); + + RFALSE(tb->blknum[h] > 1, + "tb->blknum[%d]=%d when insert_size < 0", h, tb->blknum[h]); + + n = B_NR_ITEMS(tbSh); + + if (tb->lnum[h] == 0 && tb->rnum[h] == 0) { + if (tb->blknum[h] == 0) { + /* node S[h] (root of the tree) is empty now */ + struct buffer_head *new_root; + + RFALSE(n + || B_FREE_SPACE(tbSh) != + MAX_CHILD_SIZE(tbSh) - DC_SIZE, + "buffer must have only 0 keys (%d)", n); + RFALSE(bi.bi_parent, "root has parent (%p)", + bi.bi_parent); + + /* + * choose a new root: L[h - 1] unless it is + * missing or has no items, R[h - 1] otherwise + */ + if (!tb->L[h - 1] || !B_NR_ITEMS(tb->L[h - 1])) + new_root = tb->R[h - 1]; + else + new_root = tb->L[h - 1]; + /* + * switch super block's tree root block + * number to the new value */ + PUT_SB_ROOT_BLOCK(tb->tb_sb, new_root->b_blocknr); +
/*REISERFS_SB(tb->tb_sb)->s_rs->s_tree_height --; */ + PUT_SB_TREE_HEIGHT(tb->tb_sb, + SB_TREE_HEIGHT(tb->tb_sb) - 1); + + do_balance_mark_sb_dirty(tb, + REISERFS_SB(tb->tb_sb)->s_sbh, + 1); + /*&&&&&&&&&&&&&&&&&&&&&& */ + /* use check_internal if new root is an internal node */ + if (h > 1) + check_internal(new_root); + /*&&&&&&&&&&&&&&&&&&&&&& */ + + /* do what is needed for buffer thrown from tree */ + reiserfs_invalidate_buffer(tb, tbSh); + return; + } + return; + } + + /* join S[h] with L[h] */ + if (tb->L[h] && tb->lnum[h] == -B_NR_ITEMS(tb->L[h]) - 1) { + + RFALSE(tb->rnum[h] != 0, + "invalid tb->rnum[%d]==%d when joining S[h] with L[h]", + h, tb->rnum[h]); + + internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, n + 1); + reiserfs_invalidate_buffer(tb, tbSh); + + return; + } + + /* join S[h] with R[h] */ + if (tb->R[h] && tb->rnum[h] == -B_NR_ITEMS(tb->R[h]) - 1) { + RFALSE(tb->lnum[h] != 0, + "invalid tb->lnum[%d]==%d when joining S[h] with R[h]", + h, tb->lnum[h]); + + internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, n + 1); + + reiserfs_invalidate_buffer(tb, tbSh); + return; + } + + /* borrow from left neighbor L[h] */ + if (tb->lnum[h] < 0) { + RFALSE(tb->rnum[h] != 0, + "wrong tb->rnum[%d]==%d when borrow from L[h]", h, + tb->rnum[h]); + internal_shift_right(INTERNAL_SHIFT_FROM_L_TO_S, tb, h, + -tb->lnum[h]); + return; + } + + /* borrow from right neighbor R[h] */ + if (tb->rnum[h] < 0) { + RFALSE(tb->lnum[h] != 0, + "invalid tb->lnum[%d]==%d when borrow from R[h]", + h, tb->lnum[h]); + internal_shift_left(INTERNAL_SHIFT_FROM_R_TO_S, tb, h, -tb->rnum[h]); /*tb->S[h], tb->CFR[h], tb->rkey[h], tb->R[h], -tb->rnum[h]); */ + return; + } + + /* split S[h] into two parts and put them into neighbors */ + if (tb->lnum[h] > 0) { + RFALSE(tb->rnum[h] == 0 || tb->lnum[h] + tb->rnum[h] != n + 1, + "invalid tb->lnum[%d]==%d or tb->rnum[%d]==%d when S[h](item number == %d) is split between them", + h, tb->lnum[h], h, tb->rnum[h], n); + +
internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h]); /*tb->L[h], tb->CFL[h], tb->lkey[h], tb->S[h], tb->lnum[h]); */ + internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, + tb->rnum[h]); + + reiserfs_invalidate_buffer(tb, tbSh); + + return; + } + reiserfs_panic(tb->tb_sb, "ibalance-2", + "unexpected tb->lnum[%d]==%d or tb->rnum[%d]==%d", + h, tb->lnum[h], h, tb->rnum[h]); +} + +/* + * Replace delimiting key of buffers L[h] and S[h] by the given key. + * Only the first KEY_SIZE bytes of *key are copied; nothing is done when + * S[h] has no items. + */ +static void replace_lkey(struct tree_balance *tb, int h, struct item_head *key) +{ + RFALSE(tb->L[h] == NULL || tb->CFL[h] == NULL, + "L[h](%p) and CFL[h](%p) must exist in replace_lkey", + tb->L[h], tb->CFL[h]); + + if (B_NR_ITEMS(PATH_H_PBUFFER(tb->tb_path, h)) == 0) + return; + + memcpy(internal_key(tb->CFL[h], tb->lkey[h]), key, KEY_SIZE); + + do_balance_mark_internal_dirty(tb, tb->CFL[h], 0); +} + +/* + * Replace delimiting key of buffers S[h] and R[h] by the given key. + * Only the first KEY_SIZE bytes of *key are copied. + */ +static void replace_rkey(struct tree_balance *tb, int h, struct item_head *key) +{ + RFALSE(tb->R[h] == NULL || tb->CFR[h] == NULL, + "R[h](%p) and CFR[h](%p) must exist in replace_rkey", + tb->R[h], tb->CFR[h]); + RFALSE(B_NR_ITEMS(tb->R[h]) == 0, + "R[h] can not be empty if it exists (item number=%d)", + B_NR_ITEMS(tb->R[h])); + + memcpy(internal_key(tb->CFR[h], tb->rkey[h]), key, KEY_SIZE); + + do_balance_mark_internal_dirty(tb, tb->CFR[h], 0); +} + + +/* + * if inserting/pasting { + * child_pos is the position of the node-pointer in S[h] that + * pointed to S[h-1] before balancing of the h-1 level; + * this means that new pointers and items must be inserted AFTER + * child_pos + * } else { + * it is the position of the leftmost pointer that must be deleted + * (together with its corresponding key to the left of the pointer) + * as a result of the previous level's balancing.
+ * } + */ + +int balance_internal(struct tree_balance *tb, + int h, /* level of the tree */ + int child_pos, + /* key for insertion on higher level */ + struct item_head *insert_key, + /* node for insertion on higher level */ + struct buffer_head **insert_ptr) +{ + struct buffer_head *tbSh = PATH_H_PBUFFER(tb->tb_path, h); + struct buffer_info bi; + + /* + * we return this: it is 0 if there is no S[h], + * else it is tb->S[h]->b_item_order + */ + int order; + int insert_num, n, k; + struct buffer_head *S_new; + struct item_head new_insert_key; + struct buffer_head *new_insert_ptr = NULL; + struct item_head *new_insert_key_addr = insert_key; + + RFALSE(h < 1, "h (%d) can not be < 1 on internal level", h); + + PROC_INFO_INC(tb->tb_sb, balance_at[h]); + + order = + (tbSh) ? PATH_H_POSITION(tb->tb_path, + h + 1) /*tb->S[h]->b_item_order */ : 0; + + /* + * Using insert_size[h] calculate the number insert_num of items + * that must be inserted to or deleted from S[h]. + */ + insert_num = tb->insert_size[h] / ((int)(KEY_SIZE + DC_SIZE)); + + /* Check whether insert_num is proper * */ + RFALSE(insert_num < -2 || insert_num > 2, + "incorrect number of items inserted to the internal node (%d)", + insert_num); + RFALSE(h > 1 && (insert_num > 1 || insert_num < -1), + "incorrect number of items (%d) inserted to the internal node on a level (h=%d) higher than last internal level", + insert_num, h); + + /* Make balance in case insert_num < 0 */ + if (insert_num < 0) { + balance_internal_when_delete(tb, h, child_pos); + return order; + } + + k = 0; + if (tb->lnum[h] > 0) { + /* + * shift lnum[h] items from S[h] to the left neighbor L[h]. 
+ * check how many of new items fall into L[h] or CFL[h] after + * shifting + */ + n = B_NR_ITEMS(tb->L[h]); /* number of items in L[h] */ + if (tb->lnum[h] <= child_pos) { + /* new items don't fall into L[h] or CFL[h] */ + internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, + tb->lnum[h]); + child_pos -= tb->lnum[h]; + } else if (tb->lnum[h] > child_pos + insert_num) { + /* all new items fall into L[h] */ + internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, + tb->lnum[h] - insert_num); + /* insert insert_num keys and node-pointers into L[h] */ + bi.tb = tb; + bi.bi_bh = tb->L[h]; + bi.bi_parent = tb->FL[h]; + bi.bi_position = get_left_neighbor_position(tb, h); + internal_insert_childs(&bi, + /*tb->L[h], tb->S[h-1]->b_next */ + n + child_pos + 1, + insert_num, insert_key, + insert_ptr); + + insert_num = 0; + } else { + struct disk_child *dc; + + /* + * some items fall into L[h] or CFL[h], + * but some don't fall + */ + internal_shift1_left(tb, h, child_pos + 1); + /* calculate number of new items that fall into L[h] */ + k = tb->lnum[h] - child_pos - 1; + bi.tb = tb; + bi.bi_bh = tb->L[h]; + bi.bi_parent = tb->FL[h]; + bi.bi_position = get_left_neighbor_position(tb, h); + internal_insert_childs(&bi, + /*tb->L[h], tb->S[h-1]->b_next, */ + n + child_pos + 1, k, + insert_key, insert_ptr); + + replace_lkey(tb, h, insert_key + k); + + /* + * replace the first node-ptr in S[h] by + * node-ptr to insert_ptr[k] + */ + dc = B_N_CHILD(tbSh, 0); + put_dc_size(dc, + MAX_CHILD_SIZE(insert_ptr[k]) - + B_FREE_SPACE(insert_ptr[k])); + put_dc_block_number(dc, insert_ptr[k]->b_blocknr); + + do_balance_mark_internal_dirty(tb, tbSh, 0); + + k++; + insert_key += k; + insert_ptr += k; + insert_num -= k; + child_pos = 0; + } + } + /* tb->lnum[h] > 0 */ + if (tb->rnum[h] > 0) { + /*shift rnum[h] items from S[h] to the right neighbor R[h] */ + /* + * check how many of new items fall into R or CFR + * after shifting + */ + n = B_NR_ITEMS(tbSh); /* number of items in S[h] */ + if (n - 
tb->rnum[h] >= child_pos) + /* new items fall into S[h] */ + internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, + tb->rnum[h]); + else if (n + insert_num - tb->rnum[h] < child_pos) { + /* all new items fall into R[h] */ + internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, + tb->rnum[h] - insert_num); + + /* insert insert_num keys and node-pointers into R[h] */ + bi.tb = tb; + bi.bi_bh = tb->R[h]; + bi.bi_parent = tb->FR[h]; + bi.bi_position = get_right_neighbor_position(tb, h); + internal_insert_childs(&bi, + /*tb->R[h],tb->S[h-1]->b_next */ + child_pos - n - insert_num + + tb->rnum[h] - 1, + insert_num, insert_key, + insert_ptr); + insert_num = 0; + } else { + struct disk_child *dc; + + /* one of the items falls into CFR[h] */ + internal_shift1_right(tb, h, n - child_pos + 1); + /* calculate number of new items that fall into R[h] */ + k = tb->rnum[h] - n + child_pos - 1; + bi.tb = tb; + bi.bi_bh = tb->R[h]; + bi.bi_parent = tb->FR[h]; + bi.bi_position = get_right_neighbor_position(tb, h); + internal_insert_childs(&bi, + /*tb->R[h], tb->R[h]->b_child, */ + 0, k, insert_key + 1, + insert_ptr + 1); + + replace_rkey(tb, h, insert_key + insert_num - k - 1); + + /* + * replace the first node-ptr in R[h] by + * node-ptr insert_ptr[insert_num-k-1] + */ + dc = B_N_CHILD(tb->R[h], 0); + put_dc_size(dc, + MAX_CHILD_SIZE(insert_ptr + [insert_num - k - 1]) - + B_FREE_SPACE(insert_ptr + [insert_num - k - 1])); + put_dc_block_number(dc, + insert_ptr[insert_num - k - + 1]->b_blocknr); + + do_balance_mark_internal_dirty(tb, tb->R[h], 0); + + insert_num -= (k + 1); + } + } + + /** Fill new node that appears instead of S[h] **/ + RFALSE(tb->blknum[h] > 2, "blknum can not be > 2 for internal level"); + RFALSE(tb->blknum[h] < 0, "blknum can not be < 0"); + + if (!tb->blknum[h]) { /* node S[h] is empty now */ + RFALSE(!tbSh, "S[h] is equal NULL"); + + /* do what is needed for buffer thrown from tree */ + reiserfs_invalidate_buffer(tb, tbSh); + return order; + } + + if 
(!tbSh) { + /* create new root */ + struct disk_child *dc; + struct buffer_head *tbSh_1 = PATH_H_PBUFFER(tb->tb_path, h - 1); + struct block_head *blkh; + + if (tb->blknum[h] != 1) + reiserfs_panic(NULL, "ibalance-3", "One new node " + "required for creating the new root"); + /* S[h] = empty buffer from the list FEB. */ + tbSh = get_FEB(tb); + blkh = B_BLK_HEAD(tbSh); + set_blkh_level(blkh, h + 1); + + /* Put the unique node-pointer to S[h] that points to S[h-1]. */ + + dc = B_N_CHILD(tbSh, 0); + put_dc_block_number(dc, tbSh_1->b_blocknr); + put_dc_size(dc, + (MAX_CHILD_SIZE(tbSh_1) - B_FREE_SPACE(tbSh_1))); + + tb->insert_size[h] -= DC_SIZE; + set_blkh_free_space(blkh, blkh_free_space(blkh) - DC_SIZE); + + do_balance_mark_internal_dirty(tb, tbSh, 0); + + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + check_internal(tbSh); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + + /* put new root into path structure */ + PATH_OFFSET_PBUFFER(tb->tb_path, ILLEGAL_PATH_ELEMENT_OFFSET) = + tbSh; + + /* Change root in structure super block. 
*/ + PUT_SB_ROOT_BLOCK(tb->tb_sb, tbSh->b_blocknr); + PUT_SB_TREE_HEIGHT(tb->tb_sb, SB_TREE_HEIGHT(tb->tb_sb) + 1); + do_balance_mark_sb_dirty(tb, REISERFS_SB(tb->tb_sb)->s_sbh, 1); + } + + if (tb->blknum[h] == 2) { + int snum; + struct buffer_info dest_bi, src_bi; + + /* S_new = free buffer from list FEB */ + S_new = get_FEB(tb); + + set_blkh_level(B_BLK_HEAD(S_new), h + 1); + + dest_bi.tb = tb; + dest_bi.bi_bh = S_new; + dest_bi.bi_parent = NULL; + dest_bi.bi_position = 0; + src_bi.tb = tb; + src_bi.bi_bh = tbSh; + src_bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h); + src_bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1); + + n = B_NR_ITEMS(tbSh); /* number of items in S[h] */ + snum = (insert_num + n + 1) / 2; + if (n - snum >= child_pos) { + /* new items don't fall into S_new */ + /* store the delimiting key for the next level */ + /* new_insert_key = (n - snum)'th key in S[h] */ + memcpy(&new_insert_key, internal_key(tbSh, n - snum), + KEY_SIZE); + /* last parameter is del_par */ + internal_move_pointers_items(&dest_bi, &src_bi, + LAST_TO_FIRST, snum, 0); + } else if (n + insert_num - snum < child_pos) { + /* all new items fall into S_new */ + /* store the delimiting key for the next level */ + /* + * new_insert_key = (n + insert_item - snum)'th + * key in S[h] + */ + memcpy(&new_insert_key, + internal_key(tbSh, n + insert_num - snum), + KEY_SIZE); + /* last parameter is del_par */ + internal_move_pointers_items(&dest_bi, &src_bi, + LAST_TO_FIRST, + snum - insert_num, 0); + + /* + * insert insert_num keys and node-pointers + * into S_new + */ + internal_insert_childs(&dest_bi, + /*S_new,tb->S[h-1]->b_next, */ + child_pos - n - insert_num + + snum - 1, + insert_num, insert_key, + insert_ptr); + + insert_num = 0; + } else { + struct disk_child *dc; + + /* some items fall into S_new, but some don't fall */ + /* last parameter is del_par */ + internal_move_pointers_items(&dest_bi, &src_bi, + LAST_TO_FIRST, + n - child_pos + 1, 1); + /* calculate number of new 
items that fall into S_new */ + k = snum - n + child_pos - 1; + + internal_insert_childs(&dest_bi, /*S_new, */ 0, k, + insert_key + 1, insert_ptr + 1); + + /* new_insert_key = insert_key[insert_num - k - 1] */ + memcpy(&new_insert_key, insert_key + insert_num - k - 1, + KEY_SIZE); + /* + * replace first node-ptr in S_new by node-ptr + * to insert_ptr[insert_num-k-1] + */ + + dc = B_N_CHILD(S_new, 0); + put_dc_size(dc, + (MAX_CHILD_SIZE + (insert_ptr[insert_num - k - 1]) - + B_FREE_SPACE(insert_ptr + [insert_num - k - 1]))); + put_dc_block_number(dc, + insert_ptr[insert_num - k - + 1]->b_blocknr); + + do_balance_mark_internal_dirty(tb, S_new, 0); + + insert_num -= (k + 1); + } + /* new_insert_ptr = node_pointer to S_new */ + new_insert_ptr = S_new; + + RFALSE(!buffer_journaled(S_new) || buffer_journal_dirty(S_new) + || buffer_dirty(S_new), "cm-00001: bad S_new (%b)", + S_new); + + /* S_new is released in unfix_nodes */ + } + + n = B_NR_ITEMS(tbSh); /*number of items in S[h] */ + + if (0 <= child_pos && child_pos <= n && insert_num > 0) { + bi.tb = tb; + bi.bi_bh = tbSh; + bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h); + bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1); + internal_insert_childs(&bi, /*tbSh, */ + /* ( tb->S[h-1]->b_parent == tb->S[h] ) ? 
tb->S[h-1]->b_next : tb->S[h]->b_child->b_next, */ + child_pos, insert_num, insert_key, + insert_ptr); + } + + memcpy(new_insert_key_addr, &new_insert_key, KEY_SIZE); + insert_ptr[0] = new_insert_ptr; + + return order; +} diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c new file mode 100644 index 000000000..f6f2fbad9 --- /dev/null +++ b/fs/reiserfs/inode.c @@ -0,0 +1,3461 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include +#include +#include "reiserfs.h" +#include "acl.h" +#include "xattr.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int reiserfs_commit_write(struct file *f, struct page *page, + unsigned from, unsigned to); + +void reiserfs_evict_inode(struct inode *inode) +{ + /* + * We need blocks for transaction + (user+group) quota + * update (possibly delete) + */ + int jbegin_count = + JOURNAL_PER_BALANCE_CNT * 2 + + 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb); + struct reiserfs_transaction_handle th; + int err; + + if (!inode->i_nlink && !is_bad_inode(inode)) + dquot_initialize(inode); + + truncate_inode_pages_final(&inode->i_data); + if (inode->i_nlink) + goto no_delete; + + /* + * The = 0 happens when we abort creating a new inode + * for some reason like lack of space.. + * also handles bad_inode case + */ + if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { + + reiserfs_delete_xattrs(inode); + + reiserfs_write_lock(inode->i_sb); + + if (journal_begin(&th, inode->i_sb, jbegin_count)) + goto out; + reiserfs_update_inode_transaction(inode); + + reiserfs_discard_prealloc(&th, inode); + + err = reiserfs_delete_object(&th, inode); + + /* + * Do quota update inside a transaction for journaled quotas. 
+ * We must do that after delete_object so that quota updates + * go into the same transaction as stat data deletion + */ + if (!err) { + int depth = reiserfs_write_unlock_nested(inode->i_sb); + dquot_free_inode(inode); + reiserfs_write_lock_nested(inode->i_sb, depth); + } + + if (journal_end(&th)) + goto out; + + /* + * check return value from reiserfs_delete_object after + * ending the transaction + */ + if (err) + goto out; + + /* + * all items of file are deleted, so we can remove + * "save" link + * we can't do anything about an error here + */ + remove_save_link(inode, 0 /* not truncate */); +out: + reiserfs_write_unlock(inode->i_sb); + } else { + /* no object items are in the tree */ + ; + } + + /* note this must go after the journal_end to prevent deadlock */ + clear_inode(inode); + + dquot_drop(inode); + inode->i_blocks = 0; + return; + +no_delete: + clear_inode(inode); + dquot_drop(inode); +} + +static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid, + __u32 objectid, loff_t offset, int type, int length) +{ + key->version = version; + + key->on_disk_key.k_dir_id = dirid; + key->on_disk_key.k_objectid = objectid; + set_cpu_key_k_offset(key, offset); + set_cpu_key_k_type(key, type); + key->key_length = length; +} + +/* + * take base of inode_key (it comes from inode always) (dirid, objectid) + * and version from an inode, set offset and type of key + */ +void make_cpu_key(struct cpu_key *key, struct inode *inode, loff_t offset, + int type, int length) +{ + _make_cpu_key(key, get_inode_item_key_version(inode), + le32_to_cpu(INODE_PKEY(inode)->k_dir_id), + le32_to_cpu(INODE_PKEY(inode)->k_objectid), offset, type, + length); +} + +/* when key is 0, do not set version and short key */ +inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key, + int version, + loff_t offset, int type, int length, + int entry_count /*or ih_free_space */ ) +{ + if (key) { + ih->ih_key.k_dir_id = cpu_to_le32(key->on_disk_key.k_dir_id); + 
ih->ih_key.k_objectid = + cpu_to_le32(key->on_disk_key.k_objectid); + } + put_ih_version(ih, version); + set_le_ih_k_offset(ih, offset); + set_le_ih_k_type(ih, type); + put_ih_item_len(ih, length); + /* set_ih_free_space (ih, 0); */ + /* + * for directory items it is entry count, for directs and stat + * datas - 0xffff, for indirects - 0 + */ + put_ih_entry_count(ih, entry_count); +} + +/* + * FIXME: we might cache recently accessed indirect item + * Ugh. Not too eager for that.... + * I cut the code until such time as I see a convincing argument (benchmark). + * I don't want a bloated inode struct..., and I don't like code complexity.... + */ + +/* + * cutting the code is fine, since it really isn't in use yet and is easy + * to add back in. But, Vladimir has a really good idea here. Think + * about what happens for reading a file. For each page, + * The VFS layer calls reiserfs_readpage, who searches the tree to find + * an indirect item. This indirect item has X number of pointers, where + * X is a big number if we've done the block allocation right. But, + * we only use one or two of these pointers during each call to readpage, + * needlessly researching again later on. + * + * The size of the cache could be dynamic based on the size of the file. + * + * I'd also like to see us cache the location the stat data item, since + * we are needlessly researching for that frequently. + * + * --chris + */ + +/* + * If this page has a file tail in it, and + * it was read in by get_block_create_0, the page data is valid, + * but tail is still sitting in a direct item, and we can't write to + * it. So, look through this page, and check all the mapped buffers + * to make sure they have valid block numbers. 
Any that don't need + * to be unmapped, so that __block_write_begin will correctly call + * reiserfs_get_block to convert the tail into an unformatted node + */ +static inline void fix_tail_page_for_writing(struct page *page) +{ + struct buffer_head *head, *next, *bh; + + if (page && page_has_buffers(page)) { + head = page_buffers(page); + bh = head; + do { + next = bh->b_this_page; + if (buffer_mapped(bh) && bh->b_blocknr == 0) { + reiserfs_unmap_buffer(bh); + } + bh = next; + } while (bh != head); + } +} + +/* + * reiserfs_get_block does not need to allocate a block only if it has been + * done already or non-hole position has been found in the indirect item + */ +static inline int allocation_needed(int retval, b_blocknr_t allocated, + struct item_head *ih, + __le32 * item, int pos_in_item) +{ + if (allocated) + return 0; + if (retval == POSITION_FOUND && is_indirect_le_ih(ih) && + get_block_num(item, pos_in_item)) + return 0; + return 1; +} + +static inline int indirect_item_found(int retval, struct item_head *ih) +{ + return (retval == POSITION_FOUND) && is_indirect_le_ih(ih); +} + +static inline void set_block_dev_mapped(struct buffer_head *bh, + b_blocknr_t block, struct inode *inode) +{ + map_bh(bh, inode->i_sb, block); +} + +/* + * files which were created in the earlier version can not be longer, + * than 2 gb + */ +static int file_capable(struct inode *inode, sector_t block) +{ + /* it is new file. 
*/ + if (get_inode_item_key_version(inode) != KEY_FORMAT_3_5 || + /* old file, but 'block' is inside of 2gb */ + block < (1 << (31 - inode->i_sb->s_blocksize_bits))) + return 1; + + return 0; +} + +static int restart_transaction(struct reiserfs_transaction_handle *th, + struct inode *inode, struct treepath *path) +{ + struct super_block *s = th->t_super; + int err; + + BUG_ON(!th->t_trans_id); + BUG_ON(!th->t_refcount); + + pathrelse(path); + + /* we cannot restart while nested */ + if (th->t_refcount > 1) { + return 0; + } + reiserfs_update_sd(th, inode); + err = journal_end(th); + if (!err) { + err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6); + if (!err) + reiserfs_update_inode_transaction(inode); + } + return err; +} + +/* + * it is called by get_block when create == 0. Returns block number + * for 'block'-th logical block of file. When it hits direct item it + * returns 0 (being called from bmap) or read direct item into piece + * of page (bh_result) + * Please improve the english/clarity in the comment above, as it is + * hard to understand. + */ +static int _get_block_create_0(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int args) +{ + INITIALIZE_PATH(path); + struct cpu_key key; + struct buffer_head *bh; + struct item_head *ih, tmp_ih; + b_blocknr_t blocknr; + char *p = NULL; + int chars; + int ret; + int result; + int done = 0; + unsigned long offset; + + /* prepare the key to look for the 'block'-th block of file */ + make_cpu_key(&key, inode, + (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY, + 3); + + result = search_for_position_by_key(inode->i_sb, &key, &path); + if (result != POSITION_FOUND) { + pathrelse(&path); + if (p) + kunmap(bh_result->b_page); + if (result == IO_ERROR) + return -EIO; + /* + * We do not return -ENOENT if there is a hole but page is + * uptodate, because it means that there is some MMAPED data + * associated with it that is yet to be written to disk. 
+ */ + if ((args & GET_BLOCK_NO_HOLE) + && !PageUptodate(bh_result->b_page)) { + return -ENOENT; + } + return 0; + } + + bh = get_last_bh(&path); + ih = tp_item_head(&path); + if (is_indirect_le_ih(ih)) { + __le32 *ind_item = (__le32 *) ih_item_body(bh, ih); + + /* + * FIXME: here we could cache indirect item or part of it in + * the inode to avoid search_by_key in case of subsequent + * access to file + */ + blocknr = get_block_num(ind_item, path.pos_in_item); + ret = 0; + if (blocknr) { + map_bh(bh_result, inode->i_sb, blocknr); + if (path.pos_in_item == + ((ih_item_len(ih) / UNFM_P_SIZE) - 1)) { + set_buffer_boundary(bh_result); + } + } else + /* + * We do not return -ENOENT if there is a hole but + * page is uptodate, because it means that there is + * some MMAPED data associated with it that is + * yet to be written to disk. + */ + if ((args & GET_BLOCK_NO_HOLE) + && !PageUptodate(bh_result->b_page)) { + ret = -ENOENT; + } + + pathrelse(&path); + if (p) + kunmap(bh_result->b_page); + return ret; + } + /* requested data are in direct item(s) */ + if (!(args & GET_BLOCK_READ_DIRECT)) { + /* + * we are called by bmap. FIXME: we can not map block of file + * when it is stored in direct item(s) + */ + pathrelse(&path); + if (p) + kunmap(bh_result->b_page); + return -ENOENT; + } + + /* + * if we've got a direct item, and the buffer or page was uptodate, + * we don't want to pull data off disk again. skip to the + * end, where we map the buffer and return + */ + if (buffer_uptodate(bh_result)) { + goto finished; + } else + /* + * grab_tail_page can trigger calls to reiserfs_get_block on + * up to date pages without any buffers. If the page is up + * to date, we don't want read old data off disk. 
Set the up + * to date bit on the buffer instead and jump to the end + */ + if (!bh_result->b_page || PageUptodate(bh_result->b_page)) { + set_buffer_uptodate(bh_result); + goto finished; + } + /* read file tail into part of page */ + offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1); + copy_item_head(&tmp_ih, ih); + + /* + * we only want to kmap if we are reading the tail into the page. + * this is not the common case, so we don't kmap until we are + * sure we need to. But, this means the item might move if + * kmap schedules + */ + if (!p) + p = (char *)kmap(bh_result->b_page); + + p += offset; + memset(p, 0, inode->i_sb->s_blocksize); + do { + if (!is_direct_le_ih(ih)) { + BUG(); + } + /* + * make sure we don't read more bytes than actually exist in + * the file. This can happen in odd cases where i_size isn't + * correct, and when direct item padding results in a few + * extra bytes at the end of the direct item + */ + if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size) + break; + if ((le_ih_k_offset(ih) - 1 + ih_item_len(ih)) > inode->i_size) { + chars = + inode->i_size - (le_ih_k_offset(ih) - 1) - + path.pos_in_item; + done = 1; + } else { + chars = ih_item_len(ih) - path.pos_in_item; + } + memcpy(p, ih_item_body(bh, ih) + path.pos_in_item, chars); + + if (done) + break; + + p += chars; + + /* + * we done, if read direct item is not the last item of + * node FIXME: we could try to check right delimiting key + * to see whether direct item continues in the right + * neighbor or rely on i_size + */ + if (PATH_LAST_POSITION(&path) != (B_NR_ITEMS(bh) - 1)) + break; + + /* update key to look for the next piece */ + set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + chars); + result = search_for_position_by_key(inode->i_sb, &key, &path); + if (result != POSITION_FOUND) + /* i/o error most likely */ + break; + bh = get_last_bh(&path); + ih = tp_item_head(&path); + } while (1); + + flush_dcache_page(bh_result->b_page); + 
kunmap(bh_result->b_page); + +finished: + pathrelse(&path); + + if (result == IO_ERROR) + return -EIO; + + /* + * this buffer has valid data, but isn't valid for io. mapping it to + * block #0 tells the rest of reiserfs it just has a tail in it + */ + map_bh(bh_result, inode->i_sb, 0); + set_buffer_uptodate(bh_result); + return 0; +} + +/* + * this is called to create file map. So, _get_block_create_0 will not + * read direct item + */ +static int reiserfs_bmap(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create) +{ + if (!file_capable(inode, block)) + return -EFBIG; + + reiserfs_write_lock(inode->i_sb); + /* do not read the direct item */ + _get_block_create_0(inode, block, bh_result, 0); + reiserfs_write_unlock(inode->i_sb); + return 0; +} + +/* + * special version of get_block that is only used by grab_tail_page right + * now. It is sent to __block_write_begin, and when you try to get a + * block past the end of the file (or a block from a hole) it returns + * -ENOENT instead of a valid buffer. __block_write_begin expects to + * be able to do i/o on the buffers returned, unless an error value + * is also returned. + * + * So, this allows __block_write_begin to be used for reading a single block + * in a page. Where it does not produce a valid page for holes, or past the + * end of the file. This turns out to be exactly what we need for reading + * tails for conversion. + * + * The point of the wrapper is forcing a certain value for create, even + * though the VFS layer is calling this function with create==1. If you + * don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block, + * don't use this function. +*/ +static int reiserfs_get_block_create_0(struct inode *inode, sector_t block, + struct buffer_head *bh_result, + int create) +{ + return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE); +} + +/* + * This is special helper for reiserfs_get_block in case we are executing + * direct_IO request. 
+ */ +static int reiserfs_get_blocks_direct_io(struct inode *inode, + sector_t iblock, + struct buffer_head *bh_result, + int create) +{ + int ret; + + bh_result->b_page = NULL; + + /* + * We set the b_size before reiserfs_get_block call since it is + * referenced in convert_tail_for_hole() that may be called from + * reiserfs_get_block() + */ + bh_result->b_size = (1 << inode->i_blkbits); + + ret = reiserfs_get_block(inode, iblock, bh_result, + create | GET_BLOCK_NO_DANGLE); + if (ret) + goto out; + + /* don't allow direct io onto tail pages */ + if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) { + /* + * make sure future calls to the direct io funcs for this + * offset in the file fail by unmapping the buffer + */ + clear_buffer_mapped(bh_result); + ret = -EINVAL; + } + + /* + * Possible unpacked tail. Flush the data before pages have + * disappeared + */ + if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) { + int err; + + reiserfs_write_lock(inode->i_sb); + + err = reiserfs_commit_for_inode(inode); + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; + + reiserfs_write_unlock(inode->i_sb); + + if (err < 0) + ret = err; + } +out: + return ret; +} + +/* + * helper function for when reiserfs_get_block is called for a hole + * but the file tail is still in a direct item + * bh_result is the buffer head for the hole + * tail_offset is the offset of the start of the tail in the file + * + * This calls prepare_write, which will start a new transaction + * you should not be in a transaction, or have any paths held when you + * call this. 
+ */ +static int convert_tail_for_hole(struct inode *inode, + struct buffer_head *bh_result, + loff_t tail_offset) +{ + unsigned long index; + unsigned long tail_end; + unsigned long tail_start; + struct page *tail_page; + struct page *hole_page = bh_result->b_page; + int retval = 0; + + if ((tail_offset & (bh_result->b_size - 1)) != 1) + return -EIO; + + /* always try to read until the end of the block */ + tail_start = tail_offset & (PAGE_CACHE_SIZE - 1); + tail_end = (tail_start | (bh_result->b_size - 1)) + 1; + + index = tail_offset >> PAGE_CACHE_SHIFT; + /* + * hole_page can be zero in case of direct_io, we are sure + * that we cannot get here if we write with O_DIRECT into tail page + */ + if (!hole_page || index != hole_page->index) { + tail_page = grab_cache_page(inode->i_mapping, index); + retval = -ENOMEM; + if (!tail_page) { + goto out; + } + } else { + tail_page = hole_page; + } + + /* + * we don't have to make sure the conversion did not happen while + * we were locking the page because anyone that could convert + * must first take i_mutex. + * + * We must fix the tail page for writing because it might have buffers + * that are mapped, but have a block number of 0. This indicates tail + * data that has been read directly into the page, and + * __block_write_begin won't trigger a get_block in this case. 
+ */ + fix_tail_page_for_writing(tail_page); + retval = __reiserfs_write_begin(tail_page, tail_start, + tail_end - tail_start); + if (retval) + goto unlock; + + /* tail conversion might change the data in the page */ + flush_dcache_page(tail_page); + + retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end); + +unlock: + if (tail_page != hole_page) { + unlock_page(tail_page); + page_cache_release(tail_page); + } +out: + return retval; +} + +/* + * Allocate a block for an unformatted node of @inode and hand back the + * allocator's return value unchanged. When REISERFS_PREALLOCATE is + * defined and GET_BLOCK_NO_IMUX is clear in @flags the request goes to + * reiserfs_new_unf_blocknrs2(), otherwise to reiserfs_new_unf_blocknrs(). + * The handle must have a non-zero t_trans_id (enforced by BUG_ON). + */ +static inline int _allocate_block(struct reiserfs_transaction_handle *th, + sector_t block, + struct inode *inode, + b_blocknr_t * allocated_block_nr, + struct treepath *path, int flags) +{ + BUG_ON(!th->t_trans_id); + +#ifdef REISERFS_PREALLOCATE + if (!(flags & GET_BLOCK_NO_IMUX)) { + return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr, + path, block); + } +#endif + return reiserfs_new_unf_blocknrs(th, inode, allocated_block_nr, path, + block); +} + +/* + * Map logical block @block of @inode to a disk block and record the + * mapping in @bh_result. + * + * Without GET_BLOCK_CREATE in @create this is a pure lookup done by + * _get_block_create_0() and no transaction is started. With it, a + * missing block is allocated: a zero pointer (hole) in an indirect item + * is plugged, a direct (tail) item is converted to an unformatted node, + * or the file is extended with hole pointers up to @block. + * + * Returns 0 on success or a negative errno (-EFBIG, -ENOMEM, -EIO, + * -ENOSPC, -EDQUOT, -EEXIST, or whatever a helper returned). + */ +int reiserfs_get_block(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create) +{ + int repeat, retval = 0; + /* b_blocknr_t is (unsigned) 32 bit int*/ + b_blocknr_t allocated_block_nr = 0; + INITIALIZE_PATH(path); + int pos_in_item; + struct cpu_key key; + struct buffer_head *bh, *unbh = NULL; + struct item_head *ih, tmp_ih; + __le32 *item; + int done; + int fs_gen; + struct reiserfs_transaction_handle *th = NULL; + /* + * space reserved in transaction batch: + * . 3 balancings in direct->indirect conversion + * . 1 block involved into reiserfs_update_sd() + * XXX in practically impossible worst case direct2indirect() + * can incur (much) more than 3 balancings. 
+ * quota update for user, group + */ + int jbegin_count = + JOURNAL_PER_BALANCE_CNT * 3 + 1 + + 2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb); + int version; + int dangle = 1; + loff_t new_offset = + (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1; + + reiserfs_write_lock(inode->i_sb); + version = get_inode_item_key_version(inode); + + if (!file_capable(inode, block)) { + reiserfs_write_unlock(inode->i_sb); + return -EFBIG; + } + + /* + * if !create, we aren't changing the FS, so we don't need to + * log anything, so we don't need to start a transaction + */ + if (!(create & GET_BLOCK_CREATE)) { + int ret; + /* find number of block-th logical block of the file */ + ret = _get_block_create_0(inode, block, bh_result, + create | GET_BLOCK_READ_DIRECT); + reiserfs_write_unlock(inode->i_sb); + return ret; + } + + /* + * if we're already in a transaction, make sure to close + * any new transactions we start in this func + */ + if ((create & GET_BLOCK_NO_DANGLE) || + reiserfs_transaction_running(inode->i_sb)) + dangle = 0; + + /* + * If file is of such a size, that it might have a tail and + * tails are enabled we should mark it as possibly needing + * tail packing on close + */ + if ((have_large_tails(inode->i_sb) + && inode->i_size < i_block_size(inode) * 4) + || (have_small_tails(inode->i_sb) + && inode->i_size < i_block_size(inode))) + REISERFS_I(inode)->i_flags |= i_pack_on_close_mask; + + /* set the key of the first byte in the 'block'-th block of file */ + make_cpu_key(&key, inode, new_offset, TYPE_ANY, 3 /*key length */ ); + if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) { +start_trans: + th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count); + if (!th) { + retval = -ENOMEM; + goto failure; + } + reiserfs_update_inode_transaction(inode); + } +research: + + retval = search_for_position_by_key(inode->i_sb, &key, &path); + if (retval == IO_ERROR) { + retval = -EIO; + goto failure; + } + + bh = get_last_bh(&path); + ih = 
tp_item_head(&path); + item = tp_item_body(&path); + pos_in_item = path.pos_in_item; + + fs_gen = get_generation(inode->i_sb); + copy_item_head(&tmp_ih, ih); + + if (allocation_needed + (retval, allocated_block_nr, ih, item, pos_in_item)) { + /* we have to allocate block for the unformatted node */ + if (!th) { + pathrelse(&path); + goto start_trans; + } + + repeat = + _allocate_block(th, block, inode, &allocated_block_nr, + &path, create); + + /* + * restart the transaction to give the journal a chance to free + * some blocks. releases the path, so we have to go back to + * research if we succeed on the second try + */ + if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) { + SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1; + retval = restart_transaction(th, inode, &path); + if (retval) + goto failure; + repeat = + _allocate_block(th, block, inode, + &allocated_block_nr, NULL, create); + + if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) { + goto research; + } + if (repeat == QUOTA_EXCEEDED) + retval = -EDQUOT; + else + retval = -ENOSPC; + goto failure; + } + + if (fs_changed(fs_gen, inode->i_sb) + && item_moved(&tmp_ih, &path)) { + goto research; + } + } + + if (indirect_item_found(retval, ih)) { + b_blocknr_t unfm_ptr; + /* + * 'block'-th block is in the file already (there is + * corresponding cell in some indirect item). 
But it may be + * zero unformatted node pointer (hole) + */ + unfm_ptr = get_block_num(item, pos_in_item); + if (unfm_ptr == 0) { + /* use allocated block to plug the hole */ + reiserfs_prepare_for_journal(inode->i_sb, bh, 1); + if (fs_changed(fs_gen, inode->i_sb) + && item_moved(&tmp_ih, &path)) { + reiserfs_restore_prepared_buffer(inode->i_sb, + bh); + goto research; + } + set_buffer_new(bh_result); + if (buffer_dirty(bh_result) + && reiserfs_data_ordered(inode->i_sb)) + reiserfs_add_ordered_list(inode, bh_result); + put_block_num(item, pos_in_item, allocated_block_nr); + unfm_ptr = allocated_block_nr; + journal_mark_dirty(th, bh); + reiserfs_update_sd(th, inode); + } + set_block_dev_mapped(bh_result, unfm_ptr, inode); + pathrelse(&path); + retval = 0; + if (!dangle && th) + retval = reiserfs_end_persistent_transaction(th); + + reiserfs_write_unlock(inode->i_sb); + + /* + * the item was found, so new blocks were not added to the file + * there is no need to make sure the inode is updated with this + * transaction + */ + return retval; + } + + if (!th) { + pathrelse(&path); + goto start_trans; + } + + /* + * desired position is not found or is in the direct item. We have + * to append file with holes up to 'block'-th block converting + * direct items to indirect one if necessary + */ + done = 0; + do { + if (is_statdata_le_ih(ih)) { + __le32 unp = 0; + struct cpu_key tmp_key; + + /* indirect item has to be inserted */ + make_le_item_head(&tmp_ih, &key, version, 1, + TYPE_INDIRECT, UNFM_P_SIZE, + 0 /* free_space */ ); + + /* + * we are going to add 'block'-th block to the file. 
+ * Use allocated block for that + */ + if (cpu_key_k_offset(&key) == 1) { + unp = cpu_to_le32(allocated_block_nr); + set_block_dev_mapped(bh_result, + allocated_block_nr, inode); + set_buffer_new(bh_result); + done = 1; + } + tmp_key = key; /* ;) */ + set_cpu_key_k_offset(&tmp_key, 1); + PATH_LAST_POSITION(&path)++; + + retval = + reiserfs_insert_item(th, &path, &tmp_key, &tmp_ih, + inode, (char *)&unp); + if (retval) { + reiserfs_free_block(th, inode, + allocated_block_nr, 1); + /* + * retval == -ENOSPC, -EDQUOT or -EIO + * or -EEXIST + */ + goto failure; + } + } else if (is_direct_le_ih(ih)) { + /* direct item has to be converted */ + loff_t tail_offset; + + tail_offset = + ((le_ih_k_offset(ih) - + 1) & ~(inode->i_sb->s_blocksize - 1)) + 1; + + /* + * direct item we just found fits into block we have + * to map. Convert it into unformatted node: use + * bh_result for the conversion + */ + if (tail_offset == cpu_key_k_offset(&key)) { + set_block_dev_mapped(bh_result, + allocated_block_nr, inode); + unbh = bh_result; + done = 1; + } else { + /* + * we have to pad file tail stored in direct + * item(s) up to block size and convert it + * to unformatted node. 
FIXME: this should + * also get into page cache + */ + + pathrelse(&path); + /* + * ugly, but we can only end the transaction if + * we aren't nested + */ + BUG_ON(!th->t_refcount); + if (th->t_refcount == 1) { + retval = + reiserfs_end_persistent_transaction + (th); + th = NULL; + if (retval) + goto failure; + } + + retval = + convert_tail_for_hole(inode, bh_result, + tail_offset); + if (retval) { + if (retval != -ENOSPC) + reiserfs_error(inode->i_sb, + "clm-6004", + "convert tail failed " + "inode %lu, error %d", + inode->i_ino, + retval); + if (allocated_block_nr) { + /* + * the bitmap, the super, + * and the stat data == 3 + */ + if (!th) + th = reiserfs_persistent_transaction(inode->i_sb, 3); + if (th) + reiserfs_free_block(th, + inode, + allocated_block_nr, + 1); + } + goto failure; + } + goto research; + } + retval = + direct2indirect(th, inode, &path, unbh, + tail_offset); + if (retval) { + reiserfs_unmap_buffer(unbh); + reiserfs_free_block(th, inode, + allocated_block_nr, 1); + goto failure; + } + /* + * it is important the set_buffer_uptodate is done + * after the direct2indirect. The buffer might + * contain valid data newer than the data on disk + * (read by readpage, changed, and then sent here by + * writepage). direct2indirect needs to know if unbh + * was already up to date, so it can decide if the + * data in unbh needs to be replaced with data from + * the disk + */ + set_buffer_uptodate(unbh); + + /* + * unbh->b_page == NULL in case of DIRECT_IO request, + * this means buffer will disappear shortly, so it + * should not be added to + */ + if (unbh->b_page) { + /* + * we've converted the tail, so we must + * flush unbh before the transaction commits + */ + reiserfs_add_tail_list(inode, unbh); + + /* + * mark it dirty now to prevent commit_write + * from adding this buffer to the inode's + * dirty buffer list + */ + /* + * AKPM: changed __mark_buffer_dirty to + * mark_buffer_dirty(). 
It's still atomic, + * but it sets the page dirty too, which makes + * it eligible for writeback at any time by the + * VM (which was also the case with + * __mark_buffer_dirty()) + */ + mark_buffer_dirty(unbh); + } + } else { + /* + * append indirect item with holes if needed, when + * appending pointer to 'block'-th block use block, + * which is already allocated + */ + struct cpu_key tmp_key; + /* + * We use this in case we need to allocate + * only one block which is a fastpath + */ + unp_t unf_single = 0; + unp_t *un; + __u64 max_to_insert = + MAX_ITEM_LEN(inode->i_sb->s_blocksize) / + UNFM_P_SIZE; + __u64 blocks_needed; + + RFALSE(pos_in_item != ih_item_len(ih) / UNFM_P_SIZE, + "vs-804: invalid position for append"); + /* + * indirect item has to be appended, + * set up key of that position + * (key type is unimportant) + */ + make_cpu_key(&tmp_key, inode, + le_key_k_offset(version, + &ih->ih_key) + + op_bytes_number(ih, + inode->i_sb->s_blocksize), + TYPE_INDIRECT, 3); + + RFALSE(cpu_key_k_offset(&tmp_key) > cpu_key_k_offset(&key), + "green-805: invalid offset"); + blocks_needed = + 1 + + ((cpu_key_k_offset(&key) - + cpu_key_k_offset(&tmp_key)) >> inode->i_sb-> + s_blocksize_bits); + + if (blocks_needed == 1) { + un = &unf_single; + } else { + un = kzalloc(min(blocks_needed, max_to_insert) * UNFM_P_SIZE, GFP_NOFS); + if (!un) { + un = &unf_single; + blocks_needed = 1; + max_to_insert = 0; + } + } + if (blocks_needed <= max_to_insert) { + /* + * we are going to add target block to + * the file. Use allocated block for that + */ + un[blocks_needed - 1] = + cpu_to_le32(allocated_block_nr); + set_block_dev_mapped(bh_result, + allocated_block_nr, inode); + set_buffer_new(bh_result); + done = 1; + } else { + /* paste hole to the indirect item */ + /* + * If kmalloc failed, max_to_insert becomes + * zero and it means we only have space for + * one block + */ + blocks_needed = + max_to_insert ? 
max_to_insert : 1; + } + retval = + reiserfs_paste_into_item(th, &path, &tmp_key, inode, + (char *)un, + UNFM_P_SIZE * + blocks_needed); + + if (blocks_needed != 1) + kfree(un); + + if (retval) { + reiserfs_free_block(th, inode, + allocated_block_nr, 1); + goto failure; + } + if (!done) { + /* + * We need to mark new file size in case + * this function will be interrupted/aborted + * later on. And we may do this only for + * holes. + */ + inode->i_size += + inode->i_sb->s_blocksize * blocks_needed; + } + } + + if (done == 1) + break; + + /* + * this loop could log more blocks than we had originally + * asked for. So, we have to allow the transaction to end + * if it is too big or too full. Update the inode so things + * are consistent if we crash before the function returns + * release the path so that anybody waiting on the path before + * ending their transaction will be able to continue. + */ + if (journal_transaction_should_end(th, th->t_blocks_allocated)) { + retval = restart_transaction(th, inode, &path); + if (retval) + goto failure; + } + /* + * inserting indirect pointers for a hole can take a + * long time. reschedule if needed and also release the write + * lock for others. 
+ */ + reiserfs_cond_resched(inode->i_sb); + + retval = search_for_position_by_key(inode->i_sb, &key, &path); + if (retval == IO_ERROR) { + retval = -EIO; + goto failure; + } + if (retval == POSITION_FOUND) { + reiserfs_warning(inode->i_sb, "vs-825", + "%K should not be found", &key); + retval = -EEXIST; + if (allocated_block_nr) + reiserfs_free_block(th, inode, + allocated_block_nr, 1); + pathrelse(&path); + goto failure; + } + bh = get_last_bh(&path); + ih = tp_item_head(&path); + item = tp_item_body(&path); + pos_in_item = path.pos_in_item; + } while (1); + + retval = 0; + +failure: + /* + * Close th here when it may not be left dangling (!dangle), or + * when we are failing and th no longer carries a transaction id. + * The stat data is only updated while th still has a transaction id. + */ + if (th && (!dangle || (retval && !th->t_trans_id))) { + int err; + if (th->t_trans_id) + reiserfs_update_sd(th, inode); + err = reiserfs_end_persistent_transaction(th); + if (err) + retval = err; + } + + reiserfs_write_unlock(inode->i_sb); + reiserfs_check_path(&path); + return retval; +} + +/* + * Read @nr_pages pages of @mapping through mpage_readpages(), using + * reiserfs_get_block() to map file blocks. + */ +static int +reiserfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block); +} + +/* + * Compute real number of used bytes by file + * Following three functions can go away when we'll have enough space in + * stat item + */ +static int real_space_diff(struct inode *inode, int sd_size) +{ + int bytes; + loff_t blocksize = inode->i_sb->s_blocksize; + + if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) + return sd_size; + + /* + * End of file is also in full block with indirect reference, so round + * up to the next block. + * + * there is just no way to know if the tail is actually packed + * on the file, so we have to assume it isn't. 
When we pack the + * tail, we add 4 bytes to pretend there really is an unformatted + * node pointer + */ + bytes = + ((inode->i_size + + (blocksize - 1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE + + sd_size; + return bytes; +} + +/* + * Byte count for @inode derived from @blocks, which is shifted left by 9 + * (i.e. counted in 512-byte units). Symlinks and directories use i_size + * instead of @blocks; in both cases real_space_diff() is added on top. + */ +static inline loff_t to_real_used_space(struct inode *inode, ulong blocks, + int sd_size) +{ + if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) { + return inode->i_size + + (loff_t) (real_space_diff(inode, sd_size)); + } + return ((loff_t) real_space_diff(inode, sd_size)) + + (((loff_t) blocks) << 9); +} + +/* Compute number of blocks used by file in ReiserFS counting */ +static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size) +{ + loff_t bytes = inode_get_bytes(inode); + loff_t real_space = real_space_diff(inode, sd_size); + + /* keeps fsck and non-quota versions of reiserfs happy */ + if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) { + bytes += (loff_t) 511; + } + + /* + * files from before the quota patch might i_blocks such that + * bytes < real_space. Deal with that here to prevent it from + * going negative. + */ + if (bytes < real_space) + return 0; + return (bytes - real_space) >> 9; +} + +/* + * BAD: new directories have stat data of new type and all other items + * of old type. 
Version stored in the inode says about body items, so + * in update_stat_data we can not rely on inode, but have to check + * item version directly + */ + +/* called by read_locked_inode */ +static void init_inode(struct inode *inode, struct treepath *path) +{ + struct buffer_head *bh; + struct item_head *ih; + __u32 rdev; + + bh = PATH_PLAST_BUFFER(path); + ih = tp_item_head(path); + + copy_key(INODE_PKEY(inode), &ih->ih_key); + + INIT_LIST_HEAD(&REISERFS_I(inode)->i_prealloc_list); + REISERFS_I(inode)->i_flags = 0; + REISERFS_I(inode)->i_prealloc_block = 0; + REISERFS_I(inode)->i_prealloc_count = 0; + REISERFS_I(inode)->i_trans_id = 0; + REISERFS_I(inode)->i_jl = NULL; + reiserfs_init_xattr_rwsem(inode); + + if (stat_data_v1(ih)) { + struct stat_data_v1 *sd = + (struct stat_data_v1 *)ih_item_body(bh, ih); + unsigned long blocks; + + set_inode_item_key_version(inode, KEY_FORMAT_3_5); + set_inode_sd_version(inode, STAT_DATA_V1); + inode->i_mode = sd_v1_mode(sd); + set_nlink(inode, sd_v1_nlink(sd)); + i_uid_write(inode, sd_v1_uid(sd)); + i_gid_write(inode, sd_v1_gid(sd)); + inode->i_size = sd_v1_size(sd); + inode->i_atime.tv_sec = sd_v1_atime(sd); + inode->i_mtime.tv_sec = sd_v1_mtime(sd); + inode->i_ctime.tv_sec = sd_v1_ctime(sd); + inode->i_atime.tv_nsec = 0; + inode->i_ctime.tv_nsec = 0; + inode->i_mtime.tv_nsec = 0; + + inode->i_blocks = sd_v1_blocks(sd); + inode->i_generation = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); + blocks = (inode->i_size + 511) >> 9; + blocks = _ROUND_UP(blocks, inode->i_sb->s_blocksize >> 9); + + /* + * there was a bug in <=3.5.23 when i_blocks could take + * negative values. Starting from 3.5.17 this value could + * even be stored in stat data. For such files we set + * i_blocks based on file size. Just 2 notes: this can be + * wrong for sparse files. 
On-disk value will be only + * updated if file's inode will ever change + */ + if (inode->i_blocks > blocks) { + inode->i_blocks = blocks; + } + + rdev = sd_v1_rdev(sd); + REISERFS_I(inode)->i_first_direct_byte = + sd_v1_first_direct_byte(sd); + + /* + * an early bug in the quota code can give us an odd + * number for the block count. This is incorrect, fix it here. + */ + if (inode->i_blocks & 1) { + inode->i_blocks++; + } + inode_set_bytes(inode, + to_real_used_space(inode, inode->i_blocks, + SD_V1_SIZE)); + /* + * nopack is initially zero for v1 objects. For v2 objects, + * nopack is initialised from sd_attrs + */ + REISERFS_I(inode)->i_flags &= ~i_nopack_mask; + } else { + /* + * new stat data found, but object may have old items + * (directories and symlinks) + */ + struct stat_data *sd = (struct stat_data *)ih_item_body(bh, ih); + + inode->i_mode = sd_v2_mode(sd); + set_nlink(inode, sd_v2_nlink(sd)); + i_uid_write(inode, sd_v2_uid(sd)); + inode->i_size = sd_v2_size(sd); + i_gid_write(inode, sd_v2_gid(sd)); + inode->i_mtime.tv_sec = sd_v2_mtime(sd); + inode->i_atime.tv_sec = sd_v2_atime(sd); + inode->i_ctime.tv_sec = sd_v2_ctime(sd); + inode->i_ctime.tv_nsec = 0; + inode->i_mtime.tv_nsec = 0; + inode->i_atime.tv_nsec = 0; + inode->i_blocks = sd_v2_blocks(sd); + rdev = sd_v2_rdev(sd); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + inode->i_generation = + le32_to_cpu(INODE_PKEY(inode)->k_dir_id); + else + inode->i_generation = sd_v2_generation(sd); + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + set_inode_item_key_version(inode, KEY_FORMAT_3_5); + else + set_inode_item_key_version(inode, KEY_FORMAT_3_6); + REISERFS_I(inode)->i_first_direct_byte = 0; + set_inode_sd_version(inode, STAT_DATA_V2); + inode_set_bytes(inode, + to_real_used_space(inode, inode->i_blocks, + SD_V2_SIZE)); + /* + * read persistent inode attributes from sd and initialise + * generic inode flags from them + */ + REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd); + 
sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode); + } + + pathrelse(path); + if (S_ISREG(inode->i_mode)) { + inode->i_op = &reiserfs_file_inode_operations; + inode->i_fop = &reiserfs_file_operations; + inode->i_mapping->a_ops = &reiserfs_address_space_operations; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &reiserfs_dir_inode_operations; + inode->i_fop = &reiserfs_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &reiserfs_symlink_inode_operations; + inode->i_mapping->a_ops = &reiserfs_address_space_operations; + } else { + inode->i_blocks = 0; + inode->i_op = &reiserfs_special_inode_operations; + init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); + } +} + +/* update new stat data with inode fields */ +static void inode2sd(void *sd, struct inode *inode, loff_t size) +{ + struct stat_data *sd_v2 = (struct stat_data *)sd; + __u16 flags; + + set_sd_v2_mode(sd_v2, inode->i_mode); + set_sd_v2_nlink(sd_v2, inode->i_nlink); + set_sd_v2_uid(sd_v2, i_uid_read(inode)); + set_sd_v2_size(sd_v2, size); + set_sd_v2_gid(sd_v2, i_gid_read(inode)); + set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec); + set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec); + set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec); + set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE)); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev)); + else + set_sd_v2_generation(sd_v2, inode->i_generation); + flags = REISERFS_I(inode)->i_attrs; + i_attrs_to_sd_attrs(inode, &flags); + set_sd_v2_attrs(sd_v2, flags); +} + +/* used to copy inode's fields to old stat data */ +static void inode2sd_v1(void *sd, struct inode *inode, loff_t size) +{ + struct stat_data_v1 *sd_v1 = (struct stat_data_v1 *)sd; + + set_sd_v1_mode(sd_v1, inode->i_mode); + set_sd_v1_uid(sd_v1, i_uid_read(inode)); + set_sd_v1_gid(sd_v1, i_gid_read(inode)); + set_sd_v1_nlink(sd_v1, inode->i_nlink); + set_sd_v1_size(sd_v1, size); + set_sd_v1_atime(sd_v1, 
inode->i_atime.tv_sec); + set_sd_v1_ctime(sd_v1, inode->i_ctime.tv_sec); + set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec); + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev)); + else + set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE)); + + /* Sigh. i_first_direct_byte is back */ + set_sd_v1_first_direct_byte(sd_v1, + REISERFS_I(inode)->i_first_direct_byte); +} + +/* + * NOTE, you must prepare the buffer head before sending it here, + * and then log it after the call + */ +static void update_stat_data(struct treepath *path, struct inode *inode, + loff_t size) +{ + struct buffer_head *bh; + struct item_head *ih; + + bh = PATH_PLAST_BUFFER(path); + ih = tp_item_head(path); + + if (!is_statdata_le_ih(ih)) + reiserfs_panic(inode->i_sb, "vs-13065", "key %k, found item %h", + INODE_PKEY(inode), ih); + + /* path points to old stat data */ + if (stat_data_v1(ih)) { + inode2sd_v1(ih_item_body(bh, ih), inode, size); + } else { + inode2sd(ih_item_body(bh, ih), inode, size); + } + + return; +} + +void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th, + struct inode *inode, loff_t size) +{ + struct cpu_key key; + INITIALIZE_PATH(path); + struct buffer_head *bh; + int fs_gen; + struct item_head *ih, tmp_ih; + int retval; + + BUG_ON(!th->t_trans_id); + + /* key type is unimportant */ + make_cpu_key(&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3); + + for (;;) { + int pos; + /* look for the object's stat data */ + retval = search_item(inode->i_sb, &key, &path); + if (retval == IO_ERROR) { + reiserfs_error(inode->i_sb, "vs-13050", + "i/o failure occurred trying to " + "update %K stat data", &key); + return; + } + if (retval == ITEM_NOT_FOUND) { + pos = PATH_LAST_POSITION(&path); + pathrelse(&path); + if (inode->i_nlink == 0) { + /*reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: i_nlink == 0, stat data not found"); */ + return; + } + reiserfs_warning(inode->i_sb, "vs-13060", + "stat data 
of object %k (nlink == %d) " + "not found (pos %d)", + INODE_PKEY(inode), inode->i_nlink, + pos); + reiserfs_check_path(&path); + return; + } + + /* + * sigh, prepare_for_journal might schedule. When it + * schedules the FS might change. We have to detect that, + * and loop back to the search if the stat data item has moved + */ + bh = get_last_bh(&path); + ih = tp_item_head(&path); + copy_item_head(&tmp_ih, ih); + fs_gen = get_generation(inode->i_sb); + reiserfs_prepare_for_journal(inode->i_sb, bh, 1); + + /* Stat_data item has been moved after scheduling. */ + if (fs_changed(fs_gen, inode->i_sb) + && item_moved(&tmp_ih, &path)) { + reiserfs_restore_prepared_buffer(inode->i_sb, bh); + continue; + } + break; + } + update_stat_data(&path, inode, size); + journal_mark_dirty(th, bh); + pathrelse(&path); + return; +} + +/* + * reiserfs_read_locked_inode is called to read the inode off disk, and it + * does a make_bad_inode when things go wrong. But, we need to make sure + * and clear the key in the private portion of the inode, otherwise a + * corresponding iput might try to delete whatever object the inode last + * represented. 
+ */ +static void reiserfs_make_bad_inode(struct inode *inode) +{ + memset(INODE_PKEY(inode), 0, KEY_SIZE); + make_bad_inode(inode); +} + +/* + * initially this function was derived from minix or ext2's analog and + * evolved as the prototype did + */ +int reiserfs_init_locked_inode(struct inode *inode, void *p) +{ + struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p; + inode->i_ino = args->objectid; + INODE_PKEY(inode)->k_dir_id = cpu_to_le32(args->dirid); + return 0; +} + +/* + * looks for stat data in the tree, and fills up the fields of in-core + * inode stat data fields + */ +void reiserfs_read_locked_inode(struct inode *inode, + struct reiserfs_iget_args *args) +{ + INITIALIZE_PATH(path_to_sd); + struct cpu_key key; + unsigned long dirino; + int retval; + + dirino = args->dirid; + + /* + * set version 1, version 2 could be used too, because stat data + * key is the same in both versions + */ + key.version = KEY_FORMAT_3_5; + key.on_disk_key.k_dir_id = dirino; + key.on_disk_key.k_objectid = inode->i_ino; + key.on_disk_key.k_offset = 0; + key.on_disk_key.k_type = 0; + + /* look for the object's stat data */ + retval = search_item(inode->i_sb, &key, &path_to_sd); + if (retval == IO_ERROR) { + reiserfs_error(inode->i_sb, "vs-13070", + "i/o failure occurred trying to find " + "stat data of %K", &key); + reiserfs_make_bad_inode(inode); + return; + } + + /* a stale NFS handle can trigger this without it being an error */ + if (retval != ITEM_FOUND) { + pathrelse(&path_to_sd); + reiserfs_make_bad_inode(inode); + clear_nlink(inode); + return; + } + + init_inode(inode, &path_to_sd); + + /* + * It is possible that knfsd is trying to access inode of a file + * that is being removed from the disk by some other thread. As we + * update sd on unlink all that is required is to check for nlink + * here. This bug was first found by Sizif when debugging + * SquidNG/Butterfly, forgotten, and found again after Philippe + * Gramoulle reproduced it. 
+ + * More logical fix would require changes in fs/inode.c:iput() to + * remove inode from hash-table _after_ fs cleaned disk stuff up and + * in iget() to return NULL if I_FREEING inode is found in + * hash-table. + */ + + /* + * Currently there is one place where it's ok to meet inode with + * nlink==0: processing of open-unlinked and half-truncated files + * during mount (fs/reiserfs/super.c:finish_unfinished()). + */ + if ((inode->i_nlink == 0) && + !REISERFS_SB(inode->i_sb)->s_is_unlinked_ok) { + reiserfs_warning(inode->i_sb, "vs-13075", + "dead inode read from disk %K. " + "This is likely to be race with knfsd. Ignore", + &key); + reiserfs_make_bad_inode(inode); + } + + /* init inode should be relsing */ + reiserfs_check_path(&path_to_sd); + + /* + * Stat data v1 doesn't support ACLs. + */ + if (get_inode_sd_version(inode) == STAT_DATA_V1) + cache_no_acl(inode); +} + +/* + * reiserfs_find_actor() - "find actor" reiserfs supplies to iget5_locked(). + * + * @inode: inode from hash table to check + * @opaque: "cookie" passed to iget5_locked(). This is &reiserfs_iget_args. + * + * This function is called by iget5_locked() to distinguish reiserfs inodes + * having the same inode numbers. Such inodes can only exist due to some + * error condition. One of them should be bad. Inodes with identical + * inode numbers (objectids) are distinguished by parent directory ids. 
+ * + */ +int reiserfs_find_actor(struct inode *inode, void *opaque) +{ + struct reiserfs_iget_args *args; + + args = opaque; + /* args is already in CPU order */ + return (inode->i_ino == args->objectid) && + (le32_to_cpu(INODE_PKEY(inode)->k_dir_id) == args->dirid); +} + +struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key) +{ + struct inode *inode; + struct reiserfs_iget_args args; + int depth; + + args.objectid = key->on_disk_key.k_objectid; + args.dirid = key->on_disk_key.k_dir_id; + depth = reiserfs_write_unlock_nested(s); + inode = iget5_locked(s, key->on_disk_key.k_objectid, + reiserfs_find_actor, reiserfs_init_locked_inode, + (void *)(&args)); + reiserfs_write_lock_nested(s, depth); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (inode->i_state & I_NEW) { + reiserfs_read_locked_inode(inode, &args); + unlock_new_inode(inode); + } + + if (comp_short_keys(INODE_PKEY(inode), key) || is_bad_inode(inode)) { + /* either due to i/o error or a stale NFS handle */ + iput(inode); + inode = NULL; + } + return inode; +} + +static struct dentry *reiserfs_get_dentry(struct super_block *sb, + u32 objectid, u32 dir_id, u32 generation) + +{ + struct cpu_key key; + struct inode *inode; + + key.on_disk_key.k_objectid = objectid; + key.on_disk_key.k_dir_id = dir_id; + reiserfs_write_lock(sb); + inode = reiserfs_iget(sb, &key); + if (inode && !IS_ERR(inode) && generation != 0 && + generation != inode->i_generation) { + iput(inode); + inode = NULL; + } + reiserfs_write_unlock(sb); + + return d_obtain_alias(inode); +} + +struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + /* + * fhtype happens to reflect the number of u32s encoded. + * due to a bug in earlier code, fhtype might indicate there + * are more u32s then actually fitted. + * so if fhtype seems to be more than len, reduce fhtype. 
+ * Valid types are: + * 2 - objectid + dir_id - legacy support + * 3 - objectid + dir_id + generation + * 4 - objectid + dir_id + objectid and dirid of parent - legacy + * 5 - objectid + dir_id + generation + objectid and dirid of parent + * 6 - as above plus generation of directory + * 6 does not fit in NFSv2 handles + */ + if (fh_type > fh_len) { + if (fh_type != 6 || fh_len != 5) + reiserfs_warning(sb, "reiserfs-13077", + "nfsd/reiserfs, fhtype=%d, len=%d - odd", + fh_type, fh_len); + fh_type = fh_len; + } + if (fh_len < 2) + return NULL; + + return reiserfs_get_dentry(sb, fid->raw[0], fid->raw[1], + (fh_type == 3 || fh_type >= 5) ? fid->raw[2] : 0); +} + +struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + if (fh_type > fh_len) + fh_type = fh_len; + if (fh_type < 4) + return NULL; + + return reiserfs_get_dentry(sb, + (fh_type >= 5) ? fid->raw[3] : fid->raw[2], + (fh_type >= 5) ? fid->raw[4] : fid->raw[3], + (fh_type == 6) ? fid->raw[5] : 0); +} + +int reiserfs_encode_fh(struct inode *inode, __u32 * data, int *lenp, + struct inode *parent) +{ + int maxlen = *lenp; + + if (parent && (maxlen < 5)) { + *lenp = 5; + return FILEID_INVALID; + } else if (maxlen < 3) { + *lenp = 3; + return FILEID_INVALID; + } + + data[0] = inode->i_ino; + data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); + data[2] = inode->i_generation; + *lenp = 3; + if (parent) { + data[3] = parent->i_ino; + data[4] = le32_to_cpu(INODE_PKEY(parent)->k_dir_id); + *lenp = 5; + if (maxlen >= 6) { + data[5] = parent->i_generation; + *lenp = 6; + } + } + return *lenp; +} + +/* + * looks for stat data, then copies fields to it, marks the buffer + * containing stat data as dirty + */ +/* + * reiserfs inodes are never really dirty, since the dirty inode call + * always logs them. 
This call allows the VFS inode marking routines + * to properly mark inodes for datasync and such, but only actually + * does something when called for a synchronous update. + * + * Returns -EROFS on a read-only mount and 0 otherwise; a failing + * journal_begin() is skipped silently, not reported. + */ +int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct reiserfs_transaction_handle th; + int jbegin_count = 1; + + if (inode->i_sb->s_flags & MS_RDONLY) + return -EROFS; + /* + * memory pressure can sometimes initiate write_inode calls with + * sync == 1, + * these cases are just when the system needs ram, not when the + * inode needs to reach disk for safety, and they can safely be + * ignored because the altered inode has already been logged. + */ + if (wbc->sync_mode == WB_SYNC_ALL && !(current->flags & PF_MEMALLOC)) { + reiserfs_write_lock(inode->i_sb); + if (!journal_begin(&th, inode->i_sb, jbegin_count)) { + reiserfs_update_sd(&th, inode); + journal_end_sync(&th); + } + reiserfs_write_unlock(inode->i_sb); + } + return 0; +} + +/* + * stat data of new object is inserted already, this inserts the item + * containing "." and ".." entries + * + * Returns -EIO if the tree search fails, -EEXIST if an item with the + * directory's key is already present, otherwise whatever + * reiserfs_insert_item() returns. + */ +static int reiserfs_new_directory(struct reiserfs_transaction_handle *th, + struct inode *inode, + struct item_head *ih, struct treepath *path, + struct inode *dir) +{ + struct super_block *sb = th->t_super; + char empty_dir[EMPTY_DIR_SIZE]; + char *body = empty_dir; + struct cpu_key key; + int retval; + + BUG_ON(!th->t_trans_id); + + _make_cpu_key(&key, KEY_FORMAT_3_5, le32_to_cpu(ih->ih_key.k_dir_id), + le32_to_cpu(ih->ih_key.k_objectid), DOT_OFFSET, + TYPE_DIRENTRY, 3 /*key length */ ); + + /* + * compose item head for new item. Directories consist of items of + * old type (ITEM_VERSION_1). 
Do not set key (second arg is 0), it + * is done by reiserfs_new_inode + */ + if (old_format_only(sb)) { + make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, + TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2); + + make_empty_dir_item_v1(body, ih->ih_key.k_dir_id, + ih->ih_key.k_objectid, + INODE_PKEY(dir)->k_dir_id, + INODE_PKEY(dir)->k_objectid); + } else { + make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, + TYPE_DIRENTRY, EMPTY_DIR_SIZE, 2); + + make_empty_dir_item(body, ih->ih_key.k_dir_id, + ih->ih_key.k_objectid, + INODE_PKEY(dir)->k_dir_id, + INODE_PKEY(dir)->k_objectid); + } + + /* look for place in the tree for new item */ + retval = search_item(sb, &key, path); + if (retval == IO_ERROR) { + reiserfs_error(sb, "vs-13080", + "i/o failure occurred creating new directory"); + return -EIO; + } + if (retval == ITEM_FOUND) { + pathrelse(path); + reiserfs_warning(sb, "vs-13070", + "object with this key exists (%k)", + &(ih->ih_key)); + return -EEXIST; + } + + /* insert item, that is empty directory item */ + return reiserfs_insert_item(th, path, &key, ih, inode, body); +} + +/* + * stat data of object has been inserted, this inserts the item + * containing the body of symlink + * + * @symname is inserted as a direct item of @item_len bytes. + * Returns -EIO if the tree search fails, -EEXIST if an item with the + * symlink's key is already present, otherwise whatever + * reiserfs_insert_item() returns. 
+ */ +static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, + struct inode *inode, + struct item_head *ih, + struct treepath *path, const char *symname, + int item_len) +{ + struct super_block *sb = th->t_super; + struct cpu_key key; + int retval; + + BUG_ON(!th->t_trans_id); + + _make_cpu_key(&key, KEY_FORMAT_3_5, + le32_to_cpu(ih->ih_key.k_dir_id), + le32_to_cpu(ih->ih_key.k_objectid), + 1, TYPE_DIRECT, 3 /*key length */ ); + + make_le_item_head(ih, NULL, KEY_FORMAT_3_5, 1, TYPE_DIRECT, item_len, + 0 /*free_space */ ); + + /* look for place in the tree for new item */ + retval = search_item(sb, &key, path); + if (retval == IO_ERROR) { + reiserfs_error(sb, "vs-13080", + "i/o failure occurred creating new symlink"); + return -EIO; + } + if (retval == ITEM_FOUND) { + 
pathrelse(path); + reiserfs_warning(sb, "vs-13080", + "object with this key exists (%k)", + &(ih->ih_key)); + return -EEXIST; + } + + /* insert item, that is body of symlink */ + return reiserfs_insert_item(th, path, &key, ih, inode, symname); +} + +/* + * inserts the stat data into the tree, and then calls + * reiserfs_new_directory (to insert ".", ".." item if new object is + * directory) or reiserfs_new_symlink (to insert symlink body if new + * object is symlink) or nothing (if new object is regular file) + + * NOTE! uid and gid must already be set in the inode. If we return + * non-zero due to an error, we have to drop the quota previously allocated + * for the fresh inode. This can only be done outside a transaction, so + * if we return non-zero, we also end the transaction. + * + * @th: active transaction handle + * @dir: parent directory for new inode + * @mode: mode of new inode + * @symname: symlink contents if inode is symlink + * @isize: 0 for regular file, EMPTY_DIR_SIZE for dirs, strlen(symname) for + * symlinks + * @inode: inode to be filled + * @security: optional security context to associate with this inode + */ +int reiserfs_new_inode(struct reiserfs_transaction_handle *th, + struct inode *dir, umode_t mode, const char *symname, + /* 0 for regular, EMTRY_DIR_SIZE for dirs, + strlen (symname) for symlinks) */ + loff_t i_size, struct dentry *dentry, + struct inode *inode, + struct reiserfs_security_handle *security) +{ + struct super_block *sb = dir->i_sb; + struct reiserfs_iget_args args; + INITIALIZE_PATH(path_to_key); + struct cpu_key key; + struct item_head ih; + struct stat_data sd; + int retval; + int err; + int depth; + + BUG_ON(!th->t_trans_id); + + depth = reiserfs_write_unlock_nested(sb); + err = dquot_alloc_inode(inode); + reiserfs_write_lock_nested(sb, depth); + if (err) + goto out_end_trans; + if (!dir->i_nlink) { + err = -EPERM; + goto out_bad_inode; + } + + /* item head of new item */ + ih.ih_key.k_dir_id = 
reiserfs_choose_packing(dir); + ih.ih_key.k_objectid = cpu_to_le32(reiserfs_get_unused_objectid(th)); + if (!ih.ih_key.k_objectid) { + err = -ENOMEM; + goto out_bad_inode; + } + args.objectid = inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid); + if (old_format_only(sb)) + make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET, + TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT); + else + make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET, + TYPE_STAT_DATA, SD_SIZE, MAX_US_INT); + memcpy(INODE_PKEY(inode), &ih.ih_key, KEY_SIZE); + args.dirid = le32_to_cpu(ih.ih_key.k_dir_id); + + depth = reiserfs_write_unlock_nested(inode->i_sb); + err = insert_inode_locked4(inode, args.objectid, + reiserfs_find_actor, &args); + reiserfs_write_lock_nested(inode->i_sb, depth); + if (err) { + err = -EINVAL; + goto out_bad_inode; + } + + if (old_format_only(sb)) + /* + * not a perfect generation count, as object ids can be reused, + * but this is as good as reiserfs can do right now. + * note that the private part of inode isn't filled in yet, + * we have to use the directory. + */ + inode->i_generation = le32_to_cpu(INODE_PKEY(dir)->k_objectid); + else +#if defined( USE_INODE_GENERATION_COUNTER ) + inode->i_generation = + le32_to_cpu(REISERFS_SB(sb)->s_rs->s_inode_generation); +#else + inode->i_generation = ++event; +#endif + + /* fill stat data */ + set_nlink(inode, (S_ISDIR(mode) ? 2 : 1)); + + /* uid and gid must already be set by the caller for quota init */ + + /* symlink cannot be immutable or append only, right? */ + if (S_ISLNK(inode->i_mode)) + inode->i_flags &= ~(S_IMMUTABLE | S_APPEND); + + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + inode->i_size = i_size; + inode->i_blocks = 0; + inode->i_bytes = 0; + REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 
1 : + U32_MAX /*NO_BYTES_IN_DIRECT_ITEM */ ; + + INIT_LIST_HEAD(&REISERFS_I(inode)->i_prealloc_list); + REISERFS_I(inode)->i_flags = 0; + REISERFS_I(inode)->i_prealloc_block = 0; + REISERFS_I(inode)->i_prealloc_count = 0; + REISERFS_I(inode)->i_trans_id = 0; + REISERFS_I(inode)->i_jl = NULL; + REISERFS_I(inode)->i_attrs = + REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK; + sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode); + reiserfs_init_xattr_rwsem(inode); + + /* key to search for correct place for new stat data */ + _make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id), + le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET, + TYPE_STAT_DATA, 3 /*key length */ ); + + /* find proper place for inserting of stat data */ + retval = search_item(sb, &key, &path_to_key); + if (retval == IO_ERROR) { + err = -EIO; + goto out_bad_inode; + } + if (retval == ITEM_FOUND) { + pathrelse(&path_to_key); + err = -EEXIST; + goto out_bad_inode; + } + if (old_format_only(sb)) { + /* i_uid or i_gid is too big to be stored in stat data v3.5 */ + if (i_uid_read(inode) & ~0xffff || i_gid_read(inode) & ~0xffff) { + pathrelse(&path_to_key); + err = -EINVAL; + goto out_bad_inode; + } + inode2sd_v1(&sd, inode, inode->i_size); + } else { + inode2sd(&sd, inode, inode->i_size); + } + /* + * store in in-core inode the key of stat data and version all + * object items will have (directory items will have old offset + * format, other new objects will consist of new items) + */ + if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode)) + set_inode_item_key_version(inode, KEY_FORMAT_3_5); + else + set_inode_item_key_version(inode, KEY_FORMAT_3_6); + if (old_format_only(sb)) + set_inode_sd_version(inode, STAT_DATA_V1); + else + set_inode_sd_version(inode, STAT_DATA_V2); + + /* insert the stat data into the tree */ +#ifdef DISPLACE_NEW_PACKING_LOCALITIES + if (REISERFS_I(dir)->new_packing_locality) + th->displace_new_blocks = 1; +#endif + retval = + reiserfs_insert_item(th, 
&path_to_key, &key, &ih, inode, + (char *)(&sd)); + if (retval) { + err = retval; + reiserfs_check_path(&path_to_key); + goto out_bad_inode; + } +#ifdef DISPLACE_NEW_PACKING_LOCALITIES + if (!th->displace_new_blocks) + REISERFS_I(dir)->new_packing_locality = 0; +#endif + if (S_ISDIR(mode)) { + /* insert item with "." and ".." */ + retval = + reiserfs_new_directory(th, inode, &ih, &path_to_key, dir); + } + + if (S_ISLNK(mode)) { + /* insert body of symlink */ + if (!old_format_only(sb)) + i_size = ROUND_UP(i_size); + retval = + reiserfs_new_symlink(th, inode, &ih, &path_to_key, symname, + i_size); + } + if (retval) { + err = retval; + reiserfs_check_path(&path_to_key); + journal_end(th); + goto out_inserted_sd; + } + + if (reiserfs_posixacl(inode->i_sb)) { + reiserfs_write_unlock(inode->i_sb); + retval = reiserfs_inherit_default_acl(th, dir, dentry, inode); + reiserfs_write_lock(inode->i_sb); + if (retval) { + err = retval; + reiserfs_check_path(&path_to_key); + journal_end(th); + goto out_inserted_sd; + } + } else if (inode->i_sb->s_flags & MS_POSIXACL) { + reiserfs_warning(inode->i_sb, "jdm-13090", + "ACLs aren't enabled in the fs, " + "but vfs thinks they are!"); + } else if (IS_PRIVATE(dir)) + inode->i_flags |= S_PRIVATE; + + if (security->name) { + reiserfs_write_unlock(inode->i_sb); + retval = reiserfs_security_write(th, inode, security); + reiserfs_write_lock(inode->i_sb); + if (retval) { + err = retval; + reiserfs_check_path(&path_to_key); + retval = journal_end(th); + if (retval) + err = retval; + goto out_inserted_sd; + } + } + + reiserfs_update_sd(th, inode); + reiserfs_check_path(&path_to_key); + + return 0; + +out_bad_inode: + /* Invalidate the object, nothing was inserted yet */ + INODE_PKEY(inode)->k_objectid = 0; + + /* Quota change must be inside a transaction for journaling */ + depth = reiserfs_write_unlock_nested(inode->i_sb); + dquot_free_inode(inode); + reiserfs_write_lock_nested(inode->i_sb, depth); + +out_end_trans: + journal_end(th); + /* + 
* Drop can be outside and it needs more credits so it's better + * to have it outside + */ + depth = reiserfs_write_unlock_nested(inode->i_sb); + dquot_drop(inode); + reiserfs_write_lock_nested(inode->i_sb, depth); + inode->i_flags |= S_NOQUOTA; + make_bad_inode(inode); + +out_inserted_sd: + clear_nlink(inode); + th->t_trans_id = 0; /* so the caller can't use this handle later */ + unlock_new_inode(inode); /* OK to do even if we hadn't locked it */ + iput(inode); + return err; +} + +/* + * finds the tail page in the page cache, + * reads the last block in. + * + * On success, page_result is set to a locked, pinned page, and bh_result + * is set to an up to date buffer for the last block in the file. returns 0. + * + * tail conversion is not done, so bh_result might not be valid for writing + * check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before + * trying to write the block. + * + * on failure, nonzero is returned, page_result and bh_result are untouched. + */ +static int grab_tail_page(struct inode *inode, + struct page **page_result, + struct buffer_head **bh_result) +{ + + /* + * we want the page with the last byte in the file, + * not the page that will hold the next byte for appending + */ + unsigned long index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; + unsigned long pos = 0; + unsigned long start = 0; + unsigned long blocksize = inode->i_sb->s_blocksize; + unsigned long offset = (inode->i_size) & (PAGE_CACHE_SIZE - 1); + struct buffer_head *bh; + struct buffer_head *head; + struct page *page; + int error; + + /* + * we know that we are only called with inode->i_size > 0. + * we also know that a file tail can never be as big as a block + * If i_size % blocksize == 0, our file is currently block aligned + * and it won't need converting or zeroing after a truncate. 
+ */ + if ((offset & (blocksize - 1)) == 0) { + return -ENOENT; + } + page = grab_cache_page(inode->i_mapping, index); + error = -ENOMEM; + if (!page) { + goto out; + } + /* start within the page of the last block in the file */ + start = (offset / blocksize) * blocksize; + + error = __block_write_begin(page, start, offset - start, + reiserfs_get_block_create_0); + if (error) + goto unlock; + + head = page_buffers(page); + bh = head; + do { + if (pos >= start) { + break; + } + bh = bh->b_this_page; + pos += blocksize; + } while (bh != head); + + if (!buffer_uptodate(bh)) { + /* + * note, this should never happen, prepare_write should be + * taking care of this for us. If the buffer isn't up to + * date, I've screwed up the code to find the buffer, or the + * code to call prepare_write + */ + reiserfs_error(inode->i_sb, "clm-6000", + "error reading block %lu", bh->b_blocknr); + error = -EIO; + goto unlock; + } + *bh_result = bh; + *page_result = page; + +out: + return error; + +unlock: + unlock_page(page); + page_cache_release(page); + return error; +} + +/* + * vfs version of truncate file. Must NOT be called with + * a transaction already started. + * + * some code taken from block_truncate_page + */ +int reiserfs_truncate_file(struct inode *inode, int update_timestamps) +{ + struct reiserfs_transaction_handle th; + /* we want the offset for the first byte after the end of the file */ + unsigned long offset = inode->i_size & (PAGE_CACHE_SIZE - 1); + unsigned blocksize = inode->i_sb->s_blocksize; + unsigned length; + struct page *page = NULL; + int error; + struct buffer_head *bh = NULL; + int err2; + + reiserfs_write_lock(inode->i_sb); + + if (inode->i_size > 0) { + error = grab_tail_page(inode, &page, &bh); + if (error) { + /* + * -ENOENT means we truncated past the end of the + * file, and get_block_create_0 could not find a + * block to read in, which is ok. 
+ */ + if (error != -ENOENT) + reiserfs_error(inode->i_sb, "clm-6001", + "grab_tail_page failed %d", + error); + page = NULL; + bh = NULL; + } + } + + /* + * so, if page != NULL, we have a buffer head for the offset at + * the end of the file. if the bh is mapped, and bh->b_blocknr != 0, + * then we have an unformatted node. Otherwise, we have a direct item, + * and no zeroing is required on disk. We zero after the truncate, + * because the truncate might pack the item anyway + * (it will unmap bh if it packs). + * + * it is enough to reserve space in transaction for 2 balancings: + * one for "save" link adding and another for the first + * cut_from_item. 1 is for update_sd + */ + error = journal_begin(&th, inode->i_sb, + JOURNAL_PER_BALANCE_CNT * 2 + 1); + if (error) + goto out; + reiserfs_update_inode_transaction(inode); + if (update_timestamps) + /* + * we are doing real truncate: if the system crashes + * before the last transaction of truncating gets committed + * - on reboot the file either appears truncated properly + * or not truncated at all + */ + add_save_link(&th, inode, 1); + err2 = reiserfs_do_truncate(&th, inode, page, update_timestamps); + error = journal_end(&th); + if (error) + goto out; + + /* check reiserfs_do_truncate after ending the transaction */ + if (err2) { + error = err2; + goto out; + } + + if (update_timestamps) { + error = remove_save_link(inode, 1 /* truncate */); + if (error) + goto out; + } + + if (page) { + length = offset & (blocksize - 1); + /* if we are not on a block boundary */ + if (length) { + length = blocksize - length; + zero_user(page, offset, length); + if (buffer_mapped(bh) && bh->b_blocknr != 0) { + mark_buffer_dirty(bh); + } + } + unlock_page(page); + page_cache_release(page); + } + + reiserfs_write_unlock(inode->i_sb); + + return 0; +out: + if (page) { + unlock_page(page); + page_cache_release(page); + } + + reiserfs_write_unlock(inode->i_sb); + + return error; +} + +static int map_block_for_writepage(struct inode 
*inode, + struct buffer_head *bh_result, + unsigned long block) +{ + struct reiserfs_transaction_handle th; + int fs_gen; + struct item_head tmp_ih; + struct item_head *ih; + struct buffer_head *bh; + __le32 *item; + struct cpu_key key; + INITIALIZE_PATH(path); + int pos_in_item; + int jbegin_count = JOURNAL_PER_BALANCE_CNT; + loff_t byte_offset = ((loff_t)block << inode->i_sb->s_blocksize_bits)+1; + int retval; + int use_get_block = 0; + int bytes_copied = 0; + int copy_size; + int trans_running = 0; + + /* + * catch places below that try to log something without + * starting a trans + */ + th.t_trans_id = 0; + + if (!buffer_uptodate(bh_result)) { + return -EIO; + } + + kmap(bh_result->b_page); +start_over: + reiserfs_write_lock(inode->i_sb); + make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3); + +research: + retval = search_for_position_by_key(inode->i_sb, &key, &path); + if (retval != POSITION_FOUND) { + use_get_block = 1; + goto out; + } + + bh = get_last_bh(&path); + ih = tp_item_head(&path); + item = tp_item_body(&path); + pos_in_item = path.pos_in_item; + + /* we've found an unformatted node */ + if (indirect_item_found(retval, ih)) { + if (bytes_copied > 0) { + reiserfs_warning(inode->i_sb, "clm-6002", + "bytes_copied %d", bytes_copied); + } + if (!get_block_num(item, pos_in_item)) { + /* crap, we are writing to a hole */ + use_get_block = 1; + goto out; + } + set_block_dev_mapped(bh_result, + get_block_num(item, pos_in_item), inode); + } else if (is_direct_le_ih(ih)) { + char *p; + p = page_address(bh_result->b_page); + p += (byte_offset - 1) & (PAGE_CACHE_SIZE - 1); + copy_size = ih_item_len(ih) - pos_in_item; + + fs_gen = get_generation(inode->i_sb); + copy_item_head(&tmp_ih, ih); + + if (!trans_running) { + /* vs-3050 is gone, no need to drop the path */ + retval = journal_begin(&th, inode->i_sb, jbegin_count); + if (retval) + goto out; + reiserfs_update_inode_transaction(inode); + trans_running = 1; + if (fs_changed(fs_gen, inode->i_sb) + && 
item_moved(&tmp_ih, &path)) { + reiserfs_restore_prepared_buffer(inode->i_sb, + bh); + goto research; + } + } + + reiserfs_prepare_for_journal(inode->i_sb, bh, 1); + + if (fs_changed(fs_gen, inode->i_sb) + && item_moved(&tmp_ih, &path)) { + reiserfs_restore_prepared_buffer(inode->i_sb, bh); + goto research; + } + + memcpy(ih_item_body(bh, ih) + pos_in_item, p + bytes_copied, + copy_size); + + journal_mark_dirty(&th, bh); + bytes_copied += copy_size; + set_block_dev_mapped(bh_result, 0, inode); + + /* are there still bytes left? */ + if (bytes_copied < bh_result->b_size && + (byte_offset + bytes_copied) < inode->i_size) { + set_cpu_key_k_offset(&key, + cpu_key_k_offset(&key) + + copy_size); + goto research; + } + } else { + reiserfs_warning(inode->i_sb, "clm-6003", + "bad item inode %lu", inode->i_ino); + retval = -EIO; + goto out; + } + retval = 0; + +out: + pathrelse(&path); + if (trans_running) { + int err = journal_end(&th); + if (err) + retval = err; + trans_running = 0; + } + reiserfs_write_unlock(inode->i_sb); + + /* this is where we fill in holes in the file. */ + if (use_get_block) { + retval = reiserfs_get_block(inode, block, bh_result, + GET_BLOCK_CREATE | GET_BLOCK_NO_IMUX + | GET_BLOCK_NO_DANGLE); + if (!retval) { + if (!buffer_mapped(bh_result) + || bh_result->b_blocknr == 0) { + /* get_block failed to find a mapped unformatted node. */ + use_get_block = 0; + goto start_over; + } + } + } + kunmap(bh_result->b_page); + + if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) { + /* + * we've copied data from the page into the direct item, so the + * buffer in the page is now clean, mark it to reflect that. + */ + lock_buffer(bh_result); + clear_buffer_dirty(bh_result); + unlock_buffer(bh_result); + } + return retval; +} + +/* + * mason@suse.com: updated in 2.5.54 to follow the same general io + * start/recovery path as __block_write_full_page, along with special + * code to handle reiserfs tails. 
+ */ +static int reiserfs_write_full_page(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT; + int error = 0; + unsigned long block; + sector_t last_block; + struct buffer_head *head, *bh; + int partial = 0; + int nr = 0; + int checked = PageChecked(page); + struct reiserfs_transaction_handle th; + struct super_block *s = inode->i_sb; + int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize; + th.t_trans_id = 0; + + /* no logging allowed when nonblocking or from PF_MEMALLOC */ + if (checked && (current->flags & PF_MEMALLOC)) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + + /* + * The page dirty bit is cleared before writepage is called, which + * means we have to tell create_empty_buffers to make dirty buffers + * The page really should be up to date at this point, so tossing + * in the BH_Uptodate is just a sanity check. + */ + if (!page_has_buffers(page)) { + create_empty_buffers(page, s->s_blocksize, + (1 << BH_Dirty) | (1 << BH_Uptodate)); + } + head = page_buffers(page); + + /* + * last page in the file, zero out any contents past the + * last byte in the file + */ + if (page->index >= end_index) { + unsigned last_offset; + + last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1); + /* no file contents in this page */ + if (page->index >= end_index + 1 || !last_offset) { + unlock_page(page); + return 0; + } + zero_user_segment(page, last_offset, PAGE_CACHE_SIZE); + } + bh = head; + block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits); + last_block = (i_size_read(inode) - 1) >> inode->i_blkbits; + /* first map all the buffers, logging any direct items we find */ + do { + if (block > last_block) { + /* + * This can happen when the block size is less than + * the page size. 
The corresponding bytes in the page + * were zero filled above + */ + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + } else if ((checked || buffer_dirty(bh)) && + (!buffer_mapped(bh) || (buffer_mapped(bh) + && bh->b_blocknr == + 0))) { + /* + * not mapped yet, or it points to a direct item, search + * the btree for the mapping info, and log any direct + * items found + */ + if ((error = map_block_for_writepage(inode, bh, block))) { + goto fail; + } + } + bh = bh->b_this_page; + block++; + } while (bh != head); + + /* + * we start the transaction after map_block_for_writepage, + * because it can create holes in the file (an unbounded operation). + * starting it here, we can make a reliable estimate for how many + * blocks we're going to log + */ + if (checked) { + ClearPageChecked(page); + reiserfs_write_lock(s); + error = journal_begin(&th, s, bh_per_page + 1); + if (error) { + reiserfs_write_unlock(s); + goto fail; + } + reiserfs_update_inode_transaction(inode); + } + /* now go through and lock any dirty buffers on the page */ + do { + get_bh(bh); + if (!buffer_mapped(bh)) + continue; + if (buffer_mapped(bh) && bh->b_blocknr == 0) + continue; + + if (checked) { + reiserfs_prepare_for_journal(s, bh, 1); + journal_mark_dirty(&th, bh); + continue; + } + /* + * from this point on, we know the buffer is mapped to a + * real block and not a direct item + */ + if (wbc->sync_mode != WB_SYNC_NONE) { + lock_buffer(bh); + } else { + if (!trylock_buffer(bh)) { + redirty_page_for_writepage(wbc, page); + continue; + } + } + if (test_clear_buffer_dirty(bh)) { + mark_buffer_async_write(bh); + } else { + unlock_buffer(bh); + } + } while ((bh = bh->b_this_page) != head); + + if (checked) { + error = journal_end(&th); + reiserfs_write_unlock(s); + if (error) + goto fail; + } + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + + /* + * since any buffer might be the only dirty buffer on the page, + * the first submit_bh can bring the page out of 
writeback. + * be careful with the buffers. + */ + do { + struct buffer_head *next = bh->b_this_page; + if (buffer_async_write(bh)) { + submit_bh(WRITE, bh); + nr++; + } + put_bh(bh); + bh = next; + } while (bh != head); + + error = 0; +done: + if (nr == 0) { + /* + * if this page only had a direct item, it is very possible for + * no io to be required without there being an error. Or, + * someone else could have locked them and sent them down the + * pipe without locking the page + */ + bh = head; + do { + if (!buffer_uptodate(bh)) { + partial = 1; + break; + } + bh = bh->b_this_page; + } while (bh != head); + if (!partial) + SetPageUptodate(page); + end_page_writeback(page); + } + return error; + +fail: + /* + * catches various errors, we need to make sure any valid dirty blocks + * get to the media. The page is currently locked and not marked for + * writeback + */ + ClearPageUptodate(page); + bh = head; + do { + get_bh(bh); + if (buffer_mapped(bh) && buffer_dirty(bh) && bh->b_blocknr) { + lock_buffer(bh); + mark_buffer_async_write(bh); + } else { + /* + * clear any dirty bits that might have come from + * getting attached to a dirty page + */ + clear_buffer_dirty(bh); + } + bh = bh->b_this_page; + } while (bh != head); + SetPageError(page); + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + do { + struct buffer_head *next = bh->b_this_page; + if (buffer_async_write(bh)) { + clear_buffer_dirty(bh); + submit_bh(WRITE, bh); + nr++; + } + put_bh(bh); + bh = next; + } while (bh != head); + goto done; +} +/* + * readpage: reads the whole page through block_read_full_page(), using + * reiserfs_get_block to map its blocks. + */ +static int reiserfs_readpage(struct file *f, struct page *page) +{ + return block_read_full_page(page, reiserfs_get_block); +} +/* + * writepage: waits in reiserfs_wait_on_write_block() and then writes the + * page with reiserfs_write_full_page(), whose result is returned. + */ +static int reiserfs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + reiserfs_wait_on_write_block(inode->i_sb); + return reiserfs_write_full_page(page, wbc); +} +/* + * Cleanup after a failed write: calls truncate_inode_pages() on the inode's + * mapping starting at i_size, then reiserfs_truncate_file() with + * update_timestamps == 0. + */ +static void reiserfs_truncate_failed_write(struct inode *inode) +{ + 
truncate_inode_pages(inode->i_mapping, inode->i_size); + reiserfs_truncate_file(inode, 0); +} +/* + * write_begin: grabs the page cache page covering pos, returns it in *pagep + * and prepares its buffers with __block_write_begin()/reiserfs_get_block. + * + * With AOP_FLAG_CONT_EXPAND and a block-aligned pos, pos is advanced by one + * and the flags are stored in *fsdata (otherwise *fsdata is NULL). + * + * Returns 0 on success, -ENOMEM if no page could be grabbed, or the error + * from preparing the page / ending the transaction. On error the page is + * unlocked and released and reiserfs_truncate_failed_write() is called. + */ +static int reiserfs_write_begin(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode; + struct page *page; + pgoff_t index; + int ret; + int old_ref = 0; + + inode = mapping->host; + *fsdata = NULL; + if (flags & AOP_FLAG_CONT_EXPAND && + (pos & (inode->i_sb->s_blocksize - 1)) == 0) { + pos ++; + *fsdata = (void *)(unsigned long)flags; + } + + index = pos >> PAGE_CACHE_SHIFT; + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + *pagep = page; + + reiserfs_wait_on_write_block(inode->i_sb); + fix_tail_page_for_writing(page); + if (reiserfs_transaction_running(inode->i_sb)) { + struct reiserfs_transaction_handle *th; + th = (struct reiserfs_transaction_handle *)current-> + journal_info; + BUG_ON(!th->t_refcount); + BUG_ON(!th->t_trans_id); + old_ref = th->t_refcount; + th->t_refcount++; + } + ret = __block_write_begin(page, pos, len, reiserfs_get_block); + if (ret && reiserfs_transaction_running(inode->i_sb)) { + struct reiserfs_transaction_handle *th = current->journal_info; + /* + * this gets a little ugly. If reiserfs_get_block returned an + * error and left a transaction running, we've got to close + * it, and we've got to free handle if it was a persistent + * transaction. + * + * But, if we had nested into an existing transaction, we need + * to just drop the ref count on the handle. + * + * If old_ref == 0, the transaction is from reiserfs_get_block, + * and it was a persistent trans. Otherwise, it was nested + * above. 
+ */ + if (th->t_refcount > old_ref) { + if (old_ref) + th->t_refcount--; + else { + int err; + reiserfs_write_lock(inode->i_sb); + err = reiserfs_end_persistent_transaction(th); + reiserfs_write_unlock(inode->i_sb); + if (err) + ret = err; + } + } + } + if (ret) { + unlock_page(page); + page_cache_release(page); + /* Truncate allocated blocks */ + reiserfs_truncate_failed_write(inode); + } + return ret; +} +/* + * Prepares bytes from..from+len of a caller-supplied page for writing with + * __block_write_begin()/reiserfs_get_block. The write lock is dropped and + * retaken (nested) around reiserfs_wait_on_write_block(). + * + * Unlike reiserfs_write_begin(), the page is neither unlocked nor released + * here on failure, and no truncate is done. + * + * Returns 0 on success or the error from preparing the page / ending the + * transaction. + */ +int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len) +{ + struct inode *inode = page->mapping->host; + int ret; + int old_ref = 0; + int depth; + + depth = reiserfs_write_unlock_nested(inode->i_sb); + reiserfs_wait_on_write_block(inode->i_sb); + reiserfs_write_lock_nested(inode->i_sb, depth); + + fix_tail_page_for_writing(page); + if (reiserfs_transaction_running(inode->i_sb)) { + struct reiserfs_transaction_handle *th; + th = (struct reiserfs_transaction_handle *)current-> + journal_info; + BUG_ON(!th->t_refcount); + BUG_ON(!th->t_trans_id); + old_ref = th->t_refcount; + th->t_refcount++; + } + + ret = __block_write_begin(page, from, len, reiserfs_get_block); + if (ret && reiserfs_transaction_running(inode->i_sb)) { + struct reiserfs_transaction_handle *th = current->journal_info; + /* + * this gets a little ugly. If reiserfs_get_block returned an + * error and left a transaction running, we've got to close + * it, and we've got to free handle if it was a persistent + * transaction. + * + * But, if we had nested into an existing transaction, we need + * to just drop the ref count on the handle. + * + * If old_ref == 0, the transaction is from reiserfs_get_block, + * and it was a persistent trans. Otherwise, it was nested + * above. 
+ */ + if (th->t_refcount > old_ref) { + if (old_ref) + th->t_refcount--; + else { + int err; + reiserfs_write_lock(inode->i_sb); + err = reiserfs_end_persistent_transaction(th); + reiserfs_write_unlock(inode->i_sb); + if (err) + ret = err; + } + } + } + return ret; + +} +/* + * bmap for the address space: forwards to generic_block_bmap() with + * reiserfs_bmap as the block-mapping callback. + */ +static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block) +{ + return generic_block_bmap(as, block, reiserfs_bmap); +} +/* + * write_end: commits the copied part of the page with + * reiserfs_commit_page(). If the write ends past i_size, i_size and the + * stat data are updated inside a one-block transaction. A transaction found + * in current->journal_info is ended with + * reiserfs_end_persistent_transaction(). The page is unlocked and released + * before returning. + * + * If fsdata carries AOP_FLAG_CONT_EXPAND, pos is advanced by one first. + * + * Returns the number of bytes copied on success, otherwise the error. + */ +static int reiserfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = page->mapping->host; + int ret = 0; + int update_sd = 0; + struct reiserfs_transaction_handle *th; + unsigned start; + bool locked = false; + + if ((unsigned long)fsdata & AOP_FLAG_CONT_EXPAND) + pos ++; + + reiserfs_wait_on_write_block(inode->i_sb); + if (reiserfs_transaction_running(inode->i_sb)) + th = current->journal_info; + else + th = NULL; + + start = pos & (PAGE_CACHE_SIZE - 1); + if (unlikely(copied < len)) { + if (!PageUptodate(page)) + copied = 0; + + page_zero_new_buffers(page, start + copied, start + len); + } + flush_dcache_page(page); + + reiserfs_commit_page(inode, page, start, start + copied); + + /* + * generic_commit_write does this for us, but does not update the + * transaction tracking stuff when the size changes. So, we have + * to do the i_size updates here. 
+ */ + if (pos + copied > inode->i_size) { + struct reiserfs_transaction_handle myth; + reiserfs_write_lock(inode->i_sb); + locked = true; + /* + * If the file has grown beyond the border where it + * can have a tail, unmark it as needing tail + * packing. Note that inode->i_size still holds the + * size from before this write at this point. + */ + if ((have_large_tails(inode->i_sb) + && inode->i_size > i_block_size(inode) * 4) + || (have_small_tails(inode->i_sb) + && inode->i_size > i_block_size(inode))) + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; + + ret = journal_begin(&myth, inode->i_sb, 1); + if (ret) + goto journal_error; + + reiserfs_update_inode_transaction(inode); + inode->i_size = pos + copied; + /* + * this will just nest into our transaction. It's important + * to use mark_inode_dirty so the inode gets pushed around on + * the dirty lists, and so that O_SYNC works as expected + */ + mark_inode_dirty(inode); + reiserfs_update_sd(&myth, inode); + update_sd = 1; + ret = journal_end(&myth); + if (ret) + goto journal_error; + } + if (th) { + if (!locked) { + reiserfs_write_lock(inode->i_sb); + locked = true; + } + if (!update_sd) + mark_inode_dirty(inode); + ret = reiserfs_end_persistent_transaction(th); + if (ret) + goto out; + } + +out: + if (locked) + reiserfs_write_unlock(inode->i_sb); + unlock_page(page); + page_cache_release(page); + + if (pos + len > inode->i_size) + reiserfs_truncate_failed_write(inode); + + return ret == 0 ? 
copied : ret; + +journal_error: + reiserfs_write_unlock(inode->i_sb); + locked = false; + if (th) { + if (!update_sd) + reiserfs_update_sd(th, inode); + ret = reiserfs_end_persistent_transaction(th); + } + goto out; +} +/* + * Commits the range from..to of page with reiserfs_commit_page(). If the end + * of that range lies past i_size, i_size and the stat data are updated inside + * a one-block transaction. A transaction found in current->journal_info is + * ended with reiserfs_end_persistent_transaction(). + * + * The write lock is dropped and retaken (nested) around + * reiserfs_wait_on_write_block(). + * + * Returns 0 on success or the error from the journal calls. + */ +int reiserfs_commit_write(struct file *f, struct page *page, + unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + loff_t pos = ((loff_t) page->index << PAGE_CACHE_SHIFT) + to; + int ret = 0; + int update_sd = 0; + struct reiserfs_transaction_handle *th = NULL; + int depth; + + depth = reiserfs_write_unlock_nested(inode->i_sb); + reiserfs_wait_on_write_block(inode->i_sb); + reiserfs_write_lock_nested(inode->i_sb, depth); + + if (reiserfs_transaction_running(inode->i_sb)) { + th = current->journal_info; + } + reiserfs_commit_page(inode, page, from, to); + + /* + * generic_commit_write does this for us, but does not update the + * transaction tracking stuff when the size changes. So, we have + * to do the i_size updates here. + */ + if (pos > inode->i_size) { + struct reiserfs_transaction_handle myth; + /* + * If the file has grown beyond the border where it + * can have a tail, unmark it as needing tail + * packing. Note that inode->i_size still holds the + * size from before this write at this point. + */ + if ((have_large_tails(inode->i_sb) + && inode->i_size > i_block_size(inode) * 4) + || (have_small_tails(inode->i_sb) + && inode->i_size > i_block_size(inode))) + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; + + ret = journal_begin(&myth, inode->i_sb, 1); + if (ret) + goto journal_error; + + reiserfs_update_inode_transaction(inode); + inode->i_size = pos; + /* + * this will just nest into our transaction. 
It's important + * to use mark_inode_dirty so the inode gets pushed around + * on the dirty lists, and so that O_SYNC works as expected + */ + mark_inode_dirty(inode); + reiserfs_update_sd(&myth, inode); + update_sd = 1; + ret = journal_end(&myth); + if (ret) + goto journal_error; + } + if (th) { + if (!update_sd) + mark_inode_dirty(inode); + ret = reiserfs_end_persistent_transaction(th); + if (ret) + goto out; + } + +out: + return ret; + +journal_error: + if (th) { + if (!update_sd) + reiserfs_update_sd(th, inode); + ret = reiserfs_end_persistent_transaction(th); + } + + return ret; +} +/* + * Translates stat-data attribute bits into in-core flags. SYNC, IMMUTABLE, + * APPEND and NOATIME are set or cleared in inode->i_flags; NOTAIL is set or + * cleared as i_nopack_mask in REISERFS_I(inode)->i_flags. Does nothing + * unless reiserfs_attrs(inode->i_sb) is true. + */ +void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode) +{ + if (reiserfs_attrs(inode->i_sb)) { + if (sd_attrs & REISERFS_SYNC_FL) + inode->i_flags |= S_SYNC; + else + inode->i_flags &= ~S_SYNC; + if (sd_attrs & REISERFS_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; + if (sd_attrs & REISERFS_APPEND_FL) + inode->i_flags |= S_APPEND; + else + inode->i_flags &= ~S_APPEND; + if (sd_attrs & REISERFS_NOATIME_FL) + inode->i_flags |= S_NOATIME; + else + inode->i_flags &= ~S_NOATIME; + if (sd_attrs & REISERFS_NOTAIL_FL) + REISERFS_I(inode)->i_flags |= i_nopack_mask; + else + REISERFS_I(inode)->i_flags &= ~i_nopack_mask; + } +} +/* + * Reverse direction: sets or clears the IMMUTABLE, SYNC, NOATIME and NOTAIL + * bits in *sd_attrs from the in-core flags; all other bits of *sd_attrs are + * left untouched. Does nothing unless reiserfs_attrs(inode->i_sb) is true. + * + * NOTE(review): unlike sd_attrs_to_i_attrs(), S_APPEND is not copied back + * to REISERFS_APPEND_FL here - confirm that this is intended. + */ +void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs) +{ + if (reiserfs_attrs(inode->i_sb)) { + if (inode->i_flags & S_IMMUTABLE) + *sd_attrs |= REISERFS_IMMUTABLE_FL; + else + *sd_attrs &= ~REISERFS_IMMUTABLE_FL; + if (inode->i_flags & S_SYNC) + *sd_attrs |= REISERFS_SYNC_FL; + else + *sd_attrs &= ~REISERFS_SYNC_FL; + if (inode->i_flags & S_NOATIME) + *sd_attrs |= REISERFS_NOATIME_FL; + else + *sd_attrs &= ~REISERFS_NOATIME_FL; + if (REISERFS_I(inode)->i_flags & i_nopack_mask) + *sd_attrs |= REISERFS_NOTAIL_FL; + else + *sd_attrs &= ~REISERFS_NOTAIL_FL; + } +} + +/* + * decide if this buffer needs to stay around for data logging or ordered + * write purposes + */ +static int invalidatepage_can_drop(struct 
inode *inode, struct buffer_head *bh) +{ + int ret = 1; + struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); + + lock_buffer(bh); + spin_lock(&j->j_dirty_buffers_lock); + if (!buffer_mapped(bh)) { + goto free_jh; + } + /* + * the page is locked, and the only places that log a data buffer + * also lock the page. + */ + if (reiserfs_file_data_log(inode)) { + /* + * very conservative, leave the buffer pinned if + * anyone might need it. + */ + if (buffer_journaled(bh) || buffer_journal_dirty(bh)) { + ret = 0; + } + } else if (buffer_dirty(bh)) { + struct reiserfs_journal_list *jl; + struct reiserfs_jh *jh = bh->b_private; + + /* + * why is this safe? + * reiserfs_setattr updates i_size in the on disk + * stat data before allowing vmtruncate to be called. + * + * If buffer was put onto the ordered list for this + * transaction, we know for sure either this transaction + * or an older one already has updated i_size on disk, + * and this ordered data won't be referenced in the file + * if we crash. 
+ * + * if the buffer was put onto the ordered list for an older + * transaction, we need to leave it around + */ + if (jh && (jl = jh->jl) + && jl != SB_JOURNAL(inode->i_sb)->j_current_jl) + ret = 0; + } +free_jh: + if (ret && bh->b_private) { + reiserfs_free_jh(bh); + } + spin_unlock(&j->j_dirty_buffers_lock); + unlock_buffer(bh); + return ret; +} + +/* clm -- taken from fs/buffer.c:block_invalidate_page */ +static void reiserfs_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) +{ + struct buffer_head *head, *bh, *next; + struct inode *inode = page->mapping->host; + unsigned int curr_off = 0; + unsigned int stop = offset + length; + int partial_page = (offset || length < PAGE_CACHE_SIZE); + int ret = 1; + + BUG_ON(!PageLocked(page)); + + if (!partial_page) + ClearPageChecked(page); + + if (!page_has_buffers(page)) + goto out; + + head = page_buffers(page); + bh = head; + do { + unsigned int next_off = curr_off + bh->b_size; + next = bh->b_this_page; + + if (next_off > stop) + goto out; + + /* + * is this block fully invalidated? + */ + if (offset <= curr_off) { + if (invalidatepage_can_drop(inode, bh)) + reiserfs_unmap_buffer(bh); + else + ret = 0; + } + curr_off = next_off; + bh = next; + } while (bh != head); + + /* + * We release buffers only if the entire page is being invalidated. + * The get_block cached value has been unconditionally invalidated, + * so real IO is not possible anymore. + */ + if (!partial_page && ret) { + ret = try_to_release_page(page, 0); + /* maybe should BUG_ON(!ret); - neilb */ + } +out: + return; +} + +static int reiserfs_set_page_dirty(struct page *page) +{ + struct inode *inode = page->mapping->host; + if (reiserfs_file_data_log(inode)) { + SetPageChecked(page); + return __set_page_dirty_nobuffers(page); + } + return __set_page_dirty_buffers(page); +} + +/* + * Returns 1 if the page's buffers were dropped. The page is locked. 
+ * + * Takes j_dirty_buffers_lock to protect the b_assoc_buffers list_heads + * in the buffers at page_buffers(page). + * + * even in -o notail mode, we can't be sure an old mount without -o notail + * didn't create files with tails. + */ +static int reiserfs_releasepage(struct page *page, gfp_t unused_gfp_flags) +{ + struct inode *inode = page->mapping->host; + struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); + struct buffer_head *head; + struct buffer_head *bh; + int ret = 1; + + WARN_ON(PageChecked(page)); + spin_lock(&j->j_dirty_buffers_lock); + head = page_buffers(page); + bh = head; + do { + if (bh->b_private) { + if (!buffer_dirty(bh) && !buffer_locked(bh)) { + reiserfs_free_jh(bh); + } else { + ret = 0; + break; + } + } + bh = bh->b_this_page; + } while (bh != head); + if (ret) + ret = try_to_free_buffers(page); + spin_unlock(&j->j_dirty_buffers_lock); + return ret; +} + +/* + * We thank Mingming Cao for helping us understand in great detail what + * to do in this section of the code. + */ +static ssize_t reiserfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + size_t count = iov_iter_count(iter); + ssize_t ret; + + ret = blockdev_direct_IO(iocb, inode, iter, offset, + reiserfs_get_blocks_direct_io); + + /* + * In case of error extending write may have instantiated a few + * blocks outside i_size. Trim these off again. 
+ */ + if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { + loff_t isize = i_size_read(inode); + loff_t end = offset + count; + + if ((end > isize) && inode_newsize_ok(inode, isize) == 0) { + truncate_setsize(inode, isize); + reiserfs_vfs_truncate_file(inode); + } + } + + return ret; +} + +int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + unsigned int ia_valid; + int error; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + /* must be turned off for recursive notify_change calls */ + ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID); + + if (is_quota_modification(inode, attr)) + dquot_initialize(inode); + reiserfs_write_lock(inode->i_sb); + if (attr->ia_valid & ATTR_SIZE) { + /* + * version 2 items will be caught by the s_maxbytes check + * done for us in vmtruncate + */ + if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 && + attr->ia_size > MAX_NON_LFS) { + reiserfs_write_unlock(inode->i_sb); + error = -EFBIG; + goto out; + } + + inode_dio_wait(inode); + + /* fill in hole pointers in the expanding truncate case. 
*/ + if (attr->ia_size > inode->i_size) { + error = generic_cont_expand_simple(inode, attr->ia_size); + if (REISERFS_I(inode)->i_prealloc_count > 0) { + int err; + struct reiserfs_transaction_handle th; + /* we're changing at most 2 bitmaps, inode + super */ + err = journal_begin(&th, inode->i_sb, 4); + if (!err) { + reiserfs_discard_prealloc(&th, inode); + err = journal_end(&th); + } + if (err) + error = err; + } + if (error) { + reiserfs_write_unlock(inode->i_sb); + goto out; + } + /* + * file size is changed, ctime and mtime are + * to be updated + */ + attr->ia_valid |= (ATTR_MTIME | ATTR_CTIME); + } + } + reiserfs_write_unlock(inode->i_sb); + + if ((((attr->ia_valid & ATTR_UID) && (from_kuid(&init_user_ns, attr->ia_uid) & ~0xffff)) || + ((attr->ia_valid & ATTR_GID) && (from_kgid(&init_user_ns, attr->ia_gid) & ~0xffff))) && + (get_inode_sd_version(inode) == STAT_DATA_V1)) { + /* stat data of format v3.5 has 16 bit uid and gid */ + error = -EINVAL; + goto out; + } + + if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || + (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { + struct reiserfs_transaction_handle th; + int jbegin_count = + 2 * + (REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb) + + REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb)) + + 2; + + error = reiserfs_chown_xattrs(inode, attr); + + if (error) + return error; + + /* + * (user+group)*(old+new) structure - we count quota + * info and , inode write (sb, inode) + */ + reiserfs_write_lock(inode->i_sb); + error = journal_begin(&th, inode->i_sb, jbegin_count); + reiserfs_write_unlock(inode->i_sb); + if (error) + goto out; + error = dquot_transfer(inode, attr); + reiserfs_write_lock(inode->i_sb); + if (error) { + journal_end(&th); + reiserfs_write_unlock(inode->i_sb); + goto out; + } + + /* + * Update corresponding info in inode so that everything + * is in one transaction + */ + if (attr->ia_valid & ATTR_UID) + inode->i_uid = attr->ia_uid; + if (attr->ia_valid & ATTR_GID) + inode->i_gid = 
attr->ia_gid; + mark_inode_dirty(inode); + error = journal_end(&th); + reiserfs_write_unlock(inode->i_sb); + if (error) + goto out; + } + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + error = inode_newsize_ok(inode, attr->ia_size); + if (!error) { + /* + * Could race against reiserfs_file_release + * if called from NFS, so take tailpack mutex. + */ + mutex_lock(&REISERFS_I(inode)->tailpack); + truncate_setsize(inode, attr->ia_size); + reiserfs_truncate_file(inode, 1); + mutex_unlock(&REISERFS_I(inode)->tailpack); + } + } + + if (!error) { + setattr_copy(inode, attr); + mark_inode_dirty(inode); + } + + if (!error && reiserfs_posixacl(inode->i_sb)) { + if (attr->ia_valid & ATTR_MODE) + error = reiserfs_acl_chmod(inode); + } + +out: + return error; +} + +const struct address_space_operations reiserfs_address_space_operations = { + .writepage = reiserfs_writepage, + .readpage = reiserfs_readpage, + .readpages = reiserfs_readpages, + .releasepage = reiserfs_releasepage, + .invalidatepage = reiserfs_invalidatepage, + .write_begin = reiserfs_write_begin, + .write_end = reiserfs_write_end, + .bmap = reiserfs_aop_bmap, + .direct_IO = reiserfs_direct_IO, + .set_page_dirty = reiserfs_set_page_dirty, +}; diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c new file mode 100644 index 000000000..6ec8a30a0 --- /dev/null +++ b/fs/reiserfs/ioctl.c @@ -0,0 +1,230 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include +#include +#include +#include "reiserfs.h" +#include +#include +#include +#include + +/* + * reiserfs_ioctl - handler for ioctl for inode + * supported commands: + * 1) REISERFS_IOC_UNPACK - try to unpack tail from direct item into indirect + * and prevent packing file (argument arg has t + * be non-zero) + * 2) REISERFS_IOC_[GS]ETFLAGS, REISERFS_IOC_[GS]ETVERSION + * 3) That's all for a while ... 
+ */ +long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + unsigned int flags; + int err = 0; + + reiserfs_write_lock(inode->i_sb); + + switch (cmd) { + case REISERFS_IOC_UNPACK: + if (S_ISREG(inode->i_mode)) { + if (arg) + err = reiserfs_unpack(inode, filp); + } else + err = -ENOTTY; + break; + /* + * following two cases are taken from fs/ext2/ioctl.c by Remy + * Card (card@masi.ibp.fr) + */ + case REISERFS_IOC_GETFLAGS: + if (!reiserfs_attrs(inode->i_sb)) { + err = -ENOTTY; + break; + } + + flags = REISERFS_I(inode)->i_attrs; + i_attrs_to_sd_attrs(inode, (__u16 *) & flags); + err = put_user(flags, (int __user *)arg); + break; + case REISERFS_IOC_SETFLAGS:{ + if (!reiserfs_attrs(inode->i_sb)) { + err = -ENOTTY; + break; + } + + err = mnt_want_write_file(filp); + if (err) + break; + + if (!inode_owner_or_capable(inode)) { + err = -EPERM; + goto setflags_out; + } + if (get_user(flags, (int __user *)arg)) { + err = -EFAULT; + goto setflags_out; + } + /* + * Is it quota file? 
Do not allow user to mess with it + */ + if (IS_NOQUOTA(inode)) { + err = -EPERM; + goto setflags_out; + } + if (((flags ^ REISERFS_I(inode)-> + i_attrs) & (REISERFS_IMMUTABLE_FL | + REISERFS_APPEND_FL)) + && !capable(CAP_LINUX_IMMUTABLE)) { + err = -EPERM; + goto setflags_out; + } + if ((flags & REISERFS_NOTAIL_FL) && + S_ISREG(inode->i_mode)) { + int result; + + result = reiserfs_unpack(inode, filp); + if (result) { + err = result; + goto setflags_out; + } + } + sd_attrs_to_i_attrs(flags, inode); + REISERFS_I(inode)->i_attrs = flags; + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); +setflags_out: + mnt_drop_write_file(filp); + break; + } + case REISERFS_IOC_GETVERSION: + err = put_user(inode->i_generation, (int __user *)arg); + break; + case REISERFS_IOC_SETVERSION: + if (!inode_owner_or_capable(inode)) { + err = -EPERM; + break; + } + err = mnt_want_write_file(filp); + if (err) + break; + if (get_user(inode->i_generation, (int __user *)arg)) { + err = -EFAULT; + goto setversion_out; + } + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); +setversion_out: + mnt_drop_write_file(filp); + break; + default: + err = -ENOTTY; + } + + reiserfs_write_unlock(inode->i_sb); + + return err; +} + +#ifdef CONFIG_COMPAT +long reiserfs_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + /* + * These are just misnamed, they actually + * get/put from/to user an int + */ + switch (cmd) { + case REISERFS_IOC32_UNPACK: + cmd = REISERFS_IOC_UNPACK; + break; + case REISERFS_IOC32_GETFLAGS: + cmd = REISERFS_IOC_GETFLAGS; + break; + case REISERFS_IOC32_SETFLAGS: + cmd = REISERFS_IOC_SETFLAGS; + break; + case REISERFS_IOC32_GETVERSION: + cmd = REISERFS_IOC_GETVERSION; + break; + case REISERFS_IOC32_SETVERSION: + cmd = REISERFS_IOC_SETVERSION; + break; + default: + return -ENOIOCTLCMD; + } + + return reiserfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); +} +#endif + +int reiserfs_commit_write(struct file *f, struct page *page, + 
unsigned from, unsigned to); +/* + * reiserfs_unpack + * Function try to convert tail from direct item into indirect. + * It set up nopack attribute in the REISERFS_I(inode)->nopack + */ +int reiserfs_unpack(struct inode *inode, struct file *filp) +{ + int retval = 0; + int index; + struct page *page; + struct address_space *mapping; + unsigned long write_from; + unsigned long blocksize = inode->i_sb->s_blocksize; + + if (inode->i_size == 0) { + REISERFS_I(inode)->i_flags |= i_nopack_mask; + return 0; + } + /* ioctl already done */ + if (REISERFS_I(inode)->i_flags & i_nopack_mask) { + return 0; + } + + /* we need to make sure nobody is changing the file size beneath us */ + reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb); + + reiserfs_write_lock(inode->i_sb); + + write_from = inode->i_size & (blocksize - 1); + /* if we are on a block boundary, we are already unpacked. */ + if (write_from == 0) { + REISERFS_I(inode)->i_flags |= i_nopack_mask; + goto out; + } + + /* + * we unpack by finding the page with the tail, and calling + * __reiserfs_write_begin on that page. This will force a + * reiserfs_get_block to unpack the tail for us. 
+ */ + index = inode->i_size >> PAGE_CACHE_SHIFT; + mapping = inode->i_mapping; + page = grab_cache_page(mapping, index); + retval = -ENOMEM; + if (!page) { + goto out; + } + retval = __reiserfs_write_begin(page, write_from, 0); + if (retval) + goto out_unlock; + + /* conversion can change page contents, must flush */ + flush_dcache_page(page); + retval = reiserfs_commit_write(NULL, page, write_from, write_from); + REISERFS_I(inode)->i_flags |= i_nopack_mask; + +out_unlock: + unlock_page(page); + page_cache_release(page); + +out: + mutex_unlock(&inode->i_mutex); + reiserfs_write_unlock(inode->i_sb); + return retval; +} diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c new file mode 100644 index 000000000..aca73dd73 --- /dev/null +++ b/fs/reiserfs/item_ops.c @@ -0,0 +1,752 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include +#include "reiserfs.h" + +/* + * this contains item handlers for old item types: sd, direct, + * indirect, directory + */ + +/* + * and where are the comments? how about saying where we can find an + * explanation of each item handler method? 
-Hans + */ + +/* stat data functions */ +static int sd_bytes_number(struct item_head *ih, int block_size) +{ + return 0; +} + +static void sd_decrement_key(struct cpu_key *key) +{ + key->on_disk_key.k_objectid--; + set_cpu_key_k_type(key, TYPE_ANY); + set_cpu_key_k_offset(key, (loff_t)(~0ULL >> 1)); +} + +static int sd_is_left_mergeable(struct reiserfs_key *key, unsigned long bsize) +{ + return 0; +} + +static char *print_time(time_t t) +{ + static char timebuf[256]; + + sprintf(timebuf, "%ld", t); + return timebuf; +} + +static void sd_print_item(struct item_head *ih, char *item) +{ + printk("\tmode | size | nlinks | first direct | mtime\n"); + if (stat_data_v1(ih)) { + struct stat_data_v1 *sd = (struct stat_data_v1 *)item; + + printk("\t0%-6o | %6u | %2u | %d | %s\n", sd_v1_mode(sd), + sd_v1_size(sd), sd_v1_nlink(sd), + sd_v1_first_direct_byte(sd), + print_time(sd_v1_mtime(sd))); + } else { + struct stat_data *sd = (struct stat_data *)item; + + printk("\t0%-6o | %6llu | %2u | %d | %s\n", sd_v2_mode(sd), + (unsigned long long)sd_v2_size(sd), sd_v2_nlink(sd), + sd_v2_rdev(sd), print_time(sd_v2_mtime(sd))); + } +} + +static void sd_check_item(struct item_head *ih, char *item) +{ + /* unused */ +} + +static int sd_create_vi(struct virtual_node *vn, + struct virtual_item *vi, + int is_affected, int insert_size) +{ + vi->vi_index = TYPE_STAT_DATA; + return 0; +} + +static int sd_check_left(struct virtual_item *vi, int free, + int start_skip, int end_skip) +{ + BUG_ON(start_skip || end_skip); + return -1; +} + +static int sd_check_right(struct virtual_item *vi, int free) +{ + return -1; +} + +static int sd_part_size(struct virtual_item *vi, int first, int count) +{ + BUG_ON(count); + return 0; +} + +static int sd_unit_num(struct virtual_item *vi) +{ + return vi->vi_item_len - IH_SIZE; +} + +static void sd_print_vi(struct virtual_item *vi) +{ + reiserfs_warning(NULL, "reiserfs-16100", + "STATDATA, index %d, type 0x%x, %h", + vi->vi_index, vi->vi_type, vi->vi_ih); +} + 
+static struct item_operations stat_data_ops = { + .bytes_number = sd_bytes_number, + .decrement_key = sd_decrement_key, + .is_left_mergeable = sd_is_left_mergeable, + .print_item = sd_print_item, + .check_item = sd_check_item, + + .create_vi = sd_create_vi, + .check_left = sd_check_left, + .check_right = sd_check_right, + .part_size = sd_part_size, + .unit_num = sd_unit_num, + .print_vi = sd_print_vi +}; + +/* direct item functions */ +static int direct_bytes_number(struct item_head *ih, int block_size) +{ + return ih_item_len(ih); +} + +/* FIXME: this should probably switch to indirect as well */ +static void direct_decrement_key(struct cpu_key *key) +{ + cpu_key_k_offset_dec(key); + if (cpu_key_k_offset(key) == 0) + set_cpu_key_k_type(key, TYPE_STAT_DATA); +} + +static int direct_is_left_mergeable(struct reiserfs_key *key, + unsigned long bsize) +{ + int version = le_key_version(key); + return ((le_key_k_offset(version, key) & (bsize - 1)) != 1); +} + +static void direct_print_item(struct item_head *ih, char *item) +{ + int j = 0; + +/* return; */ + printk("\""); + while (j < ih_item_len(ih)) + printk("%c", item[j++]); + printk("\"\n"); +} + +static void direct_check_item(struct item_head *ih, char *item) +{ + /* unused */ +} + +static int direct_create_vi(struct virtual_node *vn, + struct virtual_item *vi, + int is_affected, int insert_size) +{ + vi->vi_index = TYPE_DIRECT; + return 0; +} + +static int direct_check_left(struct virtual_item *vi, int free, + int start_skip, int end_skip) +{ + int bytes; + + bytes = free - free % 8; + return bytes ? 
: -1; +} + +static int direct_check_right(struct virtual_item *vi, int free) +{ + return direct_check_left(vi, free, 0, 0); +} + +static int direct_part_size(struct virtual_item *vi, int first, int count) +{ + return count; +} + +static int direct_unit_num(struct virtual_item *vi) +{ + return vi->vi_item_len - IH_SIZE; +} + +static void direct_print_vi(struct virtual_item *vi) +{ + reiserfs_warning(NULL, "reiserfs-16101", + "DIRECT, index %d, type 0x%x, %h", + vi->vi_index, vi->vi_type, vi->vi_ih); +} + +static struct item_operations direct_ops = { + .bytes_number = direct_bytes_number, + .decrement_key = direct_decrement_key, + .is_left_mergeable = direct_is_left_mergeable, + .print_item = direct_print_item, + .check_item = direct_check_item, + + .create_vi = direct_create_vi, + .check_left = direct_check_left, + .check_right = direct_check_right, + .part_size = direct_part_size, + .unit_num = direct_unit_num, + .print_vi = direct_print_vi +}; + +/* indirect item functions */ +static int indirect_bytes_number(struct item_head *ih, int block_size) +{ + return ih_item_len(ih) / UNFM_P_SIZE * block_size; +} + +/* decrease offset, if it becomes 0, change type to stat data */ +static void indirect_decrement_key(struct cpu_key *key) +{ + cpu_key_k_offset_dec(key); + if (cpu_key_k_offset(key) == 0) + set_cpu_key_k_type(key, TYPE_STAT_DATA); +} + +/* if it is not first item of the body, then it is mergeable */ +static int indirect_is_left_mergeable(struct reiserfs_key *key, + unsigned long bsize) +{ + int version = le_key_version(key); + return (le_key_k_offset(version, key) != 1); +} + +/* printing of indirect item */ +static void start_new_sequence(__u32 * start, int *len, __u32 new) +{ + *start = new; + *len = 1; +} + +static int sequence_finished(__u32 start, int *len, __u32 new) +{ + if (start == INT_MAX) + return 1; + + if (start == 0 && new == 0) { + (*len)++; + return 0; + } + if (start != 0 && (start + *len) == new) { + (*len)++; + return 0; + } + return 1; +} + 
+static void print_sequence(__u32 start, int len) +{ + if (start == INT_MAX) + return; + + if (len == 1) + printk(" %d", start); + else + printk(" %d(%d)", start, len); +} + +static void indirect_print_item(struct item_head *ih, char *item) +{ + int j; + __le32 *unp; + __u32 prev = INT_MAX; + int num = 0; + + unp = (__le32 *) item; + + if (ih_item_len(ih) % UNFM_P_SIZE) + reiserfs_warning(NULL, "reiserfs-16102", "invalid item len"); + + printk("%d pointers\n[ ", (int)I_UNFM_NUM(ih)); + for (j = 0; j < I_UNFM_NUM(ih); j++) { + if (sequence_finished(prev, &num, get_block_num(unp, j))) { + print_sequence(prev, num); + start_new_sequence(&prev, &num, get_block_num(unp, j)); + } + } + print_sequence(prev, num); + printk("]\n"); +} + +static void indirect_check_item(struct item_head *ih, char *item) +{ + /* unused */ +} + +static int indirect_create_vi(struct virtual_node *vn, + struct virtual_item *vi, + int is_affected, int insert_size) +{ + vi->vi_index = TYPE_INDIRECT; + return 0; +} + +static int indirect_check_left(struct virtual_item *vi, int free, + int start_skip, int end_skip) +{ + int bytes; + + bytes = free - free % UNFM_P_SIZE; + return bytes ? : -1; +} + +static int indirect_check_right(struct virtual_item *vi, int free) +{ + return indirect_check_left(vi, free, 0, 0); +} + +/* + * return size in bytes of 'units' units. 
If first == 0 - calculate + * from the head (left), otherwise - from tail (right) + */ +static int indirect_part_size(struct virtual_item *vi, int first, int units) +{ + /* unit of indirect item is byte (yet) */ + return units; +} + +static int indirect_unit_num(struct virtual_item *vi) +{ + /* unit of indirect item is byte (yet) */ + return vi->vi_item_len - IH_SIZE; +} + +static void indirect_print_vi(struct virtual_item *vi) +{ + reiserfs_warning(NULL, "reiserfs-16103", + "INDIRECT, index %d, type 0x%x, %h", + vi->vi_index, vi->vi_type, vi->vi_ih); +} + +static struct item_operations indirect_ops = { + .bytes_number = indirect_bytes_number, + .decrement_key = indirect_decrement_key, + .is_left_mergeable = indirect_is_left_mergeable, + .print_item = indirect_print_item, + .check_item = indirect_check_item, + + .create_vi = indirect_create_vi, + .check_left = indirect_check_left, + .check_right = indirect_check_right, + .part_size = indirect_part_size, + .unit_num = indirect_unit_num, + .print_vi = indirect_print_vi +}; + +/* direntry functions */ +static int direntry_bytes_number(struct item_head *ih, int block_size) +{ + reiserfs_warning(NULL, "vs-16090", + "bytes number is asked for direntry"); + return 0; +} + +static void direntry_decrement_key(struct cpu_key *key) +{ + cpu_key_k_offset_dec(key); + if (cpu_key_k_offset(key) == 0) + set_cpu_key_k_type(key, TYPE_STAT_DATA); +} + +static int direntry_is_left_mergeable(struct reiserfs_key *key, + unsigned long bsize) +{ + if (le32_to_cpu(key->u.k_offset_v1.k_offset) == DOT_OFFSET) + return 0; + return 1; + +} + +static void direntry_print_item(struct item_head *ih, char *item) +{ + int i; + int namelen; + struct reiserfs_de_head *deh; + char *name; + static char namebuf[80]; + + printk("\n # %-15s%-30s%-15s%-15s%-15s\n", "Name", + "Key of pointed object", "Hash", "Gen number", "Status"); + + deh = (struct reiserfs_de_head *)item; + + for (i = 0; i < ih_entry_count(ih); i++, deh++) { + namelen = + (i ? 
(deh_location(deh - 1)) : ih_item_len(ih)) - + deh_location(deh); + name = item + deh_location(deh); + if (name[namelen - 1] == 0) + namelen = strlen(name); + namebuf[0] = '"'; + if (namelen > sizeof(namebuf) - 3) { + strncpy(namebuf + 1, name, sizeof(namebuf) - 3); + namebuf[sizeof(namebuf) - 2] = '"'; + namebuf[sizeof(namebuf) - 1] = 0; + } else { + memcpy(namebuf + 1, name, namelen); + namebuf[namelen + 1] = '"'; + namebuf[namelen + 2] = 0; + } + + printk("%d: %-15s%-15d%-15d%-15lld%-15lld(%s)\n", + i, namebuf, + deh_dir_id(deh), deh_objectid(deh), + GET_HASH_VALUE(deh_offset(deh)), + GET_GENERATION_NUMBER((deh_offset(deh))), + (de_hidden(deh)) ? "HIDDEN" : "VISIBLE"); + } +} + +static void direntry_check_item(struct item_head *ih, char *item) +{ + int i; + struct reiserfs_de_head *deh; + + /* unused */ + deh = (struct reiserfs_de_head *)item; + for (i = 0; i < ih_entry_count(ih); i++, deh++) { + ; + } +} + +#define DIRENTRY_VI_FIRST_DIRENTRY_ITEM 1 + +/* + * function returns old entry number in directory item in real node + * using new entry number in virtual item in virtual node + */ +static inline int old_entry_num(int is_affected, int virtual_entry_num, + int pos_in_item, int mode) +{ + if (mode == M_INSERT || mode == M_DELETE) + return virtual_entry_num; + + if (!is_affected) + /* cut or paste is applied to another item */ + return virtual_entry_num; + + if (virtual_entry_num < pos_in_item) + return virtual_entry_num; + + if (mode == M_CUT) + return virtual_entry_num + 1; + + RFALSE(mode != M_PASTE || virtual_entry_num == 0, + "vs-8015: old_entry_num: mode must be M_PASTE (mode = \'%c\'", + mode); + + return virtual_entry_num - 1; +} + +/* + * Create an array of sizes of directory entries for virtual + * item. Return space used by an item. 
FIXME: no control over + * consuming of space used by this item handler + */ +static int direntry_create_vi(struct virtual_node *vn, + struct virtual_item *vi, + int is_affected, int insert_size) +{ + struct direntry_uarea *dir_u = vi->vi_uarea; + int i, j; + int size = sizeof(struct direntry_uarea); + struct reiserfs_de_head *deh; + + vi->vi_index = TYPE_DIRENTRY; + + BUG_ON(!(vi->vi_ih) || !vi->vi_item); + + dir_u->flags = 0; + if (le_ih_k_offset(vi->vi_ih) == DOT_OFFSET) + dir_u->flags |= DIRENTRY_VI_FIRST_DIRENTRY_ITEM; + + deh = (struct reiserfs_de_head *)(vi->vi_item); + + /* virtual directory item have this amount of entry after */ + dir_u->entry_count = ih_entry_count(vi->vi_ih) + + ((is_affected) ? ((vn->vn_mode == M_CUT) ? -1 : + (vn->vn_mode == M_PASTE ? 1 : 0)) : 0); + + for (i = 0; i < dir_u->entry_count; i++) { + j = old_entry_num(is_affected, i, vn->vn_pos_in_item, + vn->vn_mode); + dir_u->entry_sizes[i] = + (j ? deh_location(&deh[j - 1]) : ih_item_len(vi->vi_ih)) - + deh_location(&deh[j]) + DEH_SIZE; + } + + size += (dir_u->entry_count * sizeof(short)); + + /* set size of pasted entry */ + if (is_affected && vn->vn_mode == M_PASTE) + dir_u->entry_sizes[vn->vn_pos_in_item] = insert_size; + +#ifdef CONFIG_REISERFS_CHECK + /* compare total size of entries with item length */ + { + int k, l; + + l = 0; + for (k = 0; k < dir_u->entry_count; k++) + l += dir_u->entry_sizes[k]; + + if (l + IH_SIZE != vi->vi_item_len + + ((is_affected + && (vn->vn_mode == M_PASTE + || vn->vn_mode == M_CUT)) ? 
insert_size : 0)) { + reiserfs_panic(NULL, "vs-8025", "(mode==%c, " + "insert_size==%d), invalid length of " + "directory item", + vn->vn_mode, insert_size); + } + } +#endif + + return size; + +} + +/* + * return number of entries which may fit into specified amount of + * free space, or -1 if free space is not enough even for 1 entry + */ +static int direntry_check_left(struct virtual_item *vi, int free, + int start_skip, int end_skip) +{ + int i; + int entries = 0; + struct direntry_uarea *dir_u = vi->vi_uarea; + + for (i = start_skip; i < dir_u->entry_count - end_skip; i++) { + /* i-th entry doesn't fit into the remaining free space */ + if (dir_u->entry_sizes[i] > free) + break; + + free -= dir_u->entry_sizes[i]; + entries++; + } + + if (entries == dir_u->entry_count) { + reiserfs_panic(NULL, "item_ops-1", + "free space %d, entry_count %d", free, + dir_u->entry_count); + } + + /* "." and ".." can not be separated from each other */ + if (start_skip == 0 && (dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM) + && entries < 2) + entries = 0; + + return entries ? : -1; +} + +static int direntry_check_right(struct virtual_item *vi, int free) +{ + int i; + int entries = 0; + struct direntry_uarea *dir_u = vi->vi_uarea; + + for (i = dir_u->entry_count - 1; i >= 0; i--) { + /* i-th entry doesn't fit into the remaining free space */ + if (dir_u->entry_sizes[i] > free) + break; + + free -= dir_u->entry_sizes[i]; + entries++; + } + BUG_ON(entries == dir_u->entry_count); + + /* "." and ".." can not be separated from each other */ + if ((dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM) + && entries > dir_u->entry_count - 2) + entries = dir_u->entry_count - 2; + + return entries ? 
: -1; +} + +/* sum of entry sizes between from-th and to-th entries including both edges */ +static int direntry_part_size(struct virtual_item *vi, int first, int count) +{ + int i, retval; + int from, to; + struct direntry_uarea *dir_u = vi->vi_uarea; + + retval = 0; + if (first == 0) + from = 0; + else + from = dir_u->entry_count - count; + to = from + count - 1; + + for (i = from; i <= to; i++) + retval += dir_u->entry_sizes[i]; + + return retval; +} + +static int direntry_unit_num(struct virtual_item *vi) +{ + struct direntry_uarea *dir_u = vi->vi_uarea; + + return dir_u->entry_count; +} + +static void direntry_print_vi(struct virtual_item *vi) +{ + int i; + struct direntry_uarea *dir_u = vi->vi_uarea; + + reiserfs_warning(NULL, "reiserfs-16104", + "DIRENTRY, index %d, type 0x%x, %h, flags 0x%x", + vi->vi_index, vi->vi_type, vi->vi_ih, dir_u->flags); + printk("%d entries: ", dir_u->entry_count); + for (i = 0; i < dir_u->entry_count; i++) + printk("%d ", dir_u->entry_sizes[i]); + printk("\n"); +} + +static struct item_operations direntry_ops = { + .bytes_number = direntry_bytes_number, + .decrement_key = direntry_decrement_key, + .is_left_mergeable = direntry_is_left_mergeable, + .print_item = direntry_print_item, + .check_item = direntry_check_item, + + .create_vi = direntry_create_vi, + .check_left = direntry_check_left, + .check_right = direntry_check_right, + .part_size = direntry_part_size, + .unit_num = direntry_unit_num, + .print_vi = direntry_print_vi +}; + +/* Error catching functions to catch errors caused by incorrect item types. 
*/ +static int errcatch_bytes_number(struct item_head *ih, int block_size) +{ + reiserfs_warning(NULL, "green-16001", + "Invalid item type observed, run fsck ASAP"); + return 0; +} + +static void errcatch_decrement_key(struct cpu_key *key) +{ + reiserfs_warning(NULL, "green-16002", + "Invalid item type observed, run fsck ASAP"); +} + +static int errcatch_is_left_mergeable(struct reiserfs_key *key, + unsigned long bsize) +{ + reiserfs_warning(NULL, "green-16003", + "Invalid item type observed, run fsck ASAP"); + return 0; +} + +static void errcatch_print_item(struct item_head *ih, char *item) +{ + reiserfs_warning(NULL, "green-16004", + "Invalid item type observed, run fsck ASAP"); +} + +static void errcatch_check_item(struct item_head *ih, char *item) +{ + reiserfs_warning(NULL, "green-16005", + "Invalid item type observed, run fsck ASAP"); +} + +static int errcatch_create_vi(struct virtual_node *vn, + struct virtual_item *vi, + int is_affected, int insert_size) +{ + reiserfs_warning(NULL, "green-16006", + "Invalid item type observed, run fsck ASAP"); + /* + * We might return -1 here as well, but it won't help as + * create_virtual_node() from where this operation is called + * from is of return type void. 
+ */ + return 0; +} + +static int errcatch_check_left(struct virtual_item *vi, int free, + int start_skip, int end_skip) +{ + reiserfs_warning(NULL, "green-16007", + "Invalid item type observed, run fsck ASAP"); + return -1; +} + +static int errcatch_check_right(struct virtual_item *vi, int free) +{ + reiserfs_warning(NULL, "green-16008", + "Invalid item type observed, run fsck ASAP"); + return -1; +} + +static int errcatch_part_size(struct virtual_item *vi, int first, int count) +{ + reiserfs_warning(NULL, "green-16009", + "Invalid item type observed, run fsck ASAP"); + return 0; +} + +static int errcatch_unit_num(struct virtual_item *vi) +{ + reiserfs_warning(NULL, "green-16010", + "Invalid item type observed, run fsck ASAP"); + return 0; +} + +static void errcatch_print_vi(struct virtual_item *vi) +{ + reiserfs_warning(NULL, "green-16011", + "Invalid item type observed, run fsck ASAP"); +} + +static struct item_operations errcatch_ops = { + errcatch_bytes_number, + errcatch_decrement_key, + errcatch_is_left_mergeable, + errcatch_print_item, + errcatch_check_item, + + errcatch_create_vi, + errcatch_check_left, + errcatch_check_right, + errcatch_part_size, + errcatch_unit_num, + errcatch_print_vi +}; + +#if ! (TYPE_STAT_DATA == 0 && TYPE_INDIRECT == 1 && TYPE_DIRECT == 2 && TYPE_DIRENTRY == 3) +#error Item types must use disk-format assigned values. +#endif + +struct item_operations *item_ops[TYPE_ANY + 1] = { + &stat_data_ops, + &indirect_ops, + &direct_ops, + &direntry_ops, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + &errcatch_ops /* This is to catch errors with invalid type (15th entry for TYPE_ANY) */ +}; diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c new file mode 100644 index 000000000..9d6486d41 --- /dev/null +++ b/fs/reiserfs/journal.c @@ -0,0 +1,4403 @@ +/* + * Write ahead logging implementation copyright Chris Mason 2000 + * + * The background commits make this code very interrelated, and + * overly complex. 
I need to rethink things a bit....The major players:
+ *
+ * journal_begin -- call with the number of blocks you expect to log.
+ *                  If the current transaction is too
+ *                  old, it will block until the current transaction is
+ *                  finished, and then start a new one.
+ *                  Usually, your transaction will get joined in with
+ *                  previous ones for speed.
+ *
+ * journal_join  -- same as journal_begin, but won't block on the current
+ *                  transaction regardless of age.  Don't ever call
+ *                  this.  Ever.  There are only two places it should be
+ *                  called from, and they are both inside this file.
+ *
+ * journal_mark_dirty -- adds blocks into this transaction.  clears any flags
+ *                       that might make them get sent to disk
+ *                       and then marks them BH_JDirty.  Puts the buffer head
+ *                       into the current transaction hash.
+ *
+ * journal_end -- if the current transaction is batchable, it does nothing
+ *                   otherwise, it could do an async/synchronous commit, or
+ *                   a full flush of all log and real blocks in the
+ *                   transaction.
+ *
+ * flush_old_commits -- if the current transaction is too old, it is ended and
+ *                      commit blocks are sent to disk.  Forces commit blocks
+ *                      to disk for all backgrounded commits that have been
+ *                      around too long.
+ *                   -- Note, if you call this as an immediate flush
+ *                      from within kupdate, it will ignore the immediate flag
+ */
+
+/*
+ * NOTE(review): every system #include below is missing its header name
+ * (only "reiserfs.h" survived).  Restore the names from the original
+ * journal.c before trying to build this - TODO confirm against upstream.
+ */
+#include
+#include
+#include
+#include "reiserfs.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+/* gets a struct reiserfs_journal_list * from a list head */
+#define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
+                               j_list))
+/* same, but for a list head that is linked through j_working_list */
+#define JOURNAL_WORK_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
+                               j_working_list))
+
+/* must be correct to keep the desc and commit structs at 4k */
+#define JOURNAL_TRANS_HALF 1018
+#define BUFNR 64		/*read ahead */
+
+/* cnode stat bits.
Move these into reiserfs_fs.h */ + +/* this block was freed, and can't be written. */ +#define BLOCK_FREED 2 +/* this block was freed during this transaction, and can't be written */ +#define BLOCK_FREED_HOLDER 3 + +/* used in flush_journal_list */ +#define BLOCK_NEEDS_FLUSH 4 +#define BLOCK_DIRTIED 5 + +/* journal list state bits */ +#define LIST_TOUCHED 1 +#define LIST_DIRTY 2 +#define LIST_COMMIT_PENDING 4 /* someone will commit this list */ + +/* flags for do_journal_end */ +#define FLUSH_ALL 1 /* flush commit and real blocks */ +#define COMMIT_NOW 2 /* end and commit this transaction */ +#define WAIT 4 /* wait for the log blocks to hit the disk */ + +static int do_journal_end(struct reiserfs_transaction_handle *, int flags); +static int flush_journal_list(struct super_block *s, + struct reiserfs_journal_list *jl, int flushall); +static int flush_commit_list(struct super_block *s, + struct reiserfs_journal_list *jl, int flushall); +static int can_dirty(struct reiserfs_journal_cnode *cn); +static int journal_join(struct reiserfs_transaction_handle *th, + struct super_block *sb); +static void release_journal_dev(struct super_block *super, + struct reiserfs_journal *journal); +static int dirty_one_transaction(struct super_block *s, + struct reiserfs_journal_list *jl); +static void flush_async_commits(struct work_struct *work); +static void queue_log_writer(struct super_block *s); + +/* values for join in do_journal_begin_r */ +enum { + JBEGIN_REG = 0, /* regular journal begin */ + /* join the running transaction if at all possible */ + JBEGIN_JOIN = 1, + /* called from cleanup code, ignores aborted flag */ + JBEGIN_ABORT = 2, +}; + +static int do_journal_begin_r(struct reiserfs_transaction_handle *th, + struct super_block *sb, + unsigned long nblocks, int join); + +static void init_journal_hash(struct super_block *sb) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + memset(journal->j_hash_table, 0, + JOURNAL_HASH_SIZE * sizeof(struct 
reiserfs_journal_cnode *)); +} + +/* + * clears BH_Dirty and sticks the buffer on the clean list. Called because + * I can't allow refile_buffer to make schedule happen after I've freed a + * block. Look at remove_from_transaction and journal_mark_freed for + * more details. + */ +static int reiserfs_clean_and_file_buffer(struct buffer_head *bh) +{ + if (bh) { + clear_buffer_dirty(bh); + clear_buffer_journal_test(bh); + } + return 0; +} + +static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block + *sb) +{ + struct reiserfs_bitmap_node *bn; + static int id; + + bn = kmalloc(sizeof(struct reiserfs_bitmap_node), GFP_NOFS); + if (!bn) { + return NULL; + } + bn->data = kzalloc(sb->s_blocksize, GFP_NOFS); + if (!bn->data) { + kfree(bn); + return NULL; + } + bn->id = id++; + INIT_LIST_HEAD(&bn->list); + return bn; +} + +static struct reiserfs_bitmap_node *get_bitmap_node(struct super_block *sb) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct reiserfs_bitmap_node *bn = NULL; + struct list_head *entry = journal->j_bitmap_nodes.next; + + journal->j_used_bitmap_nodes++; +repeat: + + if (entry != &journal->j_bitmap_nodes) { + bn = list_entry(entry, struct reiserfs_bitmap_node, list); + list_del(entry); + memset(bn->data, 0, sb->s_blocksize); + journal->j_free_bitmap_nodes--; + return bn; + } + bn = allocate_bitmap_node(sb); + if (!bn) { + yield(); + goto repeat; + } + return bn; +} +static inline void free_bitmap_node(struct super_block *sb, + struct reiserfs_bitmap_node *bn) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + journal->j_used_bitmap_nodes--; + if (journal->j_free_bitmap_nodes > REISERFS_MAX_BITMAP_NODES) { + kfree(bn->data); + kfree(bn); + } else { + list_add(&bn->list, &journal->j_bitmap_nodes); + journal->j_free_bitmap_nodes++; + } +} + +static void allocate_bitmap_nodes(struct super_block *sb) +{ + int i; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct reiserfs_bitmap_node *bn = NULL; + for (i = 0; i < 
REISERFS_MIN_BITMAP_NODES; i++) { + bn = allocate_bitmap_node(sb); + if (bn) { + list_add(&bn->list, &journal->j_bitmap_nodes); + journal->j_free_bitmap_nodes++; + } else { + /* this is ok, we'll try again when more are needed */ + break; + } + } +} + +static int set_bit_in_list_bitmap(struct super_block *sb, + b_blocknr_t block, + struct reiserfs_list_bitmap *jb) +{ + unsigned int bmap_nr = block / (sb->s_blocksize << 3); + unsigned int bit_nr = block % (sb->s_blocksize << 3); + + if (!jb->bitmaps[bmap_nr]) { + jb->bitmaps[bmap_nr] = get_bitmap_node(sb); + } + set_bit(bit_nr, (unsigned long *)jb->bitmaps[bmap_nr]->data); + return 0; +} + +static void cleanup_bitmap_list(struct super_block *sb, + struct reiserfs_list_bitmap *jb) +{ + int i; + if (jb->bitmaps == NULL) + return; + + for (i = 0; i < reiserfs_bmap_count(sb); i++) { + if (jb->bitmaps[i]) { + free_bitmap_node(sb, jb->bitmaps[i]); + jb->bitmaps[i] = NULL; + } + } +} + +/* + * only call this on FS unmount. + */ +static int free_list_bitmaps(struct super_block *sb, + struct reiserfs_list_bitmap *jb_array) +{ + int i; + struct reiserfs_list_bitmap *jb; + for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { + jb = jb_array + i; + jb->journal_list = NULL; + cleanup_bitmap_list(sb, jb); + vfree(jb->bitmaps); + jb->bitmaps = NULL; + } + return 0; +} + +static int free_bitmap_nodes(struct super_block *sb) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct list_head *next = journal->j_bitmap_nodes.next; + struct reiserfs_bitmap_node *bn; + + while (next != &journal->j_bitmap_nodes) { + bn = list_entry(next, struct reiserfs_bitmap_node, list); + list_del(next); + kfree(bn->data); + kfree(bn); + next = journal->j_bitmap_nodes.next; + journal->j_free_bitmap_nodes--; + } + + return 0; +} + +/* + * get memory for JOURNAL_NUM_BITMAPS worth of bitmaps. + * jb_array is the array to be filled in. 
+ */
+/*
+ * Each of the JOURNAL_NUM_BITMAPS entries gets a zeroed array of @bmap_nr
+ * node pointers.  If any allocation fails a warning is logged, everything
+ * is released again through free_list_bitmaps() and -1 is returned;
+ * otherwise the result is 0.
+ */
+int reiserfs_allocate_list_bitmaps(struct super_block *sb,
+				   struct reiserfs_list_bitmap *jb_array,
+				   unsigned int bmap_nr)
+{
+	int nbytes = bmap_nr * sizeof(struct reiserfs_bitmap_node *);
+	struct reiserfs_list_bitmap *slot;
+	int idx;
+
+	for (idx = 0; idx < JOURNAL_NUM_BITMAPS; idx++) {
+		slot = &jb_array[idx];
+		slot->journal_list = NULL;
+		slot->bitmaps = vzalloc(nbytes);
+		if (slot->bitmaps)
+			continue;
+
+		reiserfs_warning(sb, "clm-2000", "unable to "
+				 "allocate bitmaps for journal lists");
+		free_list_bitmaps(sb, jb_array);
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ * find an available list bitmap.  If you can't find one, flush a commit list
+ * and try again
+ *
+ * Slots are tried round-robin starting at j_list_bitmap_index, for at most
+ * three trips around the array.  On success the slot is bound to @jl and
+ * returned; NULL means the last slot tried was still owned by another list.
+ */
+static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *sb,
+						    struct reiserfs_journal_list
+						    *jl)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	struct reiserfs_list_bitmap *candidate = NULL;
+	int attempt;
+	int slot;
+
+	for (attempt = 0; attempt < (JOURNAL_NUM_BITMAPS * 3); attempt++) {
+		slot = journal->j_list_bitmap_index;
+		journal->j_list_bitmap_index = (slot + 1) % JOURNAL_NUM_BITMAPS;
+		candidate = &journal->j_list_bitmap[slot];
+
+		/* unowned slot: take it */
+		if (!candidate->journal_list)
+			break;
+
+		/* owned: committing the owner may release the slot */
+		flush_commit_list(sb, candidate->journal_list, 1);
+		if (!candidate->journal_list)
+			break;
+	}
+
+	/* double check to make sure it was flushed correctly */
+	if (candidate->journal_list)
+		return NULL;
+
+	candidate->journal_list = jl;
+	return candidate;
+}
+
+/*
+ * allocates a new chunk of X nodes, and links them all together as a list.
+ * Uses the cnode->next and cnode->prev pointers + * returns NULL on failure + */ +static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes) +{ + struct reiserfs_journal_cnode *head; + int i; + if (num_cnodes <= 0) { + return NULL; + } + head = vzalloc(num_cnodes * sizeof(struct reiserfs_journal_cnode)); + if (!head) { + return NULL; + } + head[0].prev = NULL; + head[0].next = head + 1; + for (i = 1; i < num_cnodes; i++) { + head[i].prev = head + (i - 1); + head[i].next = head + (i + 1); /* if last one, overwrite it after the if */ + } + head[num_cnodes - 1].next = NULL; + return head; +} + +/* pulls a cnode off the free list, or returns NULL on failure */ +static struct reiserfs_journal_cnode *get_cnode(struct super_block *sb) +{ + struct reiserfs_journal_cnode *cn; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + + reiserfs_check_lock_depth(sb, "get_cnode"); + + if (journal->j_cnode_free <= 0) { + return NULL; + } + journal->j_cnode_used++; + journal->j_cnode_free--; + cn = journal->j_cnode_free_list; + if (!cn) { + return cn; + } + if (cn->next) { + cn->next->prev = NULL; + } + journal->j_cnode_free_list = cn->next; + memset(cn, 0, sizeof(struct reiserfs_journal_cnode)); + return cn; +} + +/* + * returns a cnode to the free list + */ +static void free_cnode(struct super_block *sb, + struct reiserfs_journal_cnode *cn) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + + reiserfs_check_lock_depth(sb, "free_cnode"); + + journal->j_cnode_used--; + journal->j_cnode_free++; + /* memset(cn, 0, sizeof(struct reiserfs_journal_cnode)) ; */ + cn->next = journal->j_cnode_free_list; + if (journal->j_cnode_free_list) { + journal->j_cnode_free_list->prev = cn; + } + cn->prev = NULL; /* not needed with the memset, but I might kill the memset, and forget to do this */ + journal->j_cnode_free_list = cn; +} + +static void clear_prepared_bits(struct buffer_head *bh) +{ + clear_buffer_journal_prepared(bh); + clear_buffer_journal_restore_dirty(bh); +} + +/* 
+ * return a cnode with same dev, block number and size in table, + * or null if not found + */ +static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct + super_block + *sb, + struct + reiserfs_journal_cnode + **table, + long bl) +{ + struct reiserfs_journal_cnode *cn; + cn = journal_hash(table, sb, bl); + while (cn) { + if (cn->blocknr == bl && cn->sb == sb) + return cn; + cn = cn->hnext; + } + return (struct reiserfs_journal_cnode *)0; +} + +/* + * this actually means 'can this block be reallocated yet?'. If you set + * search_all, a block can only be allocated if it is not in the current + * transaction, was not freed by the current transaction, and has no chance + * of ever being overwritten by a replay after crashing. + * + * If you don't set search_all, a block can only be allocated if it is not + * in the current transaction. Since deleting a block removes it from the + * current transaction, this case should never happen. If you don't set + * search_all, make sure you never write the block without logging it. + * + * next_zero_bit is a suggestion about the next block to try for find_forward. + * when bl is rejected because it is set in a journal list bitmap, we search + * for the next zero bit in the bitmap that rejected bl. Then, we return + * that through next_zero_bit for find_forward to try. + * + * Just because we return something in next_zero_bit does not mean we won't + * reject it on the next call to reiserfs_in_journal + */ +int reiserfs_in_journal(struct super_block *sb, + unsigned int bmap_nr, int bit_nr, int search_all, + b_blocknr_t * next_zero_bit) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct reiserfs_journal_cnode *cn; + struct reiserfs_list_bitmap *jb; + int i; + unsigned long bl; + + *next_zero_bit = 0; /* always start this at zero. */ + + PROC_INFO_INC(sb, journal.in_journal); + /* + * If we aren't doing a search_all, this is a metablock, and it + * will be logged before use. 
if we crash before the transaction + * that freed it commits, this transaction won't have committed + * either, and the block will never be written + */ + if (search_all) { + for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { + PROC_INFO_INC(sb, journal.in_journal_bitmap); + jb = journal->j_list_bitmap + i; + if (jb->journal_list && jb->bitmaps[bmap_nr] && + test_bit(bit_nr, + (unsigned long *)jb->bitmaps[bmap_nr]-> + data)) { + *next_zero_bit = + find_next_zero_bit((unsigned long *) + (jb->bitmaps[bmap_nr]-> + data), + sb->s_blocksize << 3, + bit_nr + 1); + return 1; + } + } + } + + bl = bmap_nr * (sb->s_blocksize << 3) + bit_nr; + /* is it in any old transactions? */ + if (search_all + && (cn = + get_journal_hash_dev(sb, journal->j_list_hash_table, bl))) { + return 1; + } + + /* is it in the current transaction. This should never happen */ + if ((cn = get_journal_hash_dev(sb, journal->j_hash_table, bl))) { + BUG(); + return 1; + } + + PROC_INFO_INC(sb, journal.in_journal_reusable); + /* safe for reuse */ + return 0; +} + +/* insert cn into table */ +static inline void insert_journal_hash(struct reiserfs_journal_cnode **table, + struct reiserfs_journal_cnode *cn) +{ + struct reiserfs_journal_cnode *cn_orig; + + cn_orig = journal_hash(table, cn->sb, cn->blocknr); + cn->hnext = cn_orig; + cn->hprev = NULL; + if (cn_orig) { + cn_orig->hprev = cn; + } + journal_hash(table, cn->sb, cn->blocknr) = cn; +} + +/* lock the current transaction */ +static inline void lock_journal(struct super_block *sb) +{ + PROC_INFO_INC(sb, journal.lock_journal); + + reiserfs_mutex_lock_safe(&SB_JOURNAL(sb)->j_mutex, sb); +} + +/* unlock the current transaction */ +static inline void unlock_journal(struct super_block *sb) +{ + mutex_unlock(&SB_JOURNAL(sb)->j_mutex); +} + +static inline void get_journal_list(struct reiserfs_journal_list *jl) +{ + jl->j_refcount++; +} + +static inline void put_journal_list(struct super_block *s, + struct reiserfs_journal_list *jl) +{ + if (jl->j_refcount < 1) { + 
reiserfs_panic(s, "journal-2", "trans id %u, refcount at %d", + jl->j_trans_id, jl->j_refcount); + } + if (--jl->j_refcount == 0) + kfree(jl); +} + +/* + * this used to be much more involved, and I'm keeping it just in case + * things get ugly again. it gets called by flush_commit_list, and + * cleans up any data stored about blocks freed during a transaction. + */ +static void cleanup_freed_for_journal_list(struct super_block *sb, + struct reiserfs_journal_list *jl) +{ + + struct reiserfs_list_bitmap *jb = jl->j_list_bitmap; + if (jb) { + cleanup_bitmap_list(sb, jb); + } + jl->j_list_bitmap->journal_list = NULL; + jl->j_list_bitmap = NULL; +} + +static int journal_list_still_alive(struct super_block *s, + unsigned int trans_id) +{ + struct reiserfs_journal *journal = SB_JOURNAL(s); + struct list_head *entry = &journal->j_journal_list; + struct reiserfs_journal_list *jl; + + if (!list_empty(entry)) { + jl = JOURNAL_LIST_ENTRY(entry->next); + if (jl->j_trans_id <= trans_id) { + return 1; + } + } + return 0; +} + +/* + * If page->mapping was null, we failed to truncate this page for + * some reason. Most likely because it was truncated after being + * logged via data=journal. + * + * This does a check to see if the buffer belongs to one of these + * lost pages before doing the final put_bh. If page->mapping was + * null, it tries to free buffers on the page, which should make the + * final page_cache_release drop the page from the lru. 
+ */
+static void release_buffer_page(struct buffer_head *bh)
+{
+	struct page *page = bh->b_page;
+	if (!page->mapping && trylock_page(page)) {
+		/* pin the page so it survives the put_bh() below */
+		page_cache_get(page);
+		put_bh(bh);
+		/* re-check now that the page lock is held */
+		if (!page->mapping)
+			try_to_free_buffers(page);
+		unlock_page(page);
+		page_cache_release(page);
+	} else {
+		put_bh(bh);
+	}
+}
+
+/*
+ * b_end_io handler installed by submit_logged_buffer().  Records the I/O
+ * result in the uptodate bit, unlocks the buffer and drops the reference
+ * through release_buffer_page().  Warns if the buffer is still marked
+ * journaled when its write completes.
+ */
+static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
+{
+	char b[BDEVNAME_SIZE];
+
+	if (buffer_journaled(bh)) {
+		reiserfs_warning(NULL, "clm-2084",
+				 "pinned buffer %lu:%s sent to disk",
+				 bh->b_blocknr, bdevname(bh->b_bdev, b));
+	}
+	if (uptodate)
+		set_buffer_uptodate(bh);
+	else
+		clear_buffer_uptodate(bh);
+
+	unlock_buffer(bh);
+	release_buffer_page(bh);
+}
+
+/*
+ * b_end_io handler installed by submit_ordered_buffer(): record the I/O
+ * result, unlock the buffer and drop the submit-time reference.
+ */
+static void reiserfs_end_ordered_io(struct buffer_head *bh, int uptodate)
+{
+	if (uptodate)
+		set_buffer_uptodate(bh);
+	else
+		clear_buffer_uptodate(bh);
+	unlock_buffer(bh);
+	put_bh(bh);
+}
+
+/*
+ * Submit a logged buffer for writing.  Takes a reference that the end_io
+ * handler drops.  BUGs if the journal_test bit was not set or the buffer
+ * is not uptodate.
+ */
+static void submit_logged_buffer(struct buffer_head *bh)
+{
+	get_bh(bh);
+	bh->b_end_io = reiserfs_end_buffer_io_sync;
+	clear_buffer_journal_new(bh);
+	clear_buffer_dirty(bh);
+	if (!test_clear_buffer_journal_test(bh))
+		BUG();
+	if (!buffer_uptodate(bh))
+		BUG();
+	submit_bh(WRITE, bh);
+}
+
+/*
+ * Submit an ordered-data buffer for writing.  Takes a reference that the
+ * end_io handler drops.  BUGs if the buffer is not uptodate.
+ */
+static void submit_ordered_buffer(struct buffer_head *bh)
+{
+	get_bh(bh);
+	bh->b_end_io = reiserfs_end_ordered_io;
+	clear_buffer_dirty(bh);
+	if (!buffer_uptodate(bh))
+		BUG();
+	submit_bh(WRITE, bh);
+}
+
+/* number of buffers collected before a chunk is submitted */
+#define CHUNK_SIZE 32
+/* small batch of buffer heads waiting to be submitted together */
+struct buffer_chunk {
+	struct buffer_head *bh[CHUNK_SIZE];
+	int nr;			/* number of valid entries in bh[] */
+};
+
+/* submit every buffer in @chunk as a logged buffer and empty the chunk */
+static void write_chunk(struct buffer_chunk *chunk)
+{
+	int i;
+	for (i = 0; i < chunk->nr; i++) {
+		submit_logged_buffer(chunk->bh[i]);
+	}
+	chunk->nr = 0;
+}
+
+/* submit every buffer in @chunk as an ordered buffer and empty the chunk */
+static void write_ordered_chunk(struct buffer_chunk *chunk)
+{
+	int i;
+	for (i = 0; i < chunk->nr; i++) {
+		submit_ordered_buffer(chunk->bh[i]);
+	}
+	chunk->nr = 0;
+}
+
+/*
+ * Append @bh to @chunk.  When the chunk becomes full, @fn is called to
+ * write it out; if @lock is non-NULL it is dropped around that call and
+ * retaken afterwards.  Returns 1 if the chunk was flushed, 0 otherwise.
+ */
+static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh,
+			spinlock_t * lock, void (fn) (struct buffer_chunk *))
+{
+	int ret = 0;
+	BUG_ON(chunk->nr >= CHUNK_SIZE);
+	chunk->bh[chunk->nr++] = bh;
+	if (chunk->nr >= CHUNK_SIZE) {
+		ret = 1;
+		if (lock) {
+			spin_unlock(lock);
+			fn(chunk);
+			spin_lock(lock);
+		} else {
+			fn(chunk);
+		}
+	}
+	return ret;
+}
+
+/* number of reiserfs_jh structures currently allocated */
+static atomic_t nr_reiserfs_jh = ATOMIC_INIT(0);
+/* allocate a reiserfs_jh; never fails, yields and retries instead */
+static struct reiserfs_jh *alloc_jh(void)
+{
+	struct reiserfs_jh *jh;
+	while (1) {
+		jh = kmalloc(sizeof(*jh), GFP_NOFS);
+		if (jh) {
+			atomic_inc(&nr_reiserfs_jh);
+			return jh;
+		}
+		yield();
+	}
+}
+
+/*
+ * we want to free the jh when the buffer has been written
+ * and waited on
+ *
+ * Detaches and frees the jh stored in bh->b_private, if there is one, and
+ * drops the buffer reference that was taken when the jh was attached.
+ */
+void reiserfs_free_jh(struct buffer_head *bh)
+{
+	struct reiserfs_jh *jh;
+
+	jh = bh->b_private;
+	if (jh) {
+		bh->b_private = NULL;
+		jh->bh = NULL;
+		list_del_init(&jh->list);
+		kfree(jh);
+		if (atomic_read(&nr_reiserfs_jh) <= 0)
+			BUG();
+		atomic_dec(&nr_reiserfs_jh);
+		put_bh(bh);
+	}
+}
+
+/*
+ * Put @bh on a list of the current journal list (j->j_current_jl): the
+ * tail list when @tail is set, the ordered j_bh_list otherwise.  An
+ * existing jh is reused and moved; otherwise one is allocated and a buffer
+ * reference is taken.  Always returns 0.
+ */
+static inline int __add_jh(struct reiserfs_journal *j, struct buffer_head *bh,
+			   int tail)
+{
+	struct reiserfs_jh *jh;
+
+	if (bh->b_private) {
+		spin_lock(&j->j_dirty_buffers_lock);
+		/* b_private may have been cleared before we got the lock */
+		if (!bh->b_private) {
+			spin_unlock(&j->j_dirty_buffers_lock);
+			goto no_jh;
+		}
+		jh = bh->b_private;
+		list_del_init(&jh->list);
+	} else {
+no_jh:
+		get_bh(bh);
+		jh = alloc_jh();
+		spin_lock(&j->j_dirty_buffers_lock);
+		/*
+		 * buffer must be locked for __add_jh, so two adds should
+		 * not be able to happen at the same time; the BUG_ON
+		 * below checks that nobody attached a jh meanwhile
+		 */
+		BUG_ON(bh->b_private);
+		jh->bh = bh;
+		bh->b_private = jh;
+	}
+	jh->jl = j->j_current_jl;
+	if (tail)
+		list_add_tail(&jh->list, &jh->jl->j_tail_bh_list);
+	else {
+		list_add_tail(&jh->list, &jh->jl->j_bh_list);
+	}
+	spin_unlock(&j->j_dirty_buffers_lock);
+	return 0;
+}
+
+/* queue @bh on the current transaction's tail buffer list */
+int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh)
+{
+	return __add_jh(SB_JOURNAL(inode->i_sb), bh, 1);
+}
+/* queue @bh on the current transaction's ordered buffer list */
+int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh)
+{
+	return __add_jh(SB_JOURNAL(inode->i_sb), bh, 0);
+}
+
+/* gets a struct reiserfs_jh * from its list head */
+#define JH_ENTRY(l) list_entry((l), struct reiserfs_jh, list)
+/*
+ * Write out every buffer on @list and wait for the writes to finish.
+ *
+ * @lock protects @list.  It is taken here, dropped around anything that
+ * may sleep, and released before returning.  @jl is not used by the body.
+ *
+ * Returns j->j_errno as sampled on entry, or -EIO if a buffer turned out
+ * not to be uptodate.
+ */
+static int write_ordered_buffers(spinlock_t * lock,
+				 struct reiserfs_journal *j,
+				 struct reiserfs_journal_list *jl,
+				 struct list_head *list)
+{
+	struct buffer_head *bh;
+	struct reiserfs_jh *jh;
+	int ret = j->j_errno;
+	struct buffer_chunk chunk;
+	struct list_head tmp;
+	INIT_LIST_HEAD(&tmp);
+
+	chunk.nr = 0;
+	spin_lock(lock);
+	/* pass 1: submit dirty buffers, parking them on tmp */
+	while (!list_empty(list)) {
+		jh = JH_ENTRY(list->next);
+		bh = jh->bh;
+		get_bh(bh);
+		if (!trylock_buffer(bh)) {
+			/* locked and clean: just wait for it in pass 2 */
+			if (!buffer_dirty(bh)) {
+				list_move(&jh->list, &tmp);
+				goto loop_next;
+			}
+			/*
+			 * locked and dirty: push out what we have, wait
+			 * for the lock holder, then look at it again
+			 */
+			spin_unlock(lock);
+			if (chunk.nr)
+				write_ordered_chunk(&chunk);
+			wait_on_buffer(bh);
+			cond_resched();
+			spin_lock(lock);
+			goto loop_next;
+		}
+		/*
+		 * in theory, dirty non-uptodate buffers should never get here,
+		 * but the upper layer io error paths still have a few quirks.
+		 * Handle them here as gracefully as we can
+		 */
+		if (!buffer_uptodate(bh) && buffer_dirty(bh)) {
+			clear_buffer_dirty(bh);
+			ret = -EIO;
+		}
+		if (buffer_dirty(bh)) {
+			list_move(&jh->list, &tmp);
+			add_to_chunk(&chunk, bh, lock, write_ordered_chunk);
+		} else {
+			/* nothing to write: drop it from the list now */
+			reiserfs_free_jh(bh);
+			unlock_buffer(bh);
+		}
+loop_next:
+		put_bh(bh);
+		cond_resched_lock(lock);
+	}
+	/* submit whatever is left in the last, partial chunk */
+	if (chunk.nr) {
+		spin_unlock(lock);
+		write_ordered_chunk(&chunk);
+		spin_lock(lock);
+	}
+	/* pass 2: wait for each parked buffer and collect errors */
+	while (!list_empty(&tmp)) {
+		jh = JH_ENTRY(tmp.prev);
+		bh = jh->bh;
+		get_bh(bh);
+		reiserfs_free_jh(bh);
+
+		if (buffer_locked(bh)) {
+			spin_unlock(lock);
+			wait_on_buffer(bh);
+			spin_lock(lock);
+		}
+		if (!buffer_uptodate(bh)) {
+			ret = -EIO;
+		}
+		/*
+		 * ugly interaction with invalidatepage here.
+		 * reiserfs_invalidate_page will pin any buffer that has a
+		 * valid journal head from an older transaction.  If someone
+		 * else sets our buffer dirty after we write it in the first
+		 * loop, and then someone truncates the page away, nobody
+		 * will ever write the buffer.  We're safe if we write the
+		 * page one last time after freeing the journal header.
+		 */
+		if (buffer_dirty(bh) && unlikely(bh->b_page->mapping == NULL)) {
+			spin_unlock(lock);
+			ll_rw_block(WRITE, 1, &bh);
+			spin_lock(lock);
+		}
+		put_bh(bh);
+		cond_resched_lock(lock);
+	}
+	spin_unlock(lock);
+	return ret;
+}
+
+/*
+ * Flush the commit blocks of every journal list older than @jl that still
+ * has commits outstanding.
+ *
+ * Returns 1 if @jl itself disappeared while flushing, 0 otherwise.
+ */
+static int flush_older_commits(struct super_block *s,
+			       struct reiserfs_journal_list *jl)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+	struct reiserfs_journal_list *other_jl;
+	struct reiserfs_journal_list *first_jl;
+	struct list_head *entry;
+	unsigned int trans_id = jl->j_trans_id;
+	unsigned int other_trans_id;
+	unsigned int first_trans_id;
+
+find_first:
+	/*
+	 * first we walk backwards to find the oldest uncommitted transaction
+	 */
+	first_jl = jl;
+	entry = jl->j_list.prev;
+	while (1) {
+		other_jl = JOURNAL_LIST_ENTRY(entry);
+		if (entry == &journal->j_journal_list ||
+		    atomic_read(&other_jl->j_older_commits_done))
+			break;
+
+		first_jl = other_jl;
+		entry = other_jl->j_list.prev;
+	}
+
+	/* if we didn't find any older uncommitted transactions, return now */
+	if (first_jl == jl) {
+		return 0;
+	}
+
+	first_trans_id = first_jl->j_trans_id;
+
+	/* now walk forward from there, flushing everything older than jl */
+	entry = &first_jl->j_list;
+	while (1) {
+		other_jl = JOURNAL_LIST_ENTRY(entry);
+		other_trans_id = other_jl->j_trans_id;
+
+		if (other_trans_id < trans_id) {
+			if (atomic_read(&other_jl->j_commit_left) != 0) {
+				flush_commit_list(s, other_jl, 0);
+
+				/* list we were called with is gone, return */
+				if (!journal_list_still_alive(s, trans_id))
+					return 1;
+
+				/*
+				 * the one we just flushed is gone, this means
+				 * all older lists are also gone, so first_jl
+				 * is no longer valid either.  Go back to the
+				 * beginning.
+				 */
+				if (!journal_list_still_alive
+				    (s, other_trans_id)) {
+					goto find_first;
+				}
+			}
+			entry = entry->next;
+			if (entry == &journal->j_journal_list)
+				return 0;
+		} else {
+			return 0;
+		}
+	}
+	return 0;
+}
+
+/*
+ * If commit writes are in flight (j_async_throttle is non-zero), drop the
+ * write lock and sleep in congestion_wait() for up to HZ / 10 jiffies.
+ * Always returns 0.
+ */
+static int reiserfs_async_progress_wait(struct super_block *s)
+{
+	struct reiserfs_journal *j = SB_JOURNAL(s);
+
+	if (atomic_read(&j->j_async_throttle)) {
+		int depth;
+
+		depth = reiserfs_write_unlock_nested(s);
+		congestion_wait(BLK_RW_ASYNC, HZ / 10);
+		reiserfs_write_lock_nested(s, depth);
+	}
+
+	return 0;
+}
+
+/*
+ * if this journal list still has commit blocks unflushed, send them to disk.
+ *
+ * log areas must be flushed in order (transaction 2 can't commit before
+ * transaction 1) Before the commit block can be written, every other log
+ * block must be safely on disk
+ *
+ * With @flushall set, all older lists are committed first.  Returns 0 on
+ * success or a negative errno; on error the filesystem is aborted through
+ * reiserfs_abort().
+ */
+static int flush_commit_list(struct super_block *s,
+			     struct reiserfs_journal_list *jl, int flushall)
+{
+	int i;
+	b_blocknr_t bn;
+	struct buffer_head *tbh = NULL;
+	unsigned int trans_id = jl->j_trans_id;
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+	int retval = 0;
+	int write_len;
+	int depth;
+
+	reiserfs_check_lock_depth(s, "flush_commit_list");
+
+	if (atomic_read(&jl->j_older_commits_done)) {
+		return 0;
+	}
+
+	/*
+	 * before we can put our commit blocks on disk, we have to make
+	 * sure everyone older than us is on disk too
+	 */
+	BUG_ON(jl->j_len <= 0);
+	BUG_ON(trans_id == journal->j_trans_id);
+
+	/* hold a reference so jl cannot be freed while we may sleep */
+	get_journal_list(jl);
+	if (flushall) {
+		if (flush_older_commits(s, jl) == 1) {
+			/*
+			 * list disappeared during flush_older_commits.
+			 * return
+			 */
+			goto put_jl;
+		}
+	}
+
+	/* make sure nobody is trying to flush this one at the same time */
+	reiserfs_mutex_lock_safe(&jl->j_commit_mutex, s);
+
+	if (!journal_list_still_alive(s, trans_id)) {
+		mutex_unlock(&jl->j_commit_mutex);
+		goto put_jl;
+	}
+	BUG_ON(jl->j_trans_id == 0);
+
+	/* this commit is done, exit */
+	if (atomic_read(&jl->j_commit_left) <= 0) {
+		if (flushall) {
+			atomic_set(&jl->j_older_commits_done, 1);
+		}
+		mutex_unlock(&jl->j_commit_mutex);
+		goto put_jl;
+	}
+
+	if (!list_empty(&jl->j_bh_list)) {
+		int ret;
+
+		/*
+		 * We might sleep in numerous places inside
+		 * write_ordered_buffers. Relax the write lock.
+		 */
+		depth = reiserfs_write_unlock_nested(s);
+		ret = write_ordered_buffers(&journal->j_dirty_buffers_lock,
+					    journal, jl, &jl->j_bh_list);
+		if (ret < 0 && retval == 0)
+			retval = ret;
+		reiserfs_write_lock_nested(s, depth);
+	}
+	BUG_ON(!list_empty(&jl->j_bh_list));
+	/*
+	 * for the description block and all the log blocks, submit any buffers
+	 * that haven't already reached the disk.  Try to write at least 256
+	 * log blocks. later on, we will only wait on blocks that correspond
+	 * to this transaction, but while we're unplugging we might as well
+	 * get a chunk of data on there.
+	 */
+	atomic_inc(&journal->j_async_throttle);
+	write_len = jl->j_len + 1;
+	if (write_len < 256)
+		write_len = 256;
+	for (i = 0 ; i < write_len ; i++) {
+		/* the log area is circular, hence the modulo */
+		bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) %
+		    SB_ONDISK_JOURNAL_SIZE(s);
+		tbh = journal_find_get_block(s, bn);
+		if (tbh) {
+			if (buffer_dirty(tbh)) {
+				depth = reiserfs_write_unlock_nested(s);
+				ll_rw_block(WRITE, 1, &tbh);
+				reiserfs_write_lock_nested(s, depth);
+			}
+			put_bh(tbh) ;
+		}
+	}
+	atomic_dec(&journal->j_async_throttle);
+
+	/* wait for the j_len + 1 blocks that belong to this transaction */
+	for (i = 0; i < (jl->j_len + 1); i++) {
+		bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
+		    (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s);
+		/*
+		 * NOTE(review): unlike the loop above, tbh is not checked
+		 * for NULL here.  Presumably safe because the reference
+		 * from do_journal_end (dropped below) keeps the buffer
+		 * around - confirm.
+		 */
+		tbh = journal_find_get_block(s, bn);
+
+		depth = reiserfs_write_unlock_nested(s);
+		__wait_on_buffer(tbh);
+		reiserfs_write_lock_nested(s, depth);
+		/*
+		 * since we're using ll_rw_blk above, it might have skipped
+		 * over a locked buffer.  Double check here
+		 */
+		/* redundant, sync_dirty_buffer() checks */
+		if (buffer_dirty(tbh)) {
+			depth = reiserfs_write_unlock_nested(s);
+			sync_dirty_buffer(tbh);
+			reiserfs_write_lock_nested(s, depth);
+		}
+		if (unlikely(!buffer_uptodate(tbh))) {
+#ifdef CONFIG_REISERFS_CHECK
+			reiserfs_warning(s, "journal-601",
+					 "buffer write failed");
+#endif
+			retval = -EIO;
+		}
+		/* once for journal_find_get_block */
+		put_bh(tbh);
+		/* once due to original getblk in do_journal_end */
+		put_bh(tbh);
+		atomic_dec(&jl->j_commit_left);
+	}
+
+	/* only the commit block itself may still be outstanding */
+	BUG_ON(atomic_read(&jl->j_commit_left) != 1);
+
+	/*
+	 * If there was a write error in the journal - we can't commit
+	 * this transaction - it will be invalid and, if successful,
+	 * will just end up propagating the write error out to
+	 * the file system.
+	 */
+	if (likely(!retval && !reiserfs_is_journal_aborted (journal))) {
+		if (buffer_dirty(jl->j_commit_bh))
+			BUG();
+		mark_buffer_dirty(jl->j_commit_bh) ;
+		depth = reiserfs_write_unlock_nested(s);
+		if (reiserfs_barrier_flush(s))
+			__sync_dirty_buffer(jl->j_commit_bh, WRITE_FLUSH_FUA);
+		else
+			sync_dirty_buffer(jl->j_commit_bh);
+		reiserfs_write_lock_nested(s, depth);
+	}
+
+	/*
+	 * If there was a write error in the journal - we can't commit this
+	 * transaction - it will be invalid and, if successful, will just end
+	 * up propagating the write error out to the filesystem.
+	 */
+	if (unlikely(!buffer_uptodate(jl->j_commit_bh))) {
+#ifdef CONFIG_REISERFS_CHECK
+		reiserfs_warning(s, "journal-615", "buffer write failed");
+#endif
+		retval = -EIO;
+	}
+	bforget(jl->j_commit_bh);
+	/*
+	 * commits are expected to land in transaction id order.
+	 * NOTE(review): jl->j_trans_id is read into an unsigned int above
+	 * and printed with %u in put_journal_list(); the %lu below looks
+	 * mismatched - confirm the field types in reiserfs.h.
+	 */
+	if (journal->j_last_commit_id != 0 &&
+	    (jl->j_trans_id - journal->j_last_commit_id) != 1) {
+		reiserfs_warning(s, "clm-2200", "last commit %lu, current %lu",
+				 journal->j_last_commit_id, jl->j_trans_id);
+	}
+	journal->j_last_commit_id = jl->j_trans_id;
+
+	/*
+	 * now, every commit block is on the disk.  It is safe to allow
+	 * blocks freed during this transaction to be reallocated
+	 */
+	cleanup_freed_for_journal_list(s, jl);
+
+	retval = retval ? retval : journal->j_errno;
+
+	/* mark the metadata dirty */
+	if (!retval)
+		dirty_one_transaction(s, jl);
+	atomic_dec(&jl->j_commit_left);
+
+	if (flushall) {
+		atomic_set(&jl->j_older_commits_done, 1);
+	}
+	mutex_unlock(&jl->j_commit_mutex);
+put_jl:
+	put_journal_list(s, jl);
+
+	if (retval)
+		reiserfs_abort(s, retval, "Journal write error in %s",
+			       __func__);
+	return retval;
+}
+
+/*
+ * flush_journal_list frequently needs to find a newer transaction for a
+ * given block.
This does that, or returns NULL if it can't find anything + */ +static struct reiserfs_journal_list *find_newer_jl_for_cn(struct + reiserfs_journal_cnode + *cn) +{ + struct super_block *sb = cn->sb; + b_blocknr_t blocknr = cn->blocknr; + + cn = cn->hprev; + while (cn) { + if (cn->sb == sb && cn->blocknr == blocknr && cn->jlist) { + return cn->jlist; + } + cn = cn->hprev; + } + return NULL; +} + +static void remove_journal_hash(struct super_block *, + struct reiserfs_journal_cnode **, + struct reiserfs_journal_list *, unsigned long, + int); + +/* + * once all the real blocks have been flushed, it is safe to remove them + * from the journal list for this transaction. Aside from freeing the + * cnode, this also allows the block to be reallocated for data blocks + * if it had been deleted. + */ +static void remove_all_from_journal_list(struct super_block *sb, + struct reiserfs_journal_list *jl, + int debug) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct reiserfs_journal_cnode *cn, *last; + cn = jl->j_realblock; + + /* + * which is better, to lock once around the whole loop, or + * to lock for each call to remove_journal_hash? + */ + while (cn) { + if (cn->blocknr != 0) { + if (debug) { + reiserfs_warning(sb, "reiserfs-2201", + "block %u, bh is %d, state %ld", + cn->blocknr, cn->bh ? 1 : 0, + cn->state); + } + cn->state = 0; + remove_journal_hash(sb, journal->j_list_hash_table, + jl, cn->blocknr, 1); + } + last = cn; + cn = cn->next; + free_cnode(sb, last); + } + jl->j_realblock = NULL; +} + +/* + * if this timestamp is greater than the timestamp we wrote last to the + * header block, write it to the header block. once this is done, I can + * safely say the log area for this transaction won't ever be replayed, + * and I can start releasing blocks in this transaction for reuse as data + * blocks. 
called by flush_journal_list, before it calls + * remove_all_from_journal_list + */ /* + * Returns -EIO when the journal is aborted or the header buffer is not + * uptodate after waiting on / syncing it, and 0 otherwise. The header is + * left alone when trans_id is below journal->j_last_flush_trans_id. + */ +static int _update_journal_header_block(struct super_block *sb, + unsigned long offset, + unsigned int trans_id) +{ + struct reiserfs_journal_header *jh; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + int depth; + + if (reiserfs_is_journal_aborted(journal)) + return -EIO; + + if (trans_id >= journal->j_last_flush_trans_id) { + if (buffer_locked((journal->j_header_bh))) { /* the write lock is dropped while sleeping on the buffer */ + depth = reiserfs_write_unlock_nested(sb); + __wait_on_buffer(journal->j_header_bh); + reiserfs_write_lock_nested(sb, depth); + if (unlikely(!buffer_uptodate(journal->j_header_bh))) { +#ifdef CONFIG_REISERFS_CHECK + reiserfs_warning(sb, "journal-699", + "buffer write failed"); +#endif + return -EIO; + } + } /* record the new values in memory, then in the on-disk header (little endian) */ + journal->j_last_flush_trans_id = trans_id; + journal->j_first_unflushed_offset = offset; + jh = (struct reiserfs_journal_header *)(journal->j_header_bh-> + b_data); + jh->j_last_flush_trans_id = cpu_to_le32(trans_id); + jh->j_first_unflushed_offset = cpu_to_le32(offset); + jh->j_mount_id = cpu_to_le32(journal->j_mount_id); + + set_buffer_dirty(journal->j_header_bh); /* the header is synced with the write lock dropped */ + depth = reiserfs_write_unlock_nested(sb); + + if (reiserfs_barrier_flush(sb)) + __sync_dirty_buffer(journal->j_header_bh, WRITE_FLUSH_FUA); + else + sync_dirty_buffer(journal->j_header_bh); + + reiserfs_write_lock_nested(sb, depth); + if (!buffer_uptodate(journal->j_header_bh)) { + reiserfs_warning(sb, "journal-837", + "IO error during journal replay"); + return -EIO; + } + } + return 0; +} + /* plain wrapper around _update_journal_header_block */ +static int update_journal_header_block(struct super_block *sb, + unsigned long offset, + unsigned int trans_id) +{ + return _update_journal_header_block(sb, offset, trans_id); +} + +/* +** flush any and all journal lists older than you are +** can only be called from flush_journal_list +** +** keeps flushing the first list on j_journal_list (flushall == 0) while +** its trans id is below jl's; always returns 0 +*/ +static int flush_older_journal_lists(struct super_block *sb, + struct reiserfs_journal_list *jl) +{ + struct list_head *entry; + struct reiserfs_journal_list
*other_jl; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + unsigned int trans_id = jl->j_trans_id; + + /* + * we know we are the only ones flushing things, no extra race + * protection is required. + */ +restart: + entry = journal->j_journal_list.next; + /* Did we wrap? */ + if (entry == &journal->j_journal_list) + return 0; + other_jl = JOURNAL_LIST_ENTRY(entry); + if (other_jl->j_trans_id < trans_id) { + BUG_ON(other_jl->j_refcount <= 0); + /* do not flush all */ + flush_journal_list(sb, other_jl, 0); + + /* other_jl is now deleted from the list */ + goto restart; + } + return 0; +} + /* take jl off the work list, if it is on it, and adjust j_num_work_lists */ +static void del_from_work_list(struct super_block *s, + struct reiserfs_journal_list *jl) +{ + struct reiserfs_journal *journal = SB_JOURNAL(s); + if (!list_empty(&jl->j_working_list)) { + list_del_init(&jl->j_working_list); + journal->j_num_work_lists--; + } +} + +/* + * flush a journal list, both commit and real blocks + * + * always set flushall to 1, unless you are calling from inside + * flush_journal_list + * + * IMPORTANT. This can only be called while there are no journal writers, + * and the journal is locked.
That means it can only be called from + * do_journal_end, or by journal_release + */ /* + * Returns journal->j_errno, or the result of update_journal_header_block + * when flushall is set and j_errno was 0. When flushall is set this takes + * and releases j_flush_mutex itself; otherwise the mutex must already be + * held (checked with mutex_trylock + BUG). + */ +static int flush_journal_list(struct super_block *s, + struct reiserfs_journal_list *jl, int flushall) +{ + struct reiserfs_journal_list *pjl; + struct reiserfs_journal_cnode *cn, *last; + int count; + int was_jwait = 0; + int was_dirty = 0; + struct buffer_head *saved_bh; + unsigned long j_len_saved = jl->j_len; + struct reiserfs_journal *journal = SB_JOURNAL(s); + int err = 0; + int depth; + + BUG_ON(j_len_saved <= 0); + + if (atomic_read(&journal->j_wcount) != 0) { + reiserfs_warning(s, "clm-2048", "called with wcount %d", + atomic_read(&journal->j_wcount)); + } + + /* if flushall == 0, the lock is already held */ + if (flushall) { + reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s); + } else if (mutex_trylock(&journal->j_flush_mutex)) { + BUG(); + } + + count = 0; + if (j_len_saved > journal->j_trans_max) { + reiserfs_panic(s, "journal-715", "length is %lu, trans id %lu", + j_len_saved, jl->j_trans_id); + return 0; + } + + /* if all the work is already done, get out of here */ + if (atomic_read(&jl->j_nonzerolen) <= 0 && + atomic_read(&jl->j_commit_left) <= 0) { + goto flush_older_and_return; + } + + /* + * start by putting the commit list on disk. This will also flush + * the commit lists of any older transactions + */ + flush_commit_list(s, jl, 1); + + if (!(jl->j_state & LIST_DIRTY) + && !reiserfs_is_journal_aborted(journal)) + BUG(); + + /* are we done now?
*/ + if (atomic_read(&jl->j_nonzerolen) <= 0 && + atomic_read(&jl->j_commit_left) <= 0) { + goto flush_older_and_return; + } + + /* + * loop through each cnode, see if we need to write it, + * or wait on a more recent transaction, or just ignore it + */ + if (atomic_read(&journal->j_wcount) != 0) { + reiserfs_panic(s, "journal-844", "journal list is flushing, " + "wcount is not 0"); + } + cn = jl->j_realblock; + while (cn) { + was_jwait = 0; + was_dirty = 0; + saved_bh = NULL; + /* blocknr of 0 is no longer in the hash, ignore it */ + if (cn->blocknr == 0) { + goto free_cnode; + } + + /* + * This transaction failed commit. + * Don't write out to the disk + */ + if (!(jl->j_state & LIST_DIRTY)) + goto free_cnode; + + pjl = find_newer_jl_for_cn(cn); + /* + * the order is important here. We check pjl to make sure we + * don't clear BH_JDirty_wait if we aren't the one writing this + * block to disk + */ + if (!pjl && cn->bh) { + saved_bh = cn->bh; + + /* + * we do this to make sure nobody releases the + * buffer while we are working with it + */ + get_bh(saved_bh); + + if (buffer_journal_dirty(saved_bh)) { + BUG_ON(!can_dirty(cn)); + was_jwait = 1; + was_dirty = 1; + } else if (can_dirty(cn)) { + /* + * everything with !pjl && jwait + * should be writable + */ + BUG(); + } + } + + /* + * if someone has this block in a newer transaction, just make + * sure they are committed, and don't try writing it to disk + */ + if (pjl) { + if (atomic_read(&pjl->j_commit_left)) + flush_commit_list(s, pjl, 1); + goto free_cnode; + } + + /* + * bh == NULL when the block got to disk on its own, OR, + * the block got freed in a future transaction + */ + if (saved_bh == NULL) { + goto free_cnode; + } + + /* + * this should never happen. kupdate_one_transaction has + * this list locked while it works, so we should never see a + * buffer here that is not marked JDirty_wait + */ + if ((!was_jwait) && !buffer_locked(saved_bh)) { + reiserfs_warning(s, "journal-813", + "BAD!
buffer %llu %cdirty %cjwait, " + "not in a newer tranasction", + (unsigned long long)saved_bh-> + b_blocknr, was_dirty ? ' ' : '!', + was_jwait ? ' ' : '!'); + } + if (was_dirty) { + /* + * we inc again because saved_bh gets decremented + * at free_cnode + */ + get_bh(saved_bh); + set_bit(BLOCK_NEEDS_FLUSH, &cn->state); + lock_buffer(saved_bh); + BUG_ON(cn->blocknr != saved_bh->b_blocknr); + if (buffer_dirty(saved_bh)) + submit_logged_buffer(saved_bh); + else + unlock_buffer(saved_bh); + count++; + } else { + reiserfs_warning(s, "clm-2082", + "Unable to flush buffer %llu in %s", + (unsigned long long)saved_bh-> + b_blocknr, __func__); + } +free_cnode: + last = cn; + cn = cn->next; + if (saved_bh) { + /* + * we incremented this to keep others from + * taking the buffer head away + */ + put_bh(saved_bh); + if (atomic_read(&saved_bh->b_count) < 0) { + reiserfs_warning(s, "journal-945", + "saved_bh->b_count < 0"); + } + } + } /* second pass: wait on every cnode flagged BLOCK_NEEDS_FLUSH above and check the result */ + if (count > 0) { + cn = jl->j_realblock; + while (cn) { + if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) { + if (!cn->bh) { + reiserfs_panic(s, "journal-1011", + "cn->bh is NULL"); + } + + depth = reiserfs_write_unlock_nested(s); + __wait_on_buffer(cn->bh); + reiserfs_write_lock_nested(s, depth); + + if (!cn->bh) { + reiserfs_panic(s, "journal-1012", + "cn->bh is NULL"); + } + if (unlikely(!buffer_uptodate(cn->bh))) { +#ifdef CONFIG_REISERFS_CHECK + reiserfs_warning(s, "journal-949", + "buffer write failed"); +#endif + err = -EIO; + } + /* + * note, we must clear the JDirty_wait bit + * after the up to date check, otherwise we + * race against our flushpage routine + */ + BUG_ON(!test_clear_buffer_journal_dirty + (cn->bh)); + + /* drop one ref for us */ + put_bh(cn->bh); + /* drop one ref for journal_mark_dirty */ + release_buffer_page(cn->bh); + } + cn = cn->next; + } + } + + if (err) + reiserfs_abort(s, -EIO, + "Write error while pushing transaction to disk in %s", + __func__); +flush_older_and_return: + + /* + * before we can update the journal
header block, we _must_ flush all + * real blocks from all older transactions to disk. This is because + * once the header block is updated, this transaction will not be + * replayed after a crash + */ + if (flushall) { + flush_older_journal_lists(s, jl); + } + + err = journal->j_errno; + /* + * before we can remove everything from the hash tables for this + * transaction, we must make sure it can never be replayed + * + * since we are only called from do_journal_end, we know for sure there + * are no allocations going on while we are flushing journal lists. So, + * we only need to update the journal header block for the last list + * being flushed + */ + if (!err && flushall) { + err = + update_journal_header_block(s, + (jl->j_start + jl->j_len + + 2) % SB_ONDISK_JOURNAL_SIZE(s), + jl->j_trans_id); + if (err) + reiserfs_abort(s, -EIO, + "Write error while updating journal header in %s", + __func__); + } + remove_all_from_journal_list(s, jl, 0); + list_del_init(&jl->j_list); + journal->j_num_lists--; + del_from_work_list(s, jl); + + if (journal->j_last_flush_id != 0 && + (jl->j_trans_id - journal->j_last_flush_id) != 1) { + reiserfs_warning(s, "clm-2201", "last flush %lu, current %lu", + journal->j_last_flush_id, jl->j_trans_id); + } + journal->j_last_flush_id = jl->j_trans_id; + + /* + * not strictly required since we are freeing the list, but it should + * help find code using dead lists later on + */ + jl->j_len = 0; + atomic_set(&jl->j_nonzerolen, 0); + jl->j_start = 0; + jl->j_realblock = NULL; + jl->j_commit_bh = NULL; + jl->j_trans_id = 0; + jl->j_state = 0; + put_journal_list(s, jl); + if (flushall) + mutex_unlock(&journal->j_flush_mutex); + return err; +} + /* + * marks jl LIST_TOUCHED, takes it off the work list and adds every buffer + * that is dirty and whose cnode passes can_dirty() to chunk; returns the + * number of buffers added (0 when the list is empty) + */ +static int write_one_transaction(struct super_block *s, + struct reiserfs_journal_list *jl, + struct buffer_chunk *chunk) +{ + struct reiserfs_journal_cnode *cn; + int ret = 0; + + jl->j_state |= LIST_TOUCHED; + del_from_work_list(s, jl); + if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0) { +
return 0; + } + + cn = jl->j_realblock; + while (cn) { + /* + * if the blocknr == 0, this has been cleared from the hash, + * skip it + */ + if (cn->blocknr == 0) { + goto next; + } + if (cn->bh && can_dirty(cn) && buffer_dirty(cn->bh)) { + struct buffer_head *tmp_bh; + /* + * we can race against journal_mark_freed when we try + * to lock_buffer(cn->bh), so we have to inc the buffer + * count, and recheck things after locking + */ + tmp_bh = cn->bh; + get_bh(tmp_bh); + lock_buffer(tmp_bh); + if (cn->bh && can_dirty(cn) && buffer_dirty(tmp_bh)) { + if (!buffer_journal_dirty(tmp_bh) || + buffer_journal_prepared(tmp_bh)) + BUG(); + add_to_chunk(chunk, tmp_bh, NULL, write_chunk); + ret++; + } else { + /* note, cn->bh might be null now */ + unlock_buffer(tmp_bh); + } + put_bh(tmp_bh); + } +next: + cn = cn->next; + cond_resched(); + } + return ret; +} + +/* + * used by flush_commit_list + * + * sets LIST_DIRTY on jl, then for each journal-dirty buffer that no newer + * transaction has logged: marks it dirty, or flags it restore-dirty when + * it is currently prepared. Always returns 0. + */ +static int dirty_one_transaction(struct super_block *s, + struct reiserfs_journal_list *jl) +{ + struct reiserfs_journal_cnode *cn; + struct reiserfs_journal_list *pjl; + int ret = 0; + + jl->j_state |= LIST_DIRTY; + cn = jl->j_realblock; + while (cn) { + /* + * look for a more recent transaction that logged this + * buffer. Only the most recent transaction with a buffer in + * it is allowed to send that buffer to disk + */ + pjl = find_newer_jl_for_cn(cn); + if (!pjl && cn->blocknr && cn->bh + && buffer_journal_dirty(cn->bh)) { + BUG_ON(!can_dirty(cn)); + /* + * if the buffer is prepared, it will either be logged + * or restored.
If restored, we need to make sure + * it actually gets marked dirty + */ + clear_buffer_journal_new(cn->bh); + if (buffer_journal_prepared(cn->bh)) { + set_buffer_journal_restore_dirty(cn->bh); + } else { + set_buffer_journal_test(cn->bh); + mark_buffer_dirty(cn->bh); + } + } + cn = cn->next; + } + return ret; +} + /* + * with j_flush_mutex held, feeds jl and the lists after it to + * write_one_transaction until num_trans lists have been handled or, when + * num_trans is 0, until num_blocks buffers have been queued; it also stops + * at a list that is empty, touched, not fully committed or not dirty, and + * at the end of j_journal_list. next_jl and next_trans_id are not used. + * Returns the last write_one_transaction result, or 0 if none ran. + */ +static int kupdate_transactions(struct super_block *s, + struct reiserfs_journal_list *jl, + struct reiserfs_journal_list **next_jl, + unsigned int *next_trans_id, + int num_blocks, int num_trans) +{ + int ret = 0; + int written = 0; + int transactions_flushed = 0; + unsigned int orig_trans_id = jl->j_trans_id; + struct buffer_chunk chunk; + struct list_head *entry; + struct reiserfs_journal *journal = SB_JOURNAL(s); + chunk.nr = 0; + + reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s); + if (!journal_list_still_alive(s, orig_trans_id)) { + goto done; + } + + /* + * we've got j_flush_mutex held, nobody is going to delete any + * of these lists out from underneath us + */ + while ((num_trans && transactions_flushed < num_trans) || + (!num_trans && written < num_blocks)) { + + if (jl->j_len == 0 || (jl->j_state & LIST_TOUCHED) || + atomic_read(&jl->j_commit_left) + || !(jl->j_state & LIST_DIRTY)) { + del_from_work_list(s, jl); + break; + } + ret = write_one_transaction(s, jl, &chunk); + + if (ret < 0) + goto done; + transactions_flushed++; + written += ret; + entry = jl->j_list.next; + + /* did we wrap? */ + if (entry == &journal->j_journal_list) { + break; + } + jl = JOURNAL_LIST_ENTRY(entry); + + /* don't bother with older transactions */ + if (jl->j_trans_id <= orig_trans_id) + break; + } + if (chunk.nr) { + write_chunk(&chunk); + } + +done: + mutex_unlock(&journal->j_flush_mutex); + return ret; +} + +/* + * for o_sync and fsync heavy applications, they tend to use + * all the journa list slots with tiny transactions. These + * trigger lots and lots of calls to update the header block, which + * adds seeks and slows things down.
+ * + * This function tries to clear out a large chunk of the journal lists + * at once, which makes everything faster since only the newest journal + * list updates the header block + */ /* + * Always returns 0; the return values of kupdate_transactions and + * flush_journal_list are not propagated. jl and the last list chosen + * (flush_jl) are pinned with get_journal_list for the duration. + */ +static int flush_used_journal_lists(struct super_block *s, + struct reiserfs_journal_list *jl) +{ + unsigned long len = 0; + unsigned long cur_len; + int ret; + int i; + int limit = 256; + struct reiserfs_journal_list *tjl; + struct reiserfs_journal_list *flush_jl; + unsigned int trans_id; + struct reiserfs_journal *journal = SB_JOURNAL(s); + + flush_jl = tjl = jl; + + /* in data logging mode, try harder to flush a lot of blocks */ + if (reiserfs_data_log(s)) + limit = 1024; + /* flush for 256 transactions or limit blocks, whichever comes first */ + for (i = 0; i < 256 && len < limit; i++) { + if (atomic_read(&tjl->j_commit_left) || + tjl->j_trans_id < jl->j_trans_id) { + break; + } + cur_len = atomic_read(&tjl->j_nonzerolen); + if (cur_len > 0) { + tjl->j_state &= ~LIST_TOUCHED; + } + len += cur_len; + flush_jl = tjl; + if (tjl->j_list.next == &journal->j_journal_list) + break; + tjl = JOURNAL_LIST_ENTRY(tjl->j_list.next); + } + get_journal_list(jl); + get_journal_list(flush_jl); + + /* + * try to find a group of blocks we can flush across all the + * transactions, but only bother if we've actually spanned + * across multiple lists + */ + if (flush_jl != jl) { + ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i); /* NOTE(review): ret, tjl and trans_id are not read after this call */ + } + flush_journal_list(s, flush_jl, 1); + put_journal_list(s, flush_jl); + put_journal_list(s, jl); + return 0; +} + +/* + * removes any nodes in table with name block and dev as bh. + * only touchs the hnext and hprev pointers.
+ */ /* + * A cnode is removed when its blocknr and sb match, jl is NULL or equals + * its jlist, and it is either not BLOCK_FREED or remove_freed is set. + * Removed cnodes are unlinked from the hash chain and have blocknr, sb, + * state, bh and jlist cleared. + */ +void remove_journal_hash(struct super_block *sb, + struct reiserfs_journal_cnode **table, + struct reiserfs_journal_list *jl, + unsigned long block, int remove_freed) +{ + struct reiserfs_journal_cnode *cur; + struct reiserfs_journal_cnode **head; + + head = &(journal_hash(table, sb, block)); /* NOTE(review): head is the address of a table slot, so the NULL test below looks unreachable - confirm */ + if (!head) { + return; + } + cur = *head; + while (cur) { + if (cur->blocknr == block && cur->sb == sb + && (jl == NULL || jl == cur->jlist) + && (!test_bit(BLOCK_FREED, &cur->state) || remove_freed)) { + if (cur->hnext) { + cur->hnext->hprev = cur->hprev; + } + if (cur->hprev) { + cur->hprev->hnext = cur->hnext; + } else { + *head = cur->hnext; + } + cur->blocknr = 0; + cur->sb = NULL; + cur->state = 0; + /* + * anybody who clears the cur->bh will also + * dec the nonzerolen + */ + if (cur->bh && cur->jlist) + atomic_dec(&cur->jlist->j_nonzerolen); + cur->bh = NULL; + cur->jlist = NULL; + } + cur = cur->hnext; /* hnext of a removed cnode is left intact, so the walk can continue */ + } +} + /* frees the current journal list, the cnode pool, the bitmaps, the header buffer, the journal device and the journal itself */ +static void free_journal_ram(struct super_block *sb) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + kfree(journal->j_current_jl); + journal->j_num_lists--; + + vfree(journal->j_cnode_free_orig); + free_list_bitmaps(sb, journal->j_list_bitmap); + free_bitmap_nodes(sb); /* must be after free_list_bitmaps */ + if (journal->j_header_bh) { + brelse(journal->j_header_bh); + } + /* + * j_header_bh is on the journal dev, make sure + * not to release the journal dev until we brelse j_header_bh + */ + release_journal_dev(sb, journal); + vfree(journal); +} + +/* + * call on unmount. Only set error to 1 if you haven't made your way out + * of read_super() yet. Any other caller must keep error at 0.
+ */ /* + * With error == 0 on a read-write mount the current transaction is ended + * and one more transaction touching the super block buffer is logged, + * both with FLUSH_ALL. Afterwards the write lock is dropped, the old_work + * and j_work delayed works are cancelled, free_journal_ram is called and + * the write lock is taken again. Always returns 0. + */ +static int do_journal_release(struct reiserfs_transaction_handle *th, + struct super_block *sb, int error) +{ + struct reiserfs_transaction_handle myth; + int flushed = 0; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + + /* + * we only want to flush out transactions if we were + * called with error == 0 + */ + if (!error && !(sb->s_flags & MS_RDONLY)) { + /* end the current trans */ + BUG_ON(!th->t_trans_id); + do_journal_end(th, FLUSH_ALL); + + /* + * make sure something gets logged to force + * our way into the flush code + */ + if (!journal_join(&myth, sb)) { + reiserfs_prepare_for_journal(sb, + SB_BUFFER_WITH_SB(sb), + 1); + journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb)); + do_journal_end(&myth, FLUSH_ALL); + flushed = 1; /* NOTE(review): flushed is never read in this function */ + } + } + + /* this also catches errors during the do_journal_end above */ + if (!error && reiserfs_is_journal_aborted(journal)) { + memset(&myth, 0, sizeof(myth)); + if (!journal_join_abort(&myth, sb)) { + reiserfs_prepare_for_journal(sb, + SB_BUFFER_WITH_SB(sb), + 1); + journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb)); + do_journal_end(&myth, FLUSH_ALL); + } + } + + + /* + * We must release the write lock here because + * the workqueue job (flush_async_commit) needs this lock + */ + reiserfs_write_unlock(sb); + + /* + * Cancel flushing of old commits. Note that neither of these works + * will be requeued because superblock is being shutdown and doesn't + * have MS_ACTIVE set. + */ + cancel_delayed_work_sync(&REISERFS_SB(sb)->old_work); + /* wait for all commits to finish */ + cancel_delayed_work_sync(&SB_JOURNAL(sb)->j_work); + + free_journal_ram(sb); + + reiserfs_write_lock(sb); + + return 0; +} + +/* * call on unmount. flush all journal trans, release all alloc'd ram */ +int journal_release(struct reiserfs_transaction_handle *th, + struct super_block *sb) +{ + return do_journal_release(th, sb, 0); /* error == 0: flush before releasing */ +} + +/* only call from an error condition inside reiserfs_read_super!
*/ /* calls do_journal_release with error == 1, which skips the transaction flushing */ +int journal_release_error(struct reiserfs_transaction_handle *th, + struct super_block *sb) +{ + return do_journal_release(th, sb, 1); +} + +/* + * compares description block with commit block. + * returns 1 if they differ, 0 if they are the same + * + * they count as different when the trans ids or lengths disagree, or when + * the commit length is <= 0 or larger than j_trans_max + */ +static int journal_compare_desc_commit(struct super_block *sb, + struct reiserfs_journal_desc *desc, + struct reiserfs_journal_commit *commit) +{ + if (get_commit_trans_id(commit) != get_desc_trans_id(desc) || + get_commit_trans_len(commit) != get_desc_trans_len(desc) || + get_commit_trans_len(commit) > SB_JOURNAL(sb)->j_trans_max || + get_commit_trans_len(commit) <= 0) { + return 1; + } + return 0; +} + +/* + * returns 0 if it did not find a description block + * returns -1 if it found a corrupt commit block + * returns 1 if both desc and commit were valid + * NOTE: only called during fs mount + * + * 0 is also returned for a NULL d_bh, for a trans id above + * *oldest_invalid_trans_id and when the commit block cannot be read. + * -1 is also returned when the desc mount id is below *newest_mount_id or + * the desc length exceeds j_trans_max. On a desc/commit mismatch + * *oldest_invalid_trans_id (when supplied) is set to the desc trans id. + * d_bh is not released here. + */ +static int journal_transaction_is_valid(struct super_block *sb, + struct buffer_head *d_bh, + unsigned int *oldest_invalid_trans_id, + unsigned long *newest_mount_id) +{ + struct reiserfs_journal_desc *desc; + struct reiserfs_journal_commit *commit; + struct buffer_head *c_bh; + unsigned long offset; + + if (!d_bh) + return 0; + + desc = (struct reiserfs_journal_desc *)d_bh->b_data; + if (get_desc_trans_len(desc) > 0 + && !memcmp(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8)) { + if (oldest_invalid_trans_id && *oldest_invalid_trans_id + && get_desc_trans_id(desc) > *oldest_invalid_trans_id) { + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-986: transaction " + "is valid returning because trans_id %d is greater than " + "oldest_invalid %lu", + get_desc_trans_id(desc), + *oldest_invalid_trans_id); + return 0; + } + if (newest_mount_id + && *newest_mount_id > get_desc_mount_id(desc)) { + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-1087: transaction " + "is valid returning because mount_id %d is less than " + "newest_mount_id %lu", + get_desc_mount_id(desc), + *newest_mount_id); + return -1; + } + if
(get_desc_trans_len(desc) > SB_JOURNAL(sb)->j_trans_max) { + reiserfs_warning(sb, "journal-2018", + "Bad transaction length %d " + "encountered, ignoring transaction", + get_desc_trans_len(desc)); + return -1; + } + offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb); + + /* + * ok, we have a journal description block, + * let's see if the transaction was valid + * + * the commit block sits trans_len + 1 blocks after the desc + * block, modulo the journal size + */ + c_bh = + journal_bread(sb, + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + ((offset + get_desc_trans_len(desc) + + 1) % SB_ONDISK_JOURNAL_SIZE(sb))); + if (!c_bh) + return 0; + commit = (struct reiserfs_journal_commit *)c_bh->b_data; + if (journal_compare_desc_commit(sb, desc, commit)) { + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal_transaction_is_valid, commit offset %ld had bad " + "time %d or length %d", + c_bh->b_blocknr - + SB_ONDISK_JOURNAL_1st_BLOCK(sb), + get_commit_trans_id(commit), + get_commit_trans_len(commit)); + brelse(c_bh); + if (oldest_invalid_trans_id) { + *oldest_invalid_trans_id = + get_desc_trans_id(desc); + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-1004: " + "transaction_is_valid setting oldest invalid trans_id " + "to %d", + get_desc_trans_id(desc)); + } + return -1; + } + brelse(c_bh); + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-1006: found valid " + "transaction start offset %llu, len %d id %d", + d_bh->b_blocknr - + SB_ONDISK_JOURNAL_1st_BLOCK(sb), + get_desc_trans_len(desc), + get_desc_trans_id(desc)); + return 1; + } else { + return 0; + } +} + /* calls brelse() on the first num entries of heads */ +static void brelse_array(struct buffer_head **heads, int num) +{ + int i; + for (i = 0; i < num; i++) { + brelse(heads[i]); + } +} + +/* + * given the start, and values for the oldest acceptable transactions, + * this either reads in a replays a transaction, or returns because the + * transaction is invalid, or too old.
+ * NOTE: only called during fs mount + */ +static int journal_read_transaction(struct super_block *sb, + unsigned long cur_dblock, + unsigned long oldest_start, + unsigned int oldest_trans_id, + unsigned long newest_mount_id) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + struct reiserfs_journal_desc *desc; + struct reiserfs_journal_commit *commit; + unsigned int trans_id = 0; + struct buffer_head *c_bh; + struct buffer_head *d_bh; + struct buffer_head **log_blocks = NULL; + struct buffer_head **real_blocks = NULL; + unsigned int trans_offset; + int i; + int trans_half; + + d_bh = journal_bread(sb, cur_dblock); + if (!d_bh) + return 1; + desc = (struct reiserfs_journal_desc *)d_bh->b_data; + trans_offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb); + reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1037: " + "journal_read_transaction, offset %llu, len %d mount_id %d", + d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb), + get_desc_trans_len(desc), get_desc_mount_id(desc)); + if (get_desc_trans_id(desc) < oldest_trans_id) { + reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1039: " + "journal_read_trans skipping because %lu is too old", + cur_dblock - + SB_ONDISK_JOURNAL_1st_BLOCK(sb)); + brelse(d_bh); + return 1; + } + if (get_desc_mount_id(desc) != newest_mount_id) { + reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1146: " + "journal_read_trans skipping because %d is != " + "newest_mount_id %lu", get_desc_mount_id(desc), + newest_mount_id); + brelse(d_bh); + return 1; + } + c_bh = journal_bread(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + ((trans_offset + get_desc_trans_len(desc) + 1) % + SB_ONDISK_JOURNAL_SIZE(sb))); + if (!c_bh) { + brelse(d_bh); + return 1; + } + commit = (struct reiserfs_journal_commit *)c_bh->b_data; + if (journal_compare_desc_commit(sb, desc, commit)) { + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal_read_transaction, " + "commit offset %llu had bad time %d or length %d", + c_bh->b_blocknr - + 
SB_ONDISK_JOURNAL_1st_BLOCK(sb), + get_commit_trans_id(commit), + get_commit_trans_len(commit)); + brelse(c_bh); + brelse(d_bh); + return 1; + } + + if (bdev_read_only(sb->s_bdev)) { + reiserfs_warning(sb, "clm-2076", + "device is readonly, unable to replay log"); + brelse(c_bh); + brelse(d_bh); + return -EROFS; + } + + trans_id = get_desc_trans_id(desc); + /* + * now we know we've got a good transaction, and it was + * inside the valid time ranges + */ + log_blocks = kmalloc(get_desc_trans_len(desc) * + sizeof(struct buffer_head *), GFP_NOFS); + real_blocks = kmalloc(get_desc_trans_len(desc) * + sizeof(struct buffer_head *), GFP_NOFS); + if (!log_blocks || !real_blocks) { + brelse(c_bh); + brelse(d_bh); + kfree(log_blocks); + kfree(real_blocks); + reiserfs_warning(sb, "journal-1169", + "kmalloc failed, unable to mount FS"); + return -1; + } + /* get all the buffer heads */ + trans_half = journal_trans_half(sb->s_blocksize); + for (i = 0; i < get_desc_trans_len(desc); i++) { + log_blocks[i] = + journal_getblk(sb, + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + (trans_offset + 1 + + i) % SB_ONDISK_JOURNAL_SIZE(sb)); + if (i < trans_half) { + real_blocks[i] = + sb_getblk(sb, + le32_to_cpu(desc->j_realblock[i])); + } else { + real_blocks[i] = + sb_getblk(sb, + le32_to_cpu(commit-> + j_realblock[i - trans_half])); + } + if (real_blocks[i]->b_blocknr > SB_BLOCK_COUNT(sb)) { + reiserfs_warning(sb, "journal-1207", + "REPLAY FAILURE fsck required! " + "Block to replay is outside of " + "filesystem"); + goto abort_replay; + } + /* make sure we don't try to replay onto log or reserved area */ + if (is_block_in_log_or_reserved_area + (sb, real_blocks[i]->b_blocknr)) { + reiserfs_warning(sb, "journal-1204", + "REPLAY FAILURE fsck required! 
" + "Trying to replay onto a log block"); +abort_replay: + brelse_array(log_blocks, i); + brelse_array(real_blocks, i); + brelse(c_bh); + brelse(d_bh); + kfree(log_blocks); + kfree(real_blocks); + return -1; + } + } + /* read in the log blocks, memcpy to the corresponding real block */ + ll_rw_block(READ, get_desc_trans_len(desc), log_blocks); + for (i = 0; i < get_desc_trans_len(desc); i++) { + + wait_on_buffer(log_blocks[i]); + if (!buffer_uptodate(log_blocks[i])) { + reiserfs_warning(sb, "journal-1212", + "REPLAY FAILURE fsck required! " + "buffer write failed"); + brelse_array(log_blocks + i, + get_desc_trans_len(desc) - i); + brelse_array(real_blocks, get_desc_trans_len(desc)); + brelse(c_bh); + brelse(d_bh); + kfree(log_blocks); + kfree(real_blocks); + return -1; + } + memcpy(real_blocks[i]->b_data, log_blocks[i]->b_data, + real_blocks[i]->b_size); + set_buffer_uptodate(real_blocks[i]); + brelse(log_blocks[i]); + } + /* flush out the real blocks */ + for (i = 0; i < get_desc_trans_len(desc); i++) { + set_buffer_dirty(real_blocks[i]); + write_dirty_buffer(real_blocks[i], WRITE); + } + for (i = 0; i < get_desc_trans_len(desc); i++) { + wait_on_buffer(real_blocks[i]); + if (!buffer_uptodate(real_blocks[i])) { + reiserfs_warning(sb, "journal-1226", + "REPLAY FAILURE, fsck required! " + "buffer write failed"); + brelse_array(real_blocks + i, + get_desc_trans_len(desc) - i); + brelse(c_bh); + brelse(d_bh); + kfree(log_blocks); + kfree(real_blocks); + return -1; + } + brelse(real_blocks[i]); + } + cur_dblock = + SB_ONDISK_JOURNAL_1st_BLOCK(sb) + + ((trans_offset + get_desc_trans_len(desc) + + 2) % SB_ONDISK_JOURNAL_SIZE(sb)); + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "journal-1095: setting journal " "start to offset %ld", + cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb)); + + /* + * init starting values for the first transaction, in case + * this is the last transaction to be replayed. 
+ */ + journal->j_start = cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb); + journal->j_last_flush_trans_id = trans_id; + journal->j_trans_id = trans_id + 1; + /* check for trans_id overflow */ + if (journal->j_trans_id == 0) + journal->j_trans_id = 10; + brelse(c_bh); + brelse(d_bh); + kfree(log_blocks); + kfree(real_blocks); + return 0; +} + +/* + * This function reads blocks starting from block and to max_block of bufsize + * size (but no more than BUFNR blocks at a time). This proved to improve + * mounting speed on self-rebuilding raid5 arrays at least. + * Right now it is only used from journal code. But later we might use it + * from other places. + * Note: Do not use journal_getblk/sb_getblk functions here! + */ +static struct buffer_head *reiserfs_breada(struct block_device *dev, + b_blocknr_t block, int bufsize, + b_blocknr_t max_block) +{ + struct buffer_head *bhlist[BUFNR]; + unsigned int blocks = BUFNR; + struct buffer_head *bh; + int i, j; + + bh = __getblk(dev, block, bufsize); + if (buffer_uptodate(bh)) + return (bh); + + if (block + BUFNR > max_block) { + blocks = max_block - block; + } + bhlist[0] = bh; + j = 1; + for (i = 1; i < blocks; i++) { + bh = __getblk(dev, block + i, bufsize); + if (buffer_uptodate(bh)) { + brelse(bh); + break; + } else + bhlist[j++] = bh; + } + ll_rw_block(READ, j, bhlist); + for (i = 1; i < j; i++) + brelse(bhlist[i]); + bh = bhlist[0]; + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return bh; + brelse(bh); + return NULL; +} + +/* + * read and replay the log + * on a clean unmount, the journal header's next unflushed pointer will be + * to an invalid transaction. This tests that before finding all the + * transactions in the log, which makes normal mount times fast. + * + * After a crash, this starts with the next unflushed transaction, and + * replays until it finds one too old, or invalid. + * + * On exit, it sets things up so the first transaction will work correctly. 
+ * NOTE: only called during fs mount
+ *
+ * Returns 0 when the journal is ready for use, -EIO when the journal
+ * header block cannot be read, the negative value handed back by
+ * journal_read_transaction() when a replay fails, or -1 when the journal
+ * header block cannot be rewritten afterwards.
+ */
+static int journal_read(struct super_block *sb)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	struct reiserfs_journal_desc *desc;
+	unsigned int oldest_trans_id = 0;
+	unsigned int oldest_invalid_trans_id = 0;
+	time_t start;
+	unsigned long oldest_start = 0;
+	unsigned long cur_dblock = 0;
+	unsigned long newest_mount_id = 9;
+	struct buffer_head *d_bh;
+	struct reiserfs_journal_header *jh;
+	int valid_journal_header = 0;
+	int replay_count = 0;
+	int continue_replay = 1;
+	int ret;
+	char b[BDEVNAME_SIZE];
+
+	cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb);
+	reiserfs_info(sb, "checking transaction log (%s)\n",
+		      bdevname(journal->j_dev_bd, b));
+	start = get_seconds();
+
+	/*
+	 * step 1, read in the journal header block. Check the transaction
+	 * it says is the first unflushed, and if that transaction is not
+	 * valid, replay is done
+	 */
+	journal->j_header_bh = journal_bread(sb,
+					     SB_ONDISK_JOURNAL_1st_BLOCK(sb)
+					     + SB_ONDISK_JOURNAL_SIZE(sb));
+	if (!journal->j_header_bh) {
+		/*
+		 * journal_init() only treats negative values as failure; a
+		 * positive return here would let the mount carry on with a
+		 * NULL j_header_bh.
+		 */
+		return -EIO;
+	}
+	jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data);
+	if (le32_to_cpu(jh->j_first_unflushed_offset) <
+	    SB_ONDISK_JOURNAL_SIZE(sb)
+	    && le32_to_cpu(jh->j_last_flush_trans_id) > 0) {
+		oldest_start =
+		    SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
+		    le32_to_cpu(jh->j_first_unflushed_offset);
+		oldest_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1;
+		newest_mount_id = le32_to_cpu(jh->j_mount_id);
+		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
+			       "journal-1153: found in "
+			       "header: first_unflushed_offset %d, last_flushed_trans_id "
+			       "%lu", le32_to_cpu(jh->j_first_unflushed_offset),
+			       le32_to_cpu(jh->j_last_flush_trans_id));
+		valid_journal_header = 1;
+
+		/*
+		 * now, we try to read the first unflushed offset. If it
+		 * is not valid, there is nothing more we can do, and it
+		 * makes no sense to read through the whole log.
+		 */
+		d_bh =
+		    journal_bread(sb,
+				  SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
+				  le32_to_cpu(jh->j_first_unflushed_offset));
+		ret = journal_transaction_is_valid(sb, d_bh, NULL, NULL);
+		if (!ret) {
+			continue_replay = 0;
+		}
+		brelse(d_bh);
+		goto start_log_replay;
+	}
+
+	/*
+	 * ok, there are transactions that need to be replayed. start
+	 * with the first log block, find all the valid transactions, and
+	 * pick out the oldest.
+	 */
+	while (continue_replay
+	       && cur_dblock <
+	       (SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
+		SB_ONDISK_JOURNAL_SIZE(sb))) {
+		/*
+		 * Note that it is required for blocksize of primary fs
+		 * device and journal device to be the same
+		 */
+		d_bh =
+		    reiserfs_breada(journal->j_dev_bd, cur_dblock,
+				    sb->s_blocksize,
+				    SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
+				    SB_ONDISK_JOURNAL_SIZE(sb));
+		ret =
+		    journal_transaction_is_valid(sb, d_bh,
+						 &oldest_invalid_trans_id,
+						 &newest_mount_id);
+		if (ret == 1) {
+			desc = (struct reiserfs_journal_desc *)d_bh->b_data;
+			if (oldest_start == 0) {	/* init all oldest_ values */
+				oldest_trans_id = get_desc_trans_id(desc);
+				oldest_start = d_bh->b_blocknr;
+				newest_mount_id = get_desc_mount_id(desc);
+				reiserfs_debug(sb, REISERFS_DEBUG_CODE,
+					       "journal-1179: Setting "
+					       "oldest_start to offset %llu, trans_id %lu",
+					       oldest_start -
+					       SB_ONDISK_JOURNAL_1st_BLOCK
+					       (sb), oldest_trans_id);
+			} else if (oldest_trans_id > get_desc_trans_id(desc)) {
+				/* one we just read was older */
+				oldest_trans_id = get_desc_trans_id(desc);
+				oldest_start = d_bh->b_blocknr;
+				reiserfs_debug(sb, REISERFS_DEBUG_CODE,
+					       "journal-1180: Resetting "
+					       "oldest_start to offset %lu, trans_id %lu",
+					       oldest_start -
+					       SB_ONDISK_JOURNAL_1st_BLOCK
+					       (sb), oldest_trans_id);
+			}
+			if (newest_mount_id < get_desc_mount_id(desc)) {
+				newest_mount_id = get_desc_mount_id(desc);
+				reiserfs_debug(sb, REISERFS_DEBUG_CODE,
+					       "journal-1299: Setting "
+					       "newest_mount_id to %d",
+					       get_desc_mount_id(desc));
+			}
+			/* skip the description block, the log blocks and the commit block */
+			cur_dblock += get_desc_trans_len(desc) + 2;
+		} else {
+			cur_dblock++;
+		}
+		brelse(d_bh);
+	}
+
+start_log_replay:
+	cur_dblock = oldest_start;
+	if (oldest_trans_id) {
+		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
+			       "journal-1206: Starting replay "
+			       "from offset %llu, trans_id %lu",
+			       cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb),
+			       oldest_trans_id);
+
+	}
+	replay_count = 0;
+	while (continue_replay && oldest_trans_id > 0) {
+		ret =
+		    journal_read_transaction(sb, cur_dblock, oldest_start,
+					     oldest_trans_id, newest_mount_id);
+		if (ret < 0) {
+			return ret;
+		} else if (ret != 0) {
+			break;
+		}
+		cur_dblock =
+		    SB_ONDISK_JOURNAL_1st_BLOCK(sb) + journal->j_start;
+		replay_count++;
+		/* wrapped all the way round the log: stop */
+		if (cur_dblock == oldest_start)
+			break;
+	}
+
+	if (oldest_trans_id == 0) {
+		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
+			       "journal-1225: No valid " "transactions found");
+	}
+	/*
+	 * j_start does not get set correctly if we don't replay any
+	 * transactions. if we had a valid journal_header, set j_start
+	 * to the first unflushed transaction value, copy the trans_id
+	 * from the header
+	 */
+	if (valid_journal_header && replay_count == 0) {
+		journal->j_start = le32_to_cpu(jh->j_first_unflushed_offset);
+		journal->j_trans_id =
+		    le32_to_cpu(jh->j_last_flush_trans_id) + 1;
+		/* check for trans_id overflow */
+		if (journal->j_trans_id == 0)
+			journal->j_trans_id = 10;
+		journal->j_last_flush_trans_id =
+		    le32_to_cpu(jh->j_last_flush_trans_id);
+		journal->j_mount_id = le32_to_cpu(jh->j_mount_id) + 1;
+	} else {
+		journal->j_mount_id = newest_mount_id + 1;
+	}
+	reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1299: Setting "
+		       "newest_mount_id to %lu", journal->j_mount_id);
+	journal->j_first_unflushed_offset = journal->j_start;
+	if (replay_count > 0) {
+		reiserfs_info(sb,
+			      "replayed %d transactions in %lu seconds\n",
+			      replay_count, get_seconds() - start);
+	}
+	/* needed to satisfy the locking in _update_journal_header_block */
+	reiserfs_write_lock(sb);
+	if (!bdev_read_only(sb->s_bdev) &&
+	    _update_journal_header_block(sb, journal->j_start,
+					 journal->j_last_flush_trans_id)) {
+		reiserfs_write_unlock(sb);
+		/*
+		 * replay failed, caller must call free_journal_ram and abort
+		 * the mount
+		 */
+		return -1;
+	}
+	reiserfs_write_unlock(sb);
+	return 0;
+}
+
+/*
+ * Allocate a zeroed journal list, initialise its list heads and commit
+ * mutex, count it in j_num_lists and call get_journal_list() on it.
+ * The allocation uses __GFP_NOFAIL, so the result is used without a
+ * NULL check.
+ */
+static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s)
+{
+	struct reiserfs_journal_list *jl;
+	jl = kzalloc(sizeof(struct reiserfs_journal_list),
+		     GFP_NOFS | __GFP_NOFAIL);
+	INIT_LIST_HEAD(&jl->j_list);
+	INIT_LIST_HEAD(&jl->j_working_list);
+	INIT_LIST_HEAD(&jl->j_tail_bh_list);
+	INIT_LIST_HEAD(&jl->j_bh_list);
+	mutex_init(&jl->j_commit_mutex);
+	SB_JOURNAL(s)->j_num_lists++;
+	get_journal_list(jl);
+	return jl;
+}
+
+/* install a freshly allocated journal list as the current one */
+static void journal_list_init(struct super_block *sb)
+{
+	SB_JOURNAL(sb)->j_current_jl = alloc_journal_list(sb);
+}
+
+/* put the journal block device, if one is open, and forget about it */
+static void release_journal_dev(struct super_block *super,
+				struct reiserfs_journal *journal)
+{
+	if (journal->j_dev_bd != NULL) {
+		blkdev_put(journal->j_dev_bd, journal->j_dev_mode);
+		journal->j_dev_bd = NULL;
+	}
+}
+
+/*
+ * Open the block device that holds the journal and store it, together
+ * with the mode it was opened with, in journal->j_dev_bd / j_dev_mode.
+ *
+ * With no (or an empty) jdev_name the device number recorded in the super
+ * block is used, falling back to the file system's own device; otherwise
+ * the device is opened by path.  The device is opened read-only when the
+ * file system device is read-only.
+ *
+ * Returns 0 on success or the negative error from the blkdev_get_*() call;
+ * on failure j_dev_bd is left NULL.
+ */
+static int journal_init_dev(struct super_block *super,
+			    struct reiserfs_journal *journal,
+			    const char *jdev_name)
+{
+	int result;
+	dev_t jdev;
+	fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
+	char b[BDEVNAME_SIZE];
+
+	result = 0;
+
+	journal->j_dev_bd = NULL;
+	jdev = SB_ONDISK_JOURNAL_DEVICE(super) ?
+	    new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev;
+
+	if (bdev_read_only(super->s_bdev))
+		blkdev_mode = FMODE_READ;
+
+	/*
+	 * there is no "jdev" option: open the device recorded in the super
+	 * block, which is the file system device itself when none is
+	 * recorded
+	 */
+	if ((!jdev_name || !jdev_name[0])) {
+		/* the fs device is already open, it cannot be claimed exclusively */
+		if (jdev == super->s_dev)
+			blkdev_mode &= ~FMODE_EXCL;
+		journal->j_dev_bd = blkdev_get_by_dev(jdev, blkdev_mode,
+						      journal);
+		journal->j_dev_mode = blkdev_mode;
+		if (IS_ERR(journal->j_dev_bd)) {
+			result = PTR_ERR(journal->j_dev_bd);
+			journal->j_dev_bd = NULL;
+			reiserfs_warning(super, "sh-458",
+					 "cannot init journal device '%s': %i",
+					 __bdevname(jdev, b), result);
+			return result;
+		} else if (jdev != super->s_dev)
+			set_blocksize(journal->j_dev_bd, super->s_blocksize);
+
+		return 0;
+	}
+
+	journal->j_dev_mode = blkdev_mode;
+	journal->j_dev_bd = blkdev_get_by_path(jdev_name, blkdev_mode, journal);
+	if (IS_ERR(journal->j_dev_bd)) {
+		result = PTR_ERR(journal->j_dev_bd);
+		journal->j_dev_bd = NULL;
+		/*
+		 * reiserfs_warning() takes a message id before the format;
+		 * without it the format was consumed as the id and the
+		 * caller-supplied device name was used as the format string.
+		 */
+		reiserfs_warning(super, "sh-457",
+				 "journal_init_dev: Cannot open '%s': %i",
+				 jdev_name, result);
+		return result;
+	}
+
+	set_blocksize(journal->j_dev_bd, super->s_blocksize);
+	reiserfs_info(super,
+		      "journal_init_dev: journal device: %s\n",
+		      bdevname(journal->j_dev_bd, b));
+	return 0;
+}
+
+/*
+ * When creating/tuning a file system user can assign some
+ * journal params within boundaries which depend on the ratio
+ * blocksize/standard_blocksize.
+ *
+ * For blocks >= standard_blocksize transaction size should
+ * be not less than JOURNAL_TRANS_MIN_DEFAULT, and not more
+ * than JOURNAL_TRANS_MAX_DEFAULT.
+ *
+ * For blocks < standard_blocksize these boundaries should be
+ * decreased proportionally.
+ *
+ * Returns 0 when the parameters are usable (filling in the defaults
+ * when j_trans_max is zero) and 1 after warning about a bad value.
+ */
+#define REISERFS_STANDARD_BLKSIZE (4096)
+
+static int check_advise_trans_params(struct super_block *sb,
+				     struct reiserfs_journal *journal)
+{
+	if (journal->j_trans_max) {
+		/* Non-default journal params. Do sanity check for them. */
+		int ratio = 1;
+		if (sb->s_blocksize < REISERFS_STANDARD_BLKSIZE)
+			ratio = REISERFS_STANDARD_BLKSIZE / sb->s_blocksize;
+
+		if (journal->j_trans_max > JOURNAL_TRANS_MAX_DEFAULT / ratio ||
+		    journal->j_trans_max < JOURNAL_TRANS_MIN_DEFAULT / ratio ||
+		    SB_ONDISK_JOURNAL_SIZE(sb) / journal->j_trans_max <
+		    JOURNAL_MIN_RATIO) {
+			reiserfs_warning(sb, "sh-462",
+					 "bad transaction max size (%u). "
+					 "FSCK?", journal->j_trans_max);
+			return 1;
+		}
+		/* max batch must keep the default batch/trans_max proportion */
+		if (journal->j_max_batch != (journal->j_trans_max) *
+		        JOURNAL_MAX_BATCH_DEFAULT/JOURNAL_TRANS_MAX_DEFAULT) {
+			reiserfs_warning(sb, "sh-463",
+				         "bad transaction max batch (%u). "
+					 "FSCK?", journal->j_max_batch);
+			return 1;
+		}
+	} else {
+		/*
+		 * Default journal params.
+		 * The file system was created by old version
+		 * of mkreiserfs, so some fields contain zeros,
+		 * and we need to advise proper values for them
+		 */
+		if (sb->s_blocksize != REISERFS_STANDARD_BLKSIZE) {
+			reiserfs_warning(sb, "sh-464", "bad blocksize (%u)",
+					 sb->s_blocksize);
+			return 1;
+		}
+		journal->j_trans_max = JOURNAL_TRANS_MAX_DEFAULT;
+		journal->j_max_batch = JOURNAL_MAX_BATCH_DEFAULT;
+		journal->j_max_commit_age = JOURNAL_MAX_COMMIT_AGE;
+	}
+	return 0;
+}
+
+/* must be called once on fs mount.
calls journal_read for you
+ *
+ * Allocates the in-memory journal, opens the journal device named by
+ * j_dev_name (see journal_init_dev()), takes the transaction limits from
+ * the on-disk journal header and replays the log.  old_format selects how
+ * the first reserved journal block is computed; a non-zero commit_max_age
+ * overrides both the maximum commit age and the maximum transaction age.
+ *
+ * Returns 0 on success.  Returns 1 on failure; every failure after the
+ * journal structure has been allocated goes through free_journal_ram().
+ */
+int journal_init(struct super_block *sb, const char *j_dev_name,
+		 int old_format, unsigned int commit_max_age)
+{
+	int num_cnodes = SB_ONDISK_JOURNAL_SIZE(sb) * 2;
+	struct buffer_head *bhjh;
+	struct reiserfs_super_block *rs;
+	struct reiserfs_journal_header *jh;
+	struct reiserfs_journal *journal;
+	struct reiserfs_journal_list *jl;
+	char b[BDEVNAME_SIZE];
+	int ret;
+
+	journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal));
+	if (!journal) {
+		reiserfs_warning(sb, "journal-1256",
+				 "unable to get memory for journal structure");
+		return 1;
+	}
+	INIT_LIST_HEAD(&journal->j_bitmap_nodes);
+	INIT_LIST_HEAD(&journal->j_prealloc_list);
+	INIT_LIST_HEAD(&journal->j_working_list);
+	INIT_LIST_HEAD(&journal->j_journal_list);
+	journal->j_persistent_trans = 0;
+	if (reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap,
+					   reiserfs_bmap_count(sb)))
+		goto free_and_return;
+
+	allocate_bitmap_nodes(sb);
+
+	/* reserved for journal area support */
+	SB_JOURNAL_1st_RESERVED_BLOCK(sb) = (old_format ?
+						 REISERFS_OLD_DISK_OFFSET_IN_BYTES
+						 / sb->s_blocksize +
+						 reiserfs_bmap_count(sb) +
+						 1 :
+						 REISERFS_DISK_OFFSET_IN_BYTES /
+						 sb->s_blocksize + 2);
+
+	/*
+	 * Sanity check to see is the standard journal fitting
+	 * within first bitmap (actual for small blocksizes)
+	 */
+	if (!SB_ONDISK_JOURNAL_DEVICE(sb) &&
+	    (SB_JOURNAL_1st_RESERVED_BLOCK(sb) +
+	     SB_ONDISK_JOURNAL_SIZE(sb) > sb->s_blocksize * 8)) {
+		reiserfs_warning(sb, "journal-1393",
+				 "journal does not fit for area addressed "
+				 "by first of bitmap blocks. It starts at "
+				 "%u and its size is %u. Block size %ld",
+				 SB_JOURNAL_1st_RESERVED_BLOCK(sb),
+				 SB_ONDISK_JOURNAL_SIZE(sb),
+				 sb->s_blocksize);
+		goto free_and_return;
+	}
+
+	if (journal_init_dev(sb, journal, j_dev_name) != 0) {
+		reiserfs_warning(sb, "sh-462",
+				 "unable to initialize journal device");
+		goto free_and_return;
+	}
+
+	rs = SB_DISK_SUPER_BLOCK(sb);
+
+	/*
+	 * read journal header.  bhjh is only needed until the parameters
+	 * below have been copied out of it, so every exit taken while it
+	 * is held must brelse() it.
+	 */
+	bhjh = journal_bread(sb,
+			     SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
+			     SB_ONDISK_JOURNAL_SIZE(sb));
+	if (!bhjh) {
+		reiserfs_warning(sb, "sh-459",
+				 "unable to read journal header");
+		goto free_and_return;
+	}
+	jh = (struct reiserfs_journal_header *)(bhjh->b_data);
+
+	/* make sure that journal matches to the super block */
+	if (is_reiserfs_jr(rs)
+	    && (le32_to_cpu(jh->jh_journal.jp_journal_magic) !=
+		sb_jp_journal_magic(rs))) {
+		/* print the magic in cpu order, the same value that was compared */
+		reiserfs_warning(sb, "sh-460",
+				 "journal header magic %x (device %s) does "
+				 "not match to magic found in super block %x",
+				 le32_to_cpu(jh->jh_journal.jp_journal_magic),
+				 bdevname(journal->j_dev_bd, b),
+				 sb_jp_journal_magic(rs));
+		brelse(bhjh);
+		goto free_and_return;
+	}
+
+	journal->j_trans_max = le32_to_cpu(jh->jh_journal.jp_journal_trans_max);
+	journal->j_max_batch = le32_to_cpu(jh->jh_journal.jp_journal_max_batch);
+	journal->j_max_commit_age =
+	    le32_to_cpu(jh->jh_journal.jp_journal_max_commit_age);
+	journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE;
+
+	if (check_advise_trans_params(sb, journal) != 0) {
+		/* the header buffer is still held here, do not leak it */
+		brelse(bhjh);
+		goto free_and_return;
+	}
+	journal->j_default_max_commit_age = journal->j_max_commit_age;
+
+	if (commit_max_age != 0) {
+		journal->j_max_commit_age = commit_max_age;
+		journal->j_max_trans_age = commit_max_age;
+	}
+
+	reiserfs_info(sb, "journal params: device %s, size %u, "
+		      "journal first block %u, max trans len %u, max batch %u, "
+		      "max commit age %u, max trans age %u\n",
+		      bdevname(journal->j_dev_bd, b),
+		      SB_ONDISK_JOURNAL_SIZE(sb),
+		      SB_ONDISK_JOURNAL_1st_BLOCK(sb),
+		      journal->j_trans_max,
+		      journal->j_max_batch,
+		      journal->j_max_commit_age, journal->j_max_trans_age);
+
+	brelse(bhjh);
+
+	journal->j_list_bitmap_index = 0;
+	journal_list_init(sb);
+
+	memset(journal->j_list_hash_table, 0,
+	       JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *));
+
+	INIT_LIST_HEAD(&journal->j_dirty_buffers);
+	spin_lock_init(&journal->j_dirty_buffers_lock);
+
+	journal->j_start = 0;
+	journal->j_len = 0;
+	journal->j_len_alloc = 0;
+	atomic_set(&journal->j_wcount, 0);
+	atomic_set(&journal->j_async_throttle, 0);
+	journal->j_bcount = 0;
+	journal->j_trans_start_time = 0;
+	journal->j_last = NULL;
+	journal->j_first = NULL;
+	init_waitqueue_head(&journal->j_join_wait);
+	mutex_init(&journal->j_mutex);
+	mutex_init(&journal->j_flush_mutex);
+
+	journal->j_trans_id = 10;
+	journal->j_mount_id = 10;
+	journal->j_state = 0;
+	atomic_set(&journal->j_jlock, 0);
+	journal->j_cnode_free_list = allocate_cnodes(num_cnodes);
+	journal->j_cnode_free_orig = journal->j_cnode_free_list;
+	journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0;
+	journal->j_cnode_used = 0;
+	journal->j_must_wait = 0;
+
+	if (journal->j_cnode_free == 0) {
+		reiserfs_warning(sb, "journal-2004", "Journal cnode memory "
+		                 "allocation failed (%ld bytes). Journal is "
+		                 "too large for available memory. Usually "
+		                 "this is due to a journal that is too large.",
+		                 sizeof (struct reiserfs_journal_cnode) * num_cnodes);
+		goto free_and_return;
+	}
+
+	init_journal_hash(sb);
+	jl = journal->j_current_jl;
+
+	/*
+	 * get_list_bitmap() may call flush_commit_list() which
+	 * requires the lock. Calling flush_commit_list() shouldn't happen
+	 * this early but I like to be paranoid.
+	 */
+	reiserfs_write_lock(sb);
+	jl->j_list_bitmap = get_list_bitmap(sb, jl);
+	reiserfs_write_unlock(sb);
+	if (!jl->j_list_bitmap) {
+		reiserfs_warning(sb, "journal-2005",
+				 "get_list_bitmap failed for journal list 0");
+		goto free_and_return;
+	}
+
+	ret = journal_read(sb);
+	if (ret < 0) {
+		reiserfs_warning(sb, "reiserfs-2006",
+				 "Replay Failure, unable to mount");
+		goto free_and_return;
+	}
+
+	INIT_DELAYED_WORK(&journal->j_work, flush_async_commits);
+	journal->j_work_sb = sb;
+	return 0;
+free_and_return:
+	free_journal_ram(sb);
+	return 1;
+}
+
+/*
+ * test for a polite end of the current transaction.  Used by file_write,
+ * and should be used by delete to make sure they don't write more than
+ * can fit inside a single transaction
+ *
+ * Returns 1 when the caller should end the transaction.  Returns 0 when
+ * it may continue; in that case new_alloc blocks have been added to the
+ * handle's and the journal's allocation counts, unless the handle is
+ * nested (t_refcount > 1), which returns 0 without reserving anything.
+ */
+int journal_transaction_should_end(struct reiserfs_transaction_handle *th,
+				   int new_alloc)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(th->t_super);
+	time_t now = get_seconds();
+	/* cannot restart while nested */
+	BUG_ON(!th->t_trans_id);
+	if (th->t_refcount > 1)
+		return 0;
+	if (journal->j_must_wait > 0 ||
+	    (journal->j_len_alloc + new_alloc) >= journal->j_max_batch ||
+	    atomic_read(&journal->j_jlock) ||
+	    (now - journal->j_trans_start_time) > journal->j_max_trans_age ||
+	    journal->j_cnode_free < (journal->j_trans_max * 3)) {
+		return 1;
+	}
+
+	journal->j_len_alloc += new_alloc;
+	th->t_blocks_allocated += new_alloc ;
+	return 0;
+}
+
+/* this must be called inside a transaction */
+void reiserfs_block_writes(struct reiserfs_transaction_handle *th)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(th->t_super);
+	BUG_ON(!th->t_trans_id);
+	journal->j_must_wait = 1;
+	set_bit(J_WRITERS_BLOCKED, &journal->j_state);
+	return;
+}
+
+/* this must be called without a transaction started */
+void reiserfs_allow_writes(struct super_block *s)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+	clear_bit(J_WRITERS_BLOCKED, &journal->j_state);
+	wake_up(&journal->j_join_wait);
+}
+
+/* this must be called
without a transaction started */
+void reiserfs_wait_on_write_block(struct super_block *s)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+	wait_event(journal->j_join_wait,
+		   !test_bit(J_WRITERS_BLOCKED, &journal->j_state));
+}
+/*
+ * Set J_WRITERS_QUEUED and sleep at most once on j_join_wait.  The write
+ * lock is dropped around schedule() and retaken afterwards.
+ */
+static void queue_log_writer(struct super_block *s)
+{
+	wait_queue_t wait;
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+	set_bit(J_WRITERS_QUEUED, &journal->j_state);
+
+	/*
+	 * we don't want to use wait_event here because
+	 * we only want to wait once.
+	 */
+	init_waitqueue_entry(&wait, current);
+	add_wait_queue(&journal->j_join_wait, &wait);
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) {
+		int depth = reiserfs_write_unlock_nested(s);
+		schedule();
+		reiserfs_write_lock_nested(s, depth);
+	}
+	__set_current_state(TASK_RUNNING);
+	remove_wait_queue(&journal->j_join_wait, &wait);
+}
+/*
+ * If J_WRITERS_QUEUED was set, clear it and wake up everything sleeping
+ * on j_join_wait.
+ */
+static void wake_queued_writers(struct super_block *s)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+	if (test_and_clear_bit(J_WRITERS_QUEUED, &journal->j_state))
+		wake_up(&journal->j_join_wait);
+}
+/*
+ * Give other writers a chance to add to transaction trans_id: repeatedly
+ * sleep for one tick with the write lock dropped, mark the current journal
+ * list LIST_COMMIT_PENDING and wait for writers/lockers to go away.  Stops
+ * as soon as the transaction id has moved on or j_bcount did not change
+ * during the last pass.
+ */
+static void let_transaction_grow(struct super_block *sb, unsigned int trans_id)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	unsigned long bcount = journal->j_bcount;
+	while (1) {
+		int depth;
+
+		depth = reiserfs_write_unlock_nested(sb);
+		schedule_timeout_uninterruptible(1);
+		reiserfs_write_lock_nested(sb, depth);
+
+		journal->j_current_jl->j_state |= LIST_COMMIT_PENDING;
+		while ((atomic_read(&journal->j_wcount) > 0 ||
+			atomic_read(&journal->j_jlock)) &&
+		       journal->j_trans_id == trans_id) {
+			queue_log_writer(sb);
+		}
+		if (journal->j_trans_id != trans_id)
+			break;
+		if (bcount == journal->j_bcount)
+			break;
+		bcount = journal->j_bcount;
+	}
+}
+
+/*
+ * join == true if you must join an existing transaction.
+ * join == false if you can deal with waiting for others to finish
+ *
+ * this will block until the transaction is joinable.
send the number of
+ * blocks you expect to use in nblocks.
+*/
+static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
+			      struct super_block *sb, unsigned long nblocks,
+			      int join)
+{
+	time_t now = get_seconds();
+	unsigned int old_trans_id;
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	struct reiserfs_transaction_handle myth;
+	int sched_count = 0;
+	int retval;
+	int depth;
+
+	reiserfs_check_lock_depth(sb, "journal_begin");
+	BUG_ON(nblocks > journal->j_trans_max);
+
+	PROC_INFO_INC(sb, journal.journal_being);
+	/* set here for journal_join */
+	th->t_refcount = 1;
+	th->t_super = sb;
+
+relock:
+	lock_journal(sb);
+	if (join != JBEGIN_ABORT && reiserfs_is_journal_aborted(journal)) {
+		unlock_journal(sb);
+		retval = journal->j_errno;
+		goto out_fail;
+	}
+	journal->j_bcount++;
+
+	if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) {
+		unlock_journal(sb);
+		depth = reiserfs_write_unlock_nested(sb);
+		reiserfs_wait_on_write_block(sb);
+		reiserfs_write_lock_nested(sb, depth);
+		PROC_INFO_INC(sb, journal.journal_relock_writers);
+		goto relock;
+	}
+	now = get_seconds();
+
+	/*
+	 * if there is no room in the journal OR
+	 * if this transaction is too old, and we weren't called joinable,
+	 * wait for it to finish before beginning we don't sleep if there
+	 * aren't other writers
+	 *
+	 * every term below is guarded by !join, so a joining caller never
+	 * takes this branch
+	 */
+
+	if ((!join && journal->j_must_wait > 0) ||
+	    (!join
+	     && (journal->j_len_alloc + nblocks + 2) >= journal->j_max_batch)
+	    || (!join && atomic_read(&journal->j_wcount) > 0
+		&& journal->j_trans_start_time > 0
+		&& (now - journal->j_trans_start_time) >
+		journal->j_max_trans_age) || (!join
+					      && atomic_read(&journal->j_jlock))
+	    || (!join && journal->j_cnode_free < (journal->j_trans_max * 3))) {
+
+		old_trans_id = journal->j_trans_id;
+		/* allow others to finish this transaction */
+		unlock_journal(sb);
+
+		if (!join && (journal->j_len_alloc + nblocks + 2) >=
+		    journal->j_max_batch &&
+		    ((journal->j_len + nblocks + 2) * 100) <
+		    (journal->j_len_alloc * 75)) {
+			if (atomic_read(&journal->j_wcount) > 10) {
+				sched_count++;
+				queue_log_writer(sb);
+				goto relock;
+			}
+		}
+		/*
+		 * don't mess with joining the transaction if all we
+		 * have to do is wait for someone else to do a commit
+		 */
+		if (atomic_read(&journal->j_jlock)) {
+			while (journal->j_trans_id == old_trans_id &&
+			       atomic_read(&journal->j_jlock)) {
+				queue_log_writer(sb);
+			}
+			goto relock;
+		}
+		retval = journal_join(&myth, sb);
+		if (retval)
+			goto out_fail;
+
+		/* someone might have ended the transaction while we joined */
+		if (old_trans_id != journal->j_trans_id) {
+			retval = do_journal_end(&myth, 0);
+		} else {
+			retval = do_journal_end(&myth, COMMIT_NOW);
+		}
+
+		if (retval)
+			goto out_fail;
+
+		PROC_INFO_INC(sb, journal.journal_relock_wcount);
+		goto relock;
+	}
+	/* we are the first writer, record when the transaction started */
+	if (journal->j_trans_start_time == 0) {
+		journal->j_trans_start_time = get_seconds();
+	}
+	atomic_inc(&journal->j_wcount);
+	journal->j_len_alloc += nblocks;
+	th->t_blocks_logged = 0;
+	th->t_blocks_allocated = nblocks;
+	th->t_trans_id = journal->j_trans_id;
+	unlock_journal(sb);
+	INIT_LIST_HEAD(&th->t_list);
+	return 0;
+
+out_fail:
+	memset(th, 0, sizeof(*th));
+	/*
+	 * Re-set th->t_super, so we can properly keep track of how many
+	 * persistent transactions there are. We need to do this so if this
+	 * call is part of a failed restart_transaction, we can free it later
+	 *
+	 * The handle is otherwise all zeroes on failure, so t_trans_id is 0,
+	 * and retval carries the error back to the caller.
+	 */
+	th->t_super = sb;
+	return retval;
+}
+/*
+ * Start a transaction whose handle is allocated with kmalloc(), or nest
+ * into the transaction the task is already running.  Returns the handle,
+ * or NULL when the allocation or journal_begin() fails.
+ */
+struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct
+								    super_block
+								    *s,
+								    int nblocks)
+{
+	int ret;
+	struct reiserfs_transaction_handle *th;
+
+	/*
+	 * if we're nesting into an existing transaction. It will be
+	 * persistent on its own
+	 */
+	if (reiserfs_transaction_running(s)) {
+		th = current->journal_info;
+		th->t_refcount++;
+		BUG_ON(th->t_refcount < 2);
+
+		return th;
+	}
+	th = kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS);
+	if (!th)
+		return NULL;
+	ret = journal_begin(th, s, nblocks);
+	if (ret) {
+		kfree(th);
+		return NULL;
+	}
+
+	SB_JOURNAL(s)->j_persistent_trans++;
+	return th;
+}
+/*
+ * End a handle obtained from reiserfs_persistent_transaction().  The handle
+ * is freed once its t_refcount is 0.  Returns the result of journal_end(),
+ * or -EIO when the handle has no transaction id.
+ */
+int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th)
+{
+	struct super_block *s = th->t_super;
+	int ret = 0;
+	if (th->t_trans_id)
+		ret = journal_end(th);
+	else
+		ret = -EIO;
+	if (th->t_refcount == 0) {
+		SB_JOURNAL(s)->j_persistent_trans--;
+		kfree(th);
+	}
+	return ret;
+}
+/*
+ * Begin with JBEGIN_JOIN and a one block reservation, remembering the
+ * task's current handle in t_handle_save.
+ */
+static int journal_join(struct reiserfs_transaction_handle *th,
+			struct super_block *sb)
+{
+	struct reiserfs_transaction_handle *cur_th = current->journal_info;
+
+	/*
+	 * this keeps do_journal_end from NULLing out the
+	 * current->journal_info pointer
+	 */
+	th->t_handle_save = cur_th;
+	BUG_ON(cur_th && cur_th->t_refcount > 1);
+	return do_journal_begin_r(th, sb, 1, JBEGIN_JOIN);
+}
+/*
+ * Same as journal_join(), but with JBEGIN_ABORT, the one mode that
+ * do_journal_begin_r() lets through after the journal has been aborted.
+ */
+int journal_join_abort(struct reiserfs_transaction_handle *th,
+		       struct super_block *sb)
+{
+	struct reiserfs_transaction_handle *cur_th = current->journal_info;
+
+	/*
+	 * this keeps do_journal_end from NULLing out the
+	 * current->journal_info pointer
+	 */
+	th->t_handle_save = cur_th;
+	BUG_ON(cur_th && cur_th->t_refcount > 1);
+	return do_journal_begin_r(th, sb, 1, JBEGIN_ABORT);
+}
+/*
+ * Begin a regular (JBEGIN_REG) transaction reserving nblocks blocks and
+ * make th the task's current handle.  When the task already holds a handle
+ * on the same super block the call nests instead: that handle's refcount
+ * is bumped and its contents are copied into th.  Returns 0 on success or
+ * the error from do_journal_begin_r().
+ */
+int journal_begin(struct reiserfs_transaction_handle *th,
+		  struct super_block *sb, unsigned long nblocks)
+{
+	struct reiserfs_transaction_handle *cur_th = current->journal_info;
+	int ret;
+
+	th->t_handle_save = NULL;
+	if (cur_th) {
+		/* we are nesting into the current transaction */
+		if (cur_th->t_super == sb) {
+			BUG_ON(!cur_th->t_refcount);
+			cur_th->t_refcount++;
+			memcpy(th, cur_th, sizeof(*th));
+			if (th->t_refcount <= 1)
+				reiserfs_warning(sb, "reiserfs-2005",
+						 "BAD: refcount <= 1, but "
+						 "journal_info != 0");
+			return 0;
+		} else {
+			/*
+			 * we've ended up with a handle from a different
+			 * filesystem.  save it and restore on journal_end.
+			 * This should never really happen...
+			 */
+			reiserfs_warning(sb, "clm-2100",
+					 "nesting info a different FS");
+			th->t_handle_save = current->journal_info;
+			current->journal_info = th;
+		}
+	} else {
+		current->journal_info = th;
+	}
+	ret = do_journal_begin_r(th, sb, nblocks, JBEGIN_REG);
+	BUG_ON(current->journal_info != th);
+
+	/*
+	 * I guess this boils down to being the reciprocal of clm-2100 above.
+	 * If do_journal_begin_r fails, we need to put it back, since
+	 * journal_end won't be called to do it. */
+	if (ret)
+		current->journal_info = th->t_handle_save;
+	else
+		BUG_ON(!th->t_refcount);
+
+	return ret;
+}
+
+/*
+ * puts bh into the current transaction.  If it was already there, reorders
+ * removes the old pointers from the hash, and puts new ones in (to make
+ * sure replay happen in the right order).
+ *
+ * if it was dirty, cleans and files onto the clean list.  I can't let it
+ * be dirty again until the transaction is committed.
+ *
+ * if j_len, is bigger than j_len_alloc, it pushes j_len_alloc to 10 + j_len.
+ *
+ * Returns 0 once bh is part of the running transaction (including when it
+ * already was), or 1 when the journal has no writers (j_wcount <= 0).
+ */
+int journal_mark_dirty(struct reiserfs_transaction_handle *th,
+		       struct buffer_head *bh)
+{
+	struct super_block *sb = th->t_super;
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	struct reiserfs_journal_cnode *cn;
+	int count_already_incd = 0;
+	int prepared = 0;
+	BUG_ON(!th->t_trans_id);
+
+	PROC_INFO_INC(sb, journal.mark_dirty);
+	if (th->t_trans_id != journal->j_trans_id) {
+		reiserfs_panic(th->t_super, "journal-1577",
+			       "handle trans id %ld != current trans id %ld",
+			       th->t_trans_id, journal->j_trans_id);
+	}
+
+	prepared = test_clear_buffer_journal_prepared(bh);
+	clear_buffer_journal_restore_dirty(bh);
+	/* already in this transaction, we are done */
+	if (buffer_journaled(bh)) {
+		PROC_INFO_INC(sb, journal.mark_dirty_already);
+		return 0;
+	}
+
+	/*
+	 * this must be turned into a panic instead of a warning.  We can't
+	 * allow a dirty or journal_dirty or locked buffer to be logged, as
+	 * some changes could get to disk too early.  NOT GOOD.
+	 */
+	if (!prepared || buffer_dirty(bh)) {
+		reiserfs_warning(sb, "journal-1777",
+				 "buffer %llu bad state "
+				 "%cPREPARED %cLOCKED %cDIRTY %cJDIRTY_WAIT",
+				 (unsigned long long)bh->b_blocknr,
+				 prepared ? ' ' : '!',
+				 buffer_locked(bh) ? ' ' : '!',
+				 buffer_dirty(bh) ? ' ' : '!',
+				 buffer_journal_dirty(bh) ? ' ' : '!');
+	}
+
+	if (atomic_read(&journal->j_wcount) <= 0) {
+		reiserfs_warning(sb, "journal-1409",
+				 "returning because j_wcount was %d",
+				 atomic_read(&journal->j_wcount));
+		return 1;
+	}
+	/*
+	 * this error means I've screwed up, and we've overflowed
+	 * the transaction.  Nothing can be done here, except make the
+	 * FS readonly or panic.
+	 */
+	if (journal->j_len >= journal->j_trans_max) {
+		reiserfs_panic(th->t_super, "journal-1413",
+			       "j_len (%lu) is too big",
+			       journal->j_len);
+	}
+
+	/*
+	 * a journal_dirty buffer already carries the reference taken when
+	 * it was logged before, so no extra get_bh() is needed below
+	 */
+	if (buffer_journal_dirty(bh)) {
+		count_already_incd = 1;
+		PROC_INFO_INC(sb, journal.mark_dirty_notjournal);
+		clear_buffer_journal_dirty(bh);
+	}
+
+	if (journal->j_len > journal->j_len_alloc) {
+		journal->j_len_alloc = journal->j_len + JOURNAL_PER_BALANCE_CNT;
+	}
+
+	set_buffer_journaled(bh);
+
+	/*
+	 * now put this guy on the end.  A buffer that reaches this point
+	 * never has a cnode in the running transaction yet (the
+	 * buffer_journaled() test above returned early otherwise), so one
+	 * is always allocated here.
+	 */
+	cn = get_cnode(sb);
+	if (!cn) {
+		reiserfs_panic(sb, "journal-4", "get_cnode failed!");
+	}
+
+	/* the handle used up its reservation: grow it and the journal's */
+	if (th->t_blocks_logged == th->t_blocks_allocated) {
+		th->t_blocks_allocated += JOURNAL_PER_BALANCE_CNT;
+		journal->j_len_alloc += JOURNAL_PER_BALANCE_CNT;
+	}
+	th->t_blocks_logged++;
+	journal->j_len++;
+
+	cn->bh = bh;
+	cn->blocknr = bh->b_blocknr;
+	cn->sb = sb;
+	cn->jlist = NULL;
+	insert_journal_hash(journal->j_hash_table, cn);
+	if (!count_already_incd) {
+		get_bh(bh);
+	}
+
+	/* append cn to the j_first..j_last list of the running transaction */
+	cn->next = NULL;
+	cn->prev = journal->j_last;
+	if (journal->j_last) {
+		journal->j_last->next = cn;
+		journal->j_last = cn;
+	} else {
+		journal->j_first = cn;
+		journal->j_last = cn;
+	}
+	reiserfs_schedule_old_flush(sb);
+	return 0;
+}
+
+/*
+ * Drop one reference on th.  While references remain (a nested handle) the
+ * state of th is copied back into the task's current handle, th is marked
+ * ended by clearing its transaction id, and 0 is returned.  The last
+ * reference ends the transaction through do_journal_end(th, 0).
+ *
+ * Returns -EIO, after a WARN_ON, when th has no transaction id.
+ */
+int journal_end(struct reiserfs_transaction_handle *th)
+{
+	struct super_block *sb = th->t_super;
+	if (!current->journal_info && th->t_refcount > 1)
+		reiserfs_warning(sb, "REISER-NESTING",
+				 "th NULL, refcount %d", th->t_refcount);
+
+	if (!th->t_trans_id) {
+		WARN_ON(1);
+		return -EIO;
+	}
+
+	th->t_refcount--;
+	if (th->t_refcount > 0) {
+		struct reiserfs_transaction_handle *cur_th =
+		    current->journal_info;
+
+		/*
+		 * we aren't allowed to close a nested transaction on a
+		 * different filesystem from the one in the task struct
+		 */
+		BUG_ON(cur_th->t_super != th->t_super);
+
+		if (th != cur_th) {
+			memcpy(current->journal_info, th, sizeof(*th));
+			th->t_trans_id = 0;
+		}
+		return 0;
+	} else {
+		return do_journal_end(th, 0);
+	}
+}
+
+/*
+ * removes from the current transaction, releasing and decrementing any
+ * counters.  also files the removed buffer directly onto the clean list
+ *
+ * called by journal_mark_freed when a block has been deleted
+ *
+ * returns 1 if it cleaned and released the buffer. 0 otherwise (no cnode
+ * with a buffer was found for blocknr, or already_cleaned was set)
+ */
+static int remove_from_transaction(struct super_block *sb,
+				   b_blocknr_t blocknr, int already_cleaned)
+{
+	struct buffer_head *bh;
+	struct reiserfs_journal_cnode *cn;
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	int ret = 0;
+
+	cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr);
+	if (!cn || !cn->bh) {
+		return ret;
+	}
+	bh = cn->bh;
+	if (cn->prev) {
+		cn->prev->next = cn->next;
+	}
+	if (cn->next) {
+		cn->next->prev = cn->prev;
+	}
+	if (cn == journal->j_first) {
+		journal->j_first = cn->next;
+	}
+	if (cn == journal->j_last) {
+		journal->j_last = cn->prev;
+	}
+	if (bh)
+		remove_journal_hash(sb, journal->j_hash_table, NULL,
+				    bh->b_blocknr, 0);
+	clear_buffer_journaled(bh);	/* don't log this one */
+
+	if (!already_cleaned) {
+		clear_buffer_journal_dirty(bh);
+		clear_buffer_dirty(bh);
+		clear_buffer_journal_test(bh);
+		put_bh(bh);
+		if (atomic_read(&bh->b_count) < 0) {
+			reiserfs_warning(sb, "journal-1752",
+					 "b_count < 0");
+		}
+		ret = 1;
+	}
+	journal->j_len--;
+	journal->j_len_alloc--;
+	free_cnode(sb, cn);
+	return ret;
+}
+
+/*
+ * for any cnode in a journal list, it can only be dirtied if all the
+ * transactions that include it are committed to disk.
+ * this checks through each transaction, and returns 1 if you are allowed
+ * to dirty, and 0 if you aren't
+ *
+ * it is called by dirty_journal_list, which is called after
+ * flush_commit_list has gotten all the log blocks for a given
+ * transaction on disk
+ *
+ */
+static int can_dirty(struct reiserfs_journal_cnode *cn)
+{
+	struct super_block *sb = cn->sb;
+	b_blocknr_t blocknr = cn->blocknr;
+	struct reiserfs_journal_cnode *cur = cn->hprev;
+	int can_dirty = 1;
+
+	/*
+	 * first test hprev.  These are all newer than cn, so any node here
+	 * with the same block number and dev means this node can't be sent
+	 * to disk right now.
+	 */
+	while (cur && can_dirty) {
+		if (cur->jlist && cur->bh && cur->blocknr && cur->sb == sb &&
+		    cur->blocknr == blocknr) {
+			can_dirty = 0;
+		}
+		cur = cur->hprev;
+	}
+	/*
+	 * then test hnext.  These are all older than cn.  As long as they
+	 * are committed to the log, it is safe to write cn to disk
+	 * (a list with j_commit_left > 0 is treated as not committed yet)
+	 */
+	cur = cn->hnext;
+	while (cur && can_dirty) {
+		if (cur->jlist && cur->jlist->j_len > 0 &&
+		    atomic_read(&cur->jlist->j_commit_left) > 0 && cur->bh &&
+		    cur->blocknr && cur->sb == sb && cur->blocknr == blocknr) {
+			can_dirty = 0;
+		}
+		cur = cur->hnext;
+	}
+	return can_dirty;
+}
+
+/*
+ * syncs the commit blocks, but does not force the real buffers to disk
+ * will wait until the current transaction is done/committed before returning
+ *
+ * when the transaction is empty the super block buffer is logged first;
+ * returns the result of do_journal_end()
+ */
+int journal_end_sync(struct reiserfs_transaction_handle *th)
+{
+	struct super_block *sb = th->t_super;
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+
+	BUG_ON(!th->t_trans_id);
+	/* you can't sync while nested, very, very bad */
+	BUG_ON(th->t_refcount > 1);
+	if (journal->j_len == 0) {
+		reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
+					     1);
+		journal_mark_dirty(th, SB_BUFFER_WITH_SB(sb));
+	}
+	return do_journal_end(th, COMMIT_NOW | WAIT);
+}
+
+/*
+ * writeback the pending async commits to disk
+ *
+ * work handler installed on journal->j_work; runs under the write lock
+ */
+static void flush_async_commits(struct work_struct *work)
+{
+	struct reiserfs_journal *journal =
+		container_of(work, struct reiserfs_journal, j_work.work);
+	struct super_block *sb = journal->j_work_sb;
+	struct reiserfs_journal_list *jl;
+	struct list_head *entry;
+
+	reiserfs_write_lock(sb);
+	if (!list_empty(&journal->j_journal_list)) {
+		/* last entry is the youngest, commit it and you get everything */
+		entry = journal->j_journal_list.prev;
+		jl = JOURNAL_LIST_ENTRY(entry);
+		flush_commit_list(sb, jl, 1);
+	}
+	reiserfs_write_unlock(sb);
+}
+
+/*
+ * flushes any old
transactions to disk
+ * ends the current transaction if it is too old
+ */
+void reiserfs_flush_old_commits(struct super_block *sb)
+{
+	time_t now;
+	struct reiserfs_transaction_handle th;
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+
+	now = get_seconds();
+	/*
+	 * safety check so we don't flush while we are replaying the log during
+	 * mount
+	 */
+	if (list_empty(&journal->j_journal_list))
+		return;
+
+	/*
+	 * check the current transaction.  If there are no writers
+	 * (j_wcount <= 0), it has been started (j_trans_start_time > 0),
+	 * it is not empty (j_len > 0) and it is older than j_max_trans_age,
+	 * finish it, and force the commit blocks to disk
+	 */
+	if (atomic_read(&journal->j_wcount) <= 0 &&
+	    journal->j_trans_start_time > 0 &&
+	    journal->j_len > 0 &&
+	    (now - journal->j_trans_start_time) > journal->j_max_trans_age) {
+		if (!journal_join(&th, sb)) {	/* non-zero return: skip the commit */
+			reiserfs_prepare_for_journal(sb,
+						     SB_BUFFER_WITH_SB(sb),
+						     1);
+			journal_mark_dirty(&th, SB_BUFFER_WITH_SB(sb));
+
+			/*
+			 * we're only being called from kreiserfsd, it makes
+			 * no sense to do an async commit so that kreiserfsd
+			 * can do it later
+			 */
+			do_journal_end(&th, COMMIT_NOW | WAIT);	/* return value is ignored */
+		}
+	}
+}
+
+/*
+ * returns 0 if do_journal_end should return right away, returns 1 if
+ * do_journal_end should finish the commit
+ *
+ * if the current transaction is too old, but still has writers, this will
+ * wait on j_join_wait until all the writers are done.  By the time it
+ * wakes up, the transaction it was called has already ended, so it just
+ * flushes the commit list and returns 0.
+ *
+ * Won't batch when flush or commit_now is set.  Also won't batch when
+ * others are waiting on j_join_wait.
+ *
+ * Note, we can't allow the journal_end to proceed while there are still
+ * writers in the log.
+ */
+static int check_journal_end(struct reiserfs_transaction_handle *th, int flags)
+{
+
+	time_t now;
+	int flush = flags & FLUSH_ALL;
+	int commit_now = flags & COMMIT_NOW;
+	int wait_on_commit = flags & WAIT;
+	struct reiserfs_journal_list *jl;
+	struct super_block *sb = th->t_super;
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+
+	BUG_ON(!th->t_trans_id);
+
+	if (th->t_trans_id != journal->j_trans_id) {
+		reiserfs_panic(th->t_super, "journal-1577",
+			       "handle trans id %ld != current trans id %ld",
+			       th->t_trans_id, journal->j_trans_id);
+	}
+
+	journal->j_len_alloc -= (th->t_blocks_allocated - th->t_blocks_logged);	/* give back blocks reserved but not logged */
+	/* <= 0 is allowed.  unmounting might not call begin */
+	if (atomic_read(&journal->j_wcount) > 0)
+		atomic_dec(&journal->j_wcount);
+
+	/*
+	 * BUG, deal with case where j_len is 0, but people previously
+	 * freed blocks need to be released will be dealt with by next
+	 * transaction that actually writes something, but should be taken
+	 * care of in this trans
+	 */
+	BUG_ON(journal->j_len == 0);
+
+	/*
+	 * if wcount > 0 (other writers remain), and we were called with
+	 * flush or commit_now, we wait on j_join_wait.  We will wake up when
+	 * the last writer has finished the transaction, and started it on
+	 * its way to the disk.
+	 * Then, we flush the commit or journal list, and just return 0
+	 * because the rest of journal end was already done for this
+	 * transaction.
+	 */
+	if (atomic_read(&journal->j_wcount) > 0) {
+		if (flush || commit_now) {
+			unsigned trans_id;
+
+			jl = journal->j_current_jl;
+			trans_id = jl->j_trans_id;
+			if (wait_on_commit)
+				jl->j_state |= LIST_COMMIT_PENDING;
+			atomic_set(&journal->j_jlock, 1);
+			if (flush) {
+				journal->j_next_full_flush = 1;
+			}
+			unlock_journal(sb);
+
+			/*
+			 * sleep while the current transaction is
+			 * still j_jlocked; loop until j_trans_id moves on
+			 */
+			while (journal->j_trans_id == trans_id) {
+				if (atomic_read(&journal->j_jlock)) {
+					queue_log_writer(sb);
+				} else {
+					lock_journal(sb);
+					if (journal->j_trans_id == trans_id) {
+						atomic_set(&journal->j_jlock,
+							   1);
+					}
+					unlock_journal(sb);
+				}
+			}
+			BUG_ON(journal->j_trans_id == trans_id);
+
+			if (commit_now
+			    && journal_list_still_alive(sb, trans_id)
+			    && wait_on_commit) {
+				flush_commit_list(sb, jl, 1);
+			}
+			return 0;
+		}
+		unlock_journal(sb);	/* other writers remain and no flush/commit was requested */
+		return 0;
+	}
+
+	/* deal with old transactions where we are the last writers */
+	now = get_seconds();
+	if ((now - journal->j_trans_start_time) > journal->j_max_trans_age) {
+		commit_now = 1;
+		journal->j_next_async_flush = 1;
+	}
+	/* don't batch when someone is waiting on j_join_wait */
+	/* don't batch when syncing the commit or flushing the whole trans */
+	if (!(journal->j_must_wait > 0) && !(atomic_read(&journal->j_jlock))
+	    && !flush && !commit_now && (journal->j_len < journal->j_max_batch)
+	    && journal->j_len_alloc < journal->j_max_batch
+	    && journal->j_cnode_free > (journal->j_trans_max * 3)) {
+		journal->j_bcount++;	/* batched: leave the transaction open */
+		unlock_journal(sb);
+		return 0;
+	}
+
+	if (journal->j_start > SB_ONDISK_JOURNAL_SIZE(sb)) {
+		reiserfs_panic(sb, "journal-003",
+			       "j_start (%ld) is too high",
+			       journal->j_start);
+	}
+	return 1;	/* the journal is still locked on this path */
+}
+
+/*
+ * Does all the work that makes deleting blocks safe.
+ * when deleting a block mark BH_JNew, just remove it from the current
+ * transaction, clean it's buffer_head and move on.
+ *
+ * otherwise:
+ * set a bit for the block in the journal bitmap.
That will prevent it from
+ * being allocated for unformatted nodes before this transaction has finished.
+ *
+ * mark any cnodes for this block as BLOCK_FREED, and clear their bh pointers.
+ * That will prevent any old transactions with this block from trying to flush
+ * to the real location.  Since we aren't removing the cnode from the
+ * journal_list_hash, *the block can't be reallocated yet.
+ *
+ * Then remove it from the current transaction, decrementing any counters and
+ * filing it on the clean list.
+ */
+int journal_mark_freed(struct reiserfs_transaction_handle *th,
+		       struct super_block *sb, b_blocknr_t blocknr)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	struct reiserfs_journal_cnode *cn = NULL;
+	struct buffer_head *bh = NULL;
+	struct reiserfs_list_bitmap *jb = NULL;
+	int cleaned = 0;
+	BUG_ON(!th->t_trans_id);
+
+	cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr);
+	if (cn && cn->bh) {
+		bh = cn->bh;
+		get_bh(bh);	/* extra reference, dropped by release_buffer_page below */
+	}
+	/* if it is journal new, we just remove it from this transaction */
+	if (bh && buffer_journal_new(bh)) {
+		clear_buffer_journal_new(bh);
+		clear_prepared_bits(bh);
+		reiserfs_clean_and_file_buffer(bh);
+		cleaned = remove_from_transaction(sb, blocknr, cleaned);
+	} else {
+		/*
+		 * set the bit for this block in the journal bitmap
+		 * for this transaction
+		 */
+		jb = journal->j_current_jl->j_list_bitmap;
+		if (!jb) {
+			reiserfs_panic(sb, "journal-1702",
+				       "journal_list_bitmap is NULL");
+		}
+		set_bit_in_list_bitmap(sb, blocknr, jb);
+
+		/* Note, the entire while loop is not allowed to schedule.  */
+
+		if (bh) {
+			clear_prepared_bits(bh);
+			reiserfs_clean_and_file_buffer(bh);
+		}
+		cleaned = remove_from_transaction(sb, blocknr, cleaned);
+
+		/*
+		 * find all older transactions with this block,
+		 * make sure they don't try to write it out
+		 */
+		cn = get_journal_hash_dev(sb, journal->j_list_hash_table,
+					  blocknr);
+		while (cn) {
+			if (sb == cn->sb && blocknr == cn->blocknr) {
+				set_bit(BLOCK_FREED, &cn->state);
+				if (cn->bh) {
+					/*
+					 * remove_from_transaction will release
+					 * the buffer if it was in the current
+					 * trans; otherwise clean it and drop
+					 * one reference here, once only
+					 */
+					if (!cleaned) {
+						clear_buffer_journal_dirty(cn->
+									   bh);
+						clear_buffer_dirty(cn->bh);
+						clear_buffer_journal_test(cn->
+									  bh);
+						cleaned = 1;
+						put_bh(cn->bh);
+						if (atomic_read
+						    (&cn->bh->b_count) < 0) {
+							reiserfs_warning(sb,
+								 "journal-2138",
+								 "cn->bh->b_count < 0");
+						}
+					}
+					/*
+					 * since we are clearing the bh,
+					 * we MUST dec nonzerolen
+					 */
+					if (cn->jlist) {
+						atomic_dec(&cn->jlist->
+							   j_nonzerolen);
+					}
+					cn->bh = NULL;
+				}
+			}
+			cn = cn->hnext;
+		}
+	}
+
+	if (bh)
+		release_buffer_page(bh);	/* get_hash grabs the buffer */
+	return 0;	/* always 0 */
+}
+/*
+ * Stamp the inode with the journal's current journal list and current
+ * transaction id (stored in the inode's i_jl and i_trans_id).
+ */
+void reiserfs_update_inode_transaction(struct inode *inode)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(inode->i_sb);
+	REISERFS_I(inode)->i_jl = journal->j_current_jl;
+	REISERFS_I(inode)->i_trans_id = journal->j_trans_id;
+}
+
+/*
+ * Commit the transaction with the given id on behalf of inode.
+ * jl is the journal list recorded for that id; it is replaced by the
+ * current journal list when id is the running transaction.
+ *
+ * returns an error code on failure (from journal_begin, journal_end,
+ * journal_end_sync or journal->j_errno), 0 if no commits/barriers were
+ * done and 1 if a transaction was actually committed and the barrier
+ * was done
+ */
+static int __commit_trans_jl(struct inode *inode, unsigned long id,
+			     struct reiserfs_journal_list *jl)
+{
+	struct reiserfs_transaction_handle th;
+	struct super_block *sb = inode->i_sb;
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	int ret = 0;
+
+	/*
+	 * is it from the current transaction,
+	 * or from an unknown transaction?
+	 */
+	if (id == journal->j_trans_id) {
+		jl = journal->j_current_jl;
+		/*
+		 * try to let other writers come in and
+		 * grow this transaction
+		 */
+		let_transaction_grow(sb, id);
+		if (journal->j_trans_id != id) {
+			goto flush_commit_only;
+		}
+
+		ret = journal_begin(&th, sb, 1);
+		if (ret)
+			return ret;
+
+		/* someone might have ended this transaction while we joined */
+		if (journal->j_trans_id != id) {
+			reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
+						     1);
+			journal_mark_dirty(&th, SB_BUFFER_WITH_SB(sb));
+			ret = journal_end(&th);
+			goto flush_commit_only;
+		}
+
+		ret = journal_end_sync(&th);
+		if (!ret)
+			ret = 1;
+
+	} else {
+		/*
+		 * this gets tricky, we have to make sure the journal list in
+		 * the inode still exists.  We know the list is still around
+		 * if we've got a larger transaction id than the oldest list
+		 */
+flush_commit_only:
+		if (journal_list_still_alive(inode->i_sb, id)) {
+			/*
+			 * we only set ret to 1 when we know for sure
+			 * the barrier hasn't been started yet on the commit
+			 * block.
+			 */
+			if (atomic_read(&jl->j_commit_left) > 1)
+				ret = 1;
+			flush_commit_list(sb, jl, 1);
+			if (journal->j_errno)
+				ret = journal->j_errno;
+		}
+	}
+	/* otherwise the list is gone, and long since committed */
+	return ret;
+}
+/*
+ * Commit the transaction recorded in the inode (i_trans_id / i_jl).
+ * Returns whatever __commit_trans_jl returns.
+ */
+int reiserfs_commit_for_inode(struct inode *inode)
+{
+	unsigned int id = REISERFS_I(inode)->i_trans_id;
+	struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl;
+
+	/*
+	 * for the whole inode, assume unset id means it was
+	 * changed in the current transaction.  More conservative
+	 */
+	if (!id || !jl) {
+		reiserfs_update_inode_transaction(inode);
+		id = REISERFS_I(inode)->i_trans_id;
+		/* jl will be updated in __commit_trans_jl */
+	}
+
+	return __commit_trans_jl(inode, id, jl);
+}
+/*
+ * Undo reiserfs_prepare_for_journal for bh: if the restore-dirty bit was
+ * set and the buffer is still journal dirty, re-dirty it when can_dirty()
+ * allows, then clear the prepared bit.  A NULL bh is ignored.
+ */
+void reiserfs_restore_prepared_buffer(struct super_block *sb,
+				      struct buffer_head *bh)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	PROC_INFO_INC(sb, journal.restore_prepared);
+	if (!bh) {
+		return;
+	}
+	if (test_clear_buffer_journal_restore_dirty(bh) &&
+	    buffer_journal_dirty(bh)) {
+		struct reiserfs_journal_cnode *cn;
+		reiserfs_write_lock(sb);
+		cn = get_journal_hash_dev(sb,
+					  journal->j_list_hash_table,
+					  bh->b_blocknr);
+		if (cn && can_dirty(cn)) {
+			set_buffer_journal_test(bh);
+			mark_buffer_dirty(bh);
+		}
+		reiserfs_write_unlock(sb);
+	}
+	clear_buffer_journal_prepared(bh);
+}
+
+extern struct tree_balance *cur_tb;
+/*
+ * before we can change a metadata block, we have to make sure it won't
+ * be written to disk while we are altering it.  So, we must:
+ * clean it
+ * wait on it.
+ *
+ * wait: when zero and the buffer lock cannot be taken immediately,
+ * nothing is changed and 0 is returned.  Otherwise returns 1 after
+ * marking the buffer prepared.  If the buffer was dirty and journal dirty,
+ * the restore-dirty bit is set for reiserfs_restore_prepared_buffer.
+ */
+int reiserfs_prepare_for_journal(struct super_block *sb,
+				 struct buffer_head *bh, int wait)
+{
+	PROC_INFO_INC(sb, journal.prepare);
+
+	if (!trylock_buffer(bh)) {
+		if (!wait)
+			return 0;
+		lock_buffer(bh);
+	}
+	set_buffer_journal_prepared(bh);
+	if (test_clear_buffer_dirty(bh) && buffer_journal_dirty(bh)) {
+		clear_buffer_journal_test(bh);
+		set_buffer_journal_restore_dirty(bh);
+	}
+	unlock_buffer(bh);
+	return 1;
+}
+
+/*
+ * long and ugly.  If flush, will not return until all commit
+ * blocks and all real buffers in the trans are on disk.
+ * If no_async, won't return until all commit blocks are on disk.
+ *
+ * keep reading, there are comments as you go along
+ *
+ * If the journal is aborted, we just clean up.  Things like flushing
+ * journal lists, etc just won't happen.
+ */
+static int do_journal_end(struct reiserfs_transaction_handle *th, int flags)
+{
+	struct super_block *sb = th->t_super;
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	struct reiserfs_journal_cnode *cn, *next, *jl_cn;
+	struct reiserfs_journal_cnode *last_cn = NULL;
+	struct reiserfs_journal_desc *desc;
+	struct reiserfs_journal_commit *commit;
+	struct buffer_head *c_bh;	/* commit bh */
+	struct buffer_head *d_bh;	/* desc bh */
+	int cur_write_start = 0;	/* start index of current log write */
+	int old_start;	/* NOTE(review): assigned below but never read in this function */
+	int i;
+	int flush;
+	int wait_on_commit;
+	struct reiserfs_journal_list *jl, *temp_jl;
+	struct list_head *entry, *safe;
+	unsigned long jindex;
+	unsigned int commit_trans_id;
+	int trans_half;
+	int depth;
+
+	BUG_ON(th->t_refcount > 1);
+	BUG_ON(!th->t_trans_id);
+	BUG_ON(!th->t_super);
+
+	/*
+	 * protect flush_older_commits from making mistakes if the
+	 * transaction ID counter is about to overflow: force a full,
+	 * synchronous flush for the last id before wrap-around.
+	 */
+	if (th->t_trans_id == ~0U)
+		flags |= FLUSH_ALL | COMMIT_NOW | WAIT;
+	flush = flags & FLUSH_ALL;
+	wait_on_commit = flags & WAIT;
+
+	current->journal_info = th->t_handle_save;
+	reiserfs_check_lock_depth(sb, "journal end");
+	if (journal->j_len == 0) {	/* empty transaction: log the super block so j_len != 0 */
+		reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
+					     1);
+		journal_mark_dirty(th, SB_BUFFER_WITH_SB(sb));
+	}
+
+	lock_journal(sb);
+	if (journal->j_next_full_flush) {
+		flags |= FLUSH_ALL;
+		flush = 1;
+	}
+	if (journal->j_next_async_flush) {
+		flags |= COMMIT_NOW | WAIT;
+		wait_on_commit = 1;
+	}
+
+	/*
+	 * the journal was locked just above; check_journal_end unlocks it
+	 * when it does not return 1.  Its return value tells us if we
+	 * should continue with the journal_end, or just return
+	 */
+	if (!check_journal_end(th, flags)) {
+		reiserfs_schedule_old_flush(sb);
+		wake_queued_writers(sb);
+		reiserfs_async_progress_wait(sb);
+		goto out;
+	}
+
+	/* check_journal_end might set these, check again */
+	if (journal->j_next_full_flush) {
+		flush = 1;
+	}
+
+	/*
+	 * j must wait means we have to flush the log blocks, and the
+	 * real blocks for this transaction
+	 */
+	if (journal->j_must_wait > 0) {
+		flush = 1;
+	}
+#ifdef REISERFS_PREALLOCATE
+	/*
+	 * quota ops might need to nest, setup the journal_info pointer
+	 * for them and raise the refcount so that it is > 0.
+	 */
+	current->journal_info = th;
+	th->t_refcount++;
+
+	/* it should not involve new blocks into the transaction */
+	reiserfs_discard_all_prealloc(th);
+
+	th->t_refcount--;
+	current->journal_info = th->t_handle_save;
+#endif
+
+	/* setup description block: the log block at offset j_start */
+	d_bh =
+	    journal_getblk(sb,
+			   SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
+			   journal->j_start);
+	set_buffer_uptodate(d_bh);
+	desc = (struct reiserfs_journal_desc *)(d_bh)->b_data;
+	memset(d_bh->b_data, 0, d_bh->b_size);
+	memcpy(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8);
+	set_desc_trans_id(desc, journal->j_trans_id);
+
+	/*
+	 * setup commit block.  Don't write (keep it clean too) this one
+	 * until after everyone else is written.  It sits at log offset
+	 * (j_start + j_len + 1), wrapped at the on-disk journal size.
+	 */
+	c_bh = journal_getblk(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
+			      ((journal->j_start + journal->j_len +
+				1) % SB_ONDISK_JOURNAL_SIZE(sb)));
+	commit = (struct reiserfs_journal_commit *)c_bh->b_data;
+	memset(c_bh->b_data, 0, c_bh->b_size);
+	set_commit_trans_id(commit, journal->j_trans_id);
+	set_buffer_uptodate(c_bh);
+
+	/* init this journal list */
+	jl = journal->j_current_jl;
+
+	/*
+	 * we lock the commit before doing anything because
+	 * we want to make sure nobody tries to run flush_commit_list until
+	 * the new transaction is fully setup, and we've already flushed the
+	 * ordered bh list
+	 */
+	reiserfs_mutex_lock_safe(&jl->j_commit_mutex, sb);
+
+	/* save the transaction id in case we need to commit it later */
+	commit_trans_id = jl->j_trans_id;
+
+	atomic_set(&jl->j_older_commits_done, 0);
+	jl->j_trans_id = journal->j_trans_id;
+	jl->j_timestamp = journal->j_trans_start_time;
+	jl->j_commit_bh = c_bh;
+	jl->j_start = journal->j_start;
+	jl->j_len = journal->j_len;
+	atomic_set(&jl->j_nonzerolen, journal->j_len);
+	atomic_set(&jl->j_commit_left, journal->j_len + 2);
+	jl->j_realblock = NULL;
+
+	/*
+	 * The ENTIRE FOR LOOP MUST not cause schedule to occur.
+	 * for each real block, add it to the journal list hash,
+	 * copy into real block index array in the commit or desc block.
+	 * i counts only journaled buffers: the else branch undoes the
+	 * loop increment for buffers that are skipped.
+	 */
+	trans_half = journal_trans_half(sb->s_blocksize);
+	for (i = 0, cn = journal->j_first; cn; cn = cn->next, i++) {
+		if (buffer_journaled(cn->bh)) {
+			jl_cn = get_cnode(sb);
+			if (!jl_cn) {
+				reiserfs_panic(sb, "journal-1676",
+					       "get_cnode returned NULL");
+			}
+			if (i == 0) {
+				jl->j_realblock = jl_cn;
+			}
+			jl_cn->prev = last_cn;
+			jl_cn->next = NULL;
+			if (last_cn) {
+				last_cn->next = jl_cn;
+			}
+			last_cn = jl_cn;
+			/*
+			 * make sure the block we are trying to log
+			 * is not a block of journal or reserved area
+			 */
+			if (is_block_in_log_or_reserved_area
+			    (sb, cn->bh->b_blocknr)) {
+				reiserfs_panic(sb, "journal-2332",
+					       "Trying to log block %lu, "
+					       "which is a log block",
+					       cn->bh->b_blocknr);
+			}
+			jl_cn->blocknr = cn->bh->b_blocknr;
+			jl_cn->state = 0;
+			jl_cn->sb = sb;
+			jl_cn->bh = cn->bh;
+			jl_cn->jlist = jl;
+			insert_journal_hash(journal->j_list_hash_table, jl_cn);
+			if (i < trans_half) {	/* first trans_half entries live in the desc block */
+				desc->j_realblock[i] =
+				    cpu_to_le32(cn->bh->b_blocknr);
+			} else {	/* the rest live in the commit block */
+				commit->j_realblock[i - trans_half] =
+				    cpu_to_le32(cn->bh->b_blocknr);
+			}
+		} else {
+			i--;
+		}
+	}
+	set_desc_trans_len(desc, journal->j_len);
+	set_desc_mount_id(desc, journal->j_mount_id);
+	set_desc_trans_id(desc, journal->j_trans_id);
+	set_commit_trans_len(commit, journal->j_len);
+
+	/*
+	 * special check in case all buffers in the journal
+	 * were marked for not logging
+	 */
+	BUG_ON(journal->j_len == 0);
+
+	/*
+	 * we're about to dirty all the log blocks, mark the description block
+	 * dirty now too.  Don't mark the commit block dirty until all the
+	 * others are on disk
+	 */
+	mark_buffer_dirty(d_bh);
+
+	/*
+	 * first data block is j_start + 1, so add one to
+	 * cur_write_start wherever you use it
+	 */
+	cur_write_start = journal->j_start;
+	cn = journal->j_first;
+	jindex = 1;		/* start at one so we don't get the desc again */
+	while (cn) {
+		clear_buffer_journal_new(cn->bh);
+		/* copy all the real blocks into log area.  dirty log blocks */
+		if (buffer_journaled(cn->bh)) {
+			struct buffer_head *tmp_bh;
+			char *addr;
+			struct page *page;
+			tmp_bh =
+			    journal_getblk(sb,
+					   SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
+					   ((cur_write_start +
+					     jindex) %
+					    SB_ONDISK_JOURNAL_SIZE(sb)));
+			set_buffer_uptodate(tmp_bh);
+			page = cn->bh->b_page;
+			addr = kmap(page);	/* map the page to read the real block's data */
+			memcpy(tmp_bh->b_data,
+			       addr + offset_in_page(cn->bh->b_data),
+			       cn->bh->b_size);
+			kunmap(page);
+			mark_buffer_dirty(tmp_bh);
+			jindex++;
+			set_buffer_journal_dirty(cn->bh);
+			clear_buffer_journaled(cn->bh);
+		} else {
+			/*
+			 * JDirty cleared sometime during transaction.
+			 * don't log this one
+			 */
+			reiserfs_warning(sb, "journal-2048",
+					 "BAD, buffer in journal hash, "
+					 "but not JDirty!");
+			brelse(cn->bh);
+		}
+		next = cn->next;	/* read the link before the cnode is freed */
+		free_cnode(sb, cn);
+		cn = next;
+		reiserfs_cond_resched(sb);
+	}
+
+	/*
+	 * we are done with both the c_bh and d_bh, but
+	 * c_bh must be written after all other commit blocks,
+	 * so we dirty/release c_bh in flush_commit_list, with commit_left <= 1.
+	 */
+
+	journal->j_current_jl = alloc_journal_list(sb);
+
+	/* now it is safe to insert this transaction on the main list */
+	list_add_tail(&jl->j_list, &journal->j_journal_list);
+	list_add_tail(&jl->j_working_list, &journal->j_working_list);
+	journal->j_num_work_lists++;
+
+	/*
+	 * reset journal values for the next transaction; the new j_start
+	 * skips the j_len data blocks plus the desc and commit blocks
+	 */
+	old_start = journal->j_start;
+	journal->j_start =
+	    (journal->j_start + journal->j_len +
+	     2) % SB_ONDISK_JOURNAL_SIZE(sb);
+	atomic_set(&journal->j_wcount, 0);
+	journal->j_bcount = 0;
+	journal->j_last = NULL;
+	journal->j_first = NULL;
+	journal->j_len = 0;
+	journal->j_trans_start_time = 0;
+	/* check for trans_id overflow: after wrapping to 0, restart at 10 */
+	if (++journal->j_trans_id == 0)
+		journal->j_trans_id = 10;
+	journal->j_current_jl->j_trans_id = journal->j_trans_id;
+	journal->j_must_wait = 0;
+	journal->j_len_alloc = 0;
+	journal->j_next_full_flush = 0;
+	journal->j_next_async_flush = 0;
+	init_journal_hash(sb);
+
+	/*
+	 * make sure reiserfs_add_jh sees the new current_jl before we
+	 * write out the tails
+	 */
+	smp_mb();
+
+	/*
+	 * tail conversion targets have to hit the disk before we end the
+	 * transaction.  Otherwise a later transaction might repack the tail
+	 * before this transaction commits, leaving the data block unflushed
+	 * and clean, if we crash before the later transaction commits, the
+	 * data block is lost.
+	 */
+	if (!list_empty(&jl->j_tail_bh_list)) {
+		depth = reiserfs_write_unlock_nested(sb);
+		write_ordered_buffers(&journal->j_dirty_buffers_lock,
+				      journal, jl, &jl->j_tail_bh_list);
+		reiserfs_write_lock_nested(sb, depth);
+	}
+	BUG_ON(!list_empty(&jl->j_tail_bh_list));
+	mutex_unlock(&jl->j_commit_mutex);
+
+	/*
+	 * honor the flush wishes from the caller, simple commits can
+	 * be done outside the journal lock, they are done below
+	 *
+	 * if we don't flush the commit list right now, we put it into
+	 * the work queue so the people waiting on the async progress work
+	 * queue don't wait for this proc to flush journal lists and such.
+	 */
+	if (flush) {
+		flush_commit_list(sb, jl, 1);
+		flush_journal_list(sb, jl, 1);
+	} else if (!(jl->j_state & LIST_COMMIT_PENDING)) {
+		/*
+		 * Avoid queueing work when sb is being shut down. Transaction
+		 * will be flushed on journal shutdown.
+		 */
+		if (sb->s_flags & MS_ACTIVE)
+			queue_delayed_work(REISERFS_SB(sb)->commit_wq,
+					   &journal->j_work, HZ / 10);
+	}
+
+	/*
+	 * if the next transaction has any chance of wrapping, flush
+	 * transactions that might get overwritten.  If any journal lists
+	 * are very old flush them as well.
+	 * The scan restarts from the head (goto first_jl) after each flush.
+	 */
+first_jl:
+	list_for_each_safe(entry, safe, &journal->j_journal_list) {
+		temp_jl = JOURNAL_LIST_ENTRY(entry);
+		if (journal->j_start <= temp_jl->j_start) {
+			if ((journal->j_start + journal->j_trans_max + 1) >=
+			    temp_jl->j_start) {
+				flush_used_journal_lists(sb, temp_jl);
+				goto first_jl;
+			} else if ((journal->j_start +
+				    journal->j_trans_max + 1) <
+				   SB_ONDISK_JOURNAL_SIZE(sb)) {
+				/*
+				 * if we don't cross into the next
+				 * transaction and we don't wrap, there is
+				 * no way we can overlap any later transactions
+				 * break now
+				 */
+				break;
+			}
+		} else if ((journal->j_start +
+			    journal->j_trans_max + 1) >
+			   SB_ONDISK_JOURNAL_SIZE(sb)) {
+			if (((journal->j_start + journal->j_trans_max + 1) %
+			     SB_ONDISK_JOURNAL_SIZE(sb)) >=
+			    temp_jl->j_start) {
+				flush_used_journal_lists(sb, temp_jl);
+				goto first_jl;
+			} else {
+				/*
+				* we don't overlap anything from our start
+				* to the end of the log, and our wrapped
+				* portion doesn't overlap anything at
+				* the start of the log.  We can break
+				*/
+				break;
+			}
+		}
+	}
+
+	journal->j_current_jl->j_list_bitmap =
+	    get_list_bitmap(sb, journal->j_current_jl);
+
+	if (!(journal->j_current_jl->j_list_bitmap)) {
+		reiserfs_panic(sb, "journal-1996",
+			       "could not get a list bitmap");
+	}
+
+	atomic_set(&journal->j_jlock, 0);
+	unlock_journal(sb);
+	/* wake up anybody waiting to join. */
+	clear_bit(J_WRITERS_QUEUED, &journal->j_state);
+	wake_up(&journal->j_join_wait);
+
+	if (!flush && wait_on_commit &&
+	    journal_list_still_alive(sb, commit_trans_id)) {
+		flush_commit_list(sb, jl, 1);
+	}
+out:
+	reiserfs_check_lock_depth(sb, "journal end2");
+
+	memset(th, 0, sizeof(*th));
+	/*
+	 * Re-set th->t_super, so we can properly keep track of how many
+	 * persistent transactions there are. We need to do this so if this
+	 * call is part of a failed restart_transaction, we can free it later
+	 */
+	th->t_super = sb;
+
+	return journal->j_errno;
+}
+
+/*
+ * Send the file system read only and refuse new transactions.
+ * Does nothing if the journal is already marked J_ABORTED; only the
+ * first errno is recorded in journal->j_errno.
+ */
+void reiserfs_abort_journal(struct super_block *sb, int errno)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	if (test_bit(J_ABORTED, &journal->j_state))
+		return;
+
+	if (!journal->j_errno)
+		journal->j_errno = errno;
+
+	sb->s_flags |= MS_RDONLY;
+	set_bit(J_ABORTED, &journal->j_state);
+
+#ifdef CONFIG_REISERFS_CHECK
+	dump_stack();
+#endif
+}
diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c
new file mode 100644
index 000000000..249594a82
--- /dev/null
+++ b/fs/reiserfs/lbalance.c
@@ -0,0 +1,1427 @@
+/*
+ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
+ */
+
+#include
+#include
+#include
+#include "reiserfs.h"
+#include
+
+/*
+ * copy copy_count entries from source directory item to dest buffer
+ * (creating new item if needed)
+ *
+ * dest_bi:    buffer_info describing the destination buffer
+ * source:     buffer holding the source directory item
+ * last_first: FIRST_TO_LAST or LAST_TO_FIRST; selects whether the entries
+ *             go to the last item of dest or to its first item
+ * item_num:   number of the directory item in source
+ * from:       index of the first entry to copy
+ * copy_count: number of entries to copy (may be 0)
+ */
+static void leaf_copy_dir_entries(struct buffer_info *dest_bi,
+				  struct buffer_head *source, int last_first,
+				  int item_num, int from, int copy_count)
+{
+	struct buffer_head *dest = dest_bi->bi_bh;
+	/*
+	 * either the number of target item, or if we must create a
+	 * new item, the number of the item we will create it next to
+	 */
+	int item_num_in_dest;
+
+	struct item_head *ih;
+	struct reiserfs_de_head *deh;
+	int copy_records_len;	/* length of all records in item to be copied */
+	char *records;
+
+	ih = item_head(source, item_num);
+
+	RFALSE(!is_direntry_le_ih(ih), "vs-10000: item must be directory item");
+
+	/*
+	 * length of all records to be copied and first byte of
+	 * the last of them
+	 */
+	deh = B_I_DEH(source, ih);
+	if (copy_count) {
+		copy_records_len = (from ? deh_location(&deh[from - 1]) :
+				    ih_item_len(ih)) -
+		    deh_location(&deh[from + copy_count - 1]);
+		records =
+		    source->b_data + ih_location(ih) +
+		    deh_location(&deh[from + copy_count - 1]);
+	} else {
+		copy_records_len = 0;
+		records = NULL;
+	}
+
+	/* when copy last to first, dest buffer can contain 0 items */
+	item_num_in_dest =
+	    (last_first ==
+	     LAST_TO_FIRST) ? ((B_NR_ITEMS(dest)) ? 0 : -1) : (B_NR_ITEMS(dest)
+							       - 1);
+
+	/*
+	 * if there are no items in dest or the first/last item in
+	 * dest is not item of the same directory
+	 */
+	if ((item_num_in_dest == -1) ||
+	    (last_first == FIRST_TO_LAST && le_ih_k_offset(ih) == DOT_OFFSET) ||
+	    (last_first == LAST_TO_FIRST
+	     && comp_short_le_keys /*COMP_SHORT_KEYS */ (&ih->ih_key,
+							 leaf_key(dest,
+								  item_num_in_dest))))
+	{
+		/* create new item in dest */
+		struct item_head new_ih;
+
+		/* form item header */
+		memcpy(&new_ih.ih_key, &ih->ih_key, KEY_SIZE);
+		put_ih_version(&new_ih, KEY_FORMAT_3_5);
+		/* calculate item len */
+		put_ih_item_len(&new_ih,
+				DEH_SIZE * copy_count + copy_records_len);
+		put_ih_entry_count(&new_ih, 0);
+
+		if (last_first == LAST_TO_FIRST) {
+			/* form key by the following way */
+			if (from < ih_entry_count(ih)) {
+				set_le_ih_k_offset(&new_ih,
+						   deh_offset(&deh[from]));
+			} else {
+				/*
+				 * no entries will be copied to this
+				 * item in this function
+				 */
+				set_le_ih_k_offset(&new_ih, U32_MAX);
+				/*
+				 * this item is not yet valid, but we
+				 * want I_IS_DIRECTORY_ITEM to return 1
+				 * for it, so the offset is set to
+				 * U32_MAX above
+				 */
+			}
+			set_le_key_k_type(KEY_FORMAT_3_5, &new_ih.ih_key,
+					  TYPE_DIRENTRY);
+		}
+
+		/* insert item into dest buffer */
+		leaf_insert_into_buf(dest_bi,
+				     (last_first ==
+				      LAST_TO_FIRST) ? 0 : B_NR_ITEMS(dest),
+				     &new_ih, NULL, 0);
+	} else {
+		/* prepare space for entries */
+		leaf_paste_in_buffer(dest_bi,
+				     (last_first ==
+				      FIRST_TO_LAST) ? (B_NR_ITEMS(dest) -
+							1) : 0, MAX_US_INT,
+				     DEH_SIZE * copy_count + copy_records_len,
+				     records, 0);
+	}
+
+	item_num_in_dest =
+	    (last_first == FIRST_TO_LAST) ? (B_NR_ITEMS(dest) - 1) : 0;
+
+	leaf_paste_entries(dest_bi, item_num_in_dest,
+			   (last_first ==
+			    FIRST_TO_LAST) ? ih_entry_count(item_head(dest,
+									  item_num_in_dest))
+			   : 0, copy_count, deh + from, records,
+			   DEH_SIZE * copy_count + copy_records_len);
+}
+
+/*
+ * Copy the first (if last_first == FIRST_TO_LAST) or last
+ * (last_first == LAST_TO_FIRST) item or part of it or nothing
+ * (see the return 0 below) from SOURCE to the end (if last_first)
+ * or beginning (!last_first) of the DEST
+ *
+ * bytes_or_entries is a number of directory entries for a directory item
+ * and a number of body bytes otherwise; -1 means the whole item.
+ */
+/* returns 1 if anything was copied, else 0 */
+static int leaf_copy_boundary_item(struct buffer_info *dest_bi,
+				   struct buffer_head *src, int last_first,
+				   int bytes_or_entries)
+{
+	struct buffer_head *dest = dest_bi->bi_bh;
+	/* number of items in the source and destination buffers */
+	int dest_nr_item, src_nr_item;
+	struct item_head *ih;
+	struct item_head *dih;
+
+	dest_nr_item = B_NR_ITEMS(dest);
+
+	/*
+	 * if ( DEST is empty or first item of SOURCE and last item of
+	 * DEST are the items of different objects or of different types )
+	 * then there is no need to treat this item differently from the
+	 * other items that we copy, so we return
+	 */
+	if (last_first == FIRST_TO_LAST) {
+		ih = item_head(src, 0);
+		dih = item_head(dest, dest_nr_item - 1);	/* only used after the !dest_nr_item check below */
+
+		/* there is nothing to merge */
+		if (!dest_nr_item
+		    || (!op_is_left_mergeable(&ih->ih_key, src->b_size)))
+			return 0;
+
+		RFALSE(!ih_item_len(ih),
+		       "vs-10010: item can not have empty length");
+
+		if (is_direntry_le_ih(ih)) {
+			if (bytes_or_entries == -1)
+				/* copy all entries to dest */
+				bytes_or_entries = ih_entry_count(ih);
+			leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST, 0, 0,
+					      bytes_or_entries);
+			return 1;
+		}
+
+		/*
+		 * copy part of the body of the first item of SOURCE
+		 * to the end of the body of the last item of the DEST
+		 * part defined by 'bytes_or_entries'; if bytes_or_entries
+		 * == -1 copy whole body; don't create new item header
+		 */
+		if (bytes_or_entries == -1)
+			bytes_or_entries = ih_item_len(ih);
+
+#ifdef CONFIG_REISERFS_CHECK
+		else {
+			if (bytes_or_entries == ih_item_len(ih)
+			    && is_indirect_le_ih(ih))
+				if (get_ih_free_space(ih))
+					reiserfs_panic(sb_from_bi(dest_bi),
+						       "vs-10020",
+						       "last unformatted node "
+						       "must be filled "
+						       "entirely (%h)", ih);
+		}
+#endif
+
+		/*
+		 * merge first item (or its part) of src buffer with the last
+		 * item of dest buffer. Both are of the same file
+		 */
+		leaf_paste_in_buffer(dest_bi,
+				     dest_nr_item - 1, ih_item_len(dih),
+				     bytes_or_entries, ih_item_body(src, ih), 0);
+
+		if (is_indirect_le_ih(dih)) {
+			RFALSE(get_ih_free_space(dih),
+			       "vs-10030: merge to left: last unformatted node of non-last indirect item %h must have zerto free space",
+			       ih);
+			if (bytes_or_entries == ih_item_len(ih))	/* whole item merged: inherit its free space */
+				set_ih_free_space(dih, get_ih_free_space(ih));
+		}
+
+		return 1;
+	}
+
+	/* copy boundary item to right (last_first == LAST_TO_FIRST) */
+
+	/*
+	 * (DEST is empty or last item of SOURCE and first item of DEST
+	 * are the items of different object or of different types)
+	 */
+	src_nr_item = B_NR_ITEMS(src);
+	ih = item_head(src, src_nr_item - 1);
+	dih = item_head(dest, 0);
+
+	if (!dest_nr_item || !op_is_left_mergeable(&dih->ih_key, src->b_size))
+		return 0;
+
+	if (is_direntry_le_ih(ih)) {
+		/*
+		 * bytes_or_entries = entries number in last
+		 * item body of SOURCE
+		 */
+		if (bytes_or_entries == -1)
+			bytes_or_entries = ih_entry_count(ih);
+
+		leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST,
+				      src_nr_item - 1,
+				      ih_entry_count(ih) - bytes_or_entries,
+				      bytes_or_entries);
+		return 1;
+	}
+
+	/*
+	 * copy part of the body of the last item of SOURCE to the
+	 * beginning of the body of the first item of the DEST; part defined
+	 * by 'bytes_or_entries'; if bytes_or_entries == -1 copy whole body;
+	 * change first item key of the DEST; don't create new item header
+	 */
+
+	RFALSE(is_indirect_le_ih(ih) && get_ih_free_space(ih),
+	       "vs-10040: merge to right: last unformatted node of non-last indirect item must be filled entirely (%h)",
+	       ih);
+
+	if (bytes_or_entries == -1) {
+		/* bytes_or_entries = length of last item body of SOURCE */
+		bytes_or_entries = ih_item_len(ih);
+
+		RFALSE(le_ih_k_offset(dih) !=
+		       le_ih_k_offset(ih) + op_bytes_number(ih, src->b_size),
+		       "vs-10050: items %h and %h do not match", ih, dih);
+
+		/* change first item key of the DEST */
+		set_le_ih_k_offset(dih, le_ih_k_offset(ih));
+
+		/* item becomes non-mergeable */
+		/* or mergeable if left item was */
+		set_le_ih_k_type(dih, le_ih_k_type(ih));
+	} else {
+		/* merge to right only part of item */
+		RFALSE(ih_item_len(ih) <= bytes_or_entries,
+		       "vs-10060: no so much bytes %lu (needed %lu)",
+		       (unsigned long)ih_item_len(ih),
+		       (unsigned long)bytes_or_entries);
+
+		/* change first item key of the DEST */
+		if (is_direct_le_ih(dih)) {
+			RFALSE(le_ih_k_offset(dih) <=
+			       (unsigned long)bytes_or_entries,
+			       "vs-10070: dih %h, bytes_or_entries(%d)", dih,
+			       bytes_or_entries);
+			set_le_ih_k_offset(dih,
+					   le_ih_k_offset(dih) -
+					   bytes_or_entries);
+		} else {	/* not a direct item: move the key back by b_size per UNFM_P_SIZE bytes copied */
+			RFALSE(le_ih_k_offset(dih) <=
+			       (bytes_or_entries / UNFM_P_SIZE) * dest->b_size,
+			       "vs-10080: dih %h, bytes_or_entries(%d)",
+			       dih,
+			       (bytes_or_entries / UNFM_P_SIZE) * dest->b_size);
+			set_le_ih_k_offset(dih,
+					   le_ih_k_offset(dih) -
+					   ((bytes_or_entries / UNFM_P_SIZE) *
+					    dest->b_size));
+		}
+	}
+
+	leaf_paste_in_buffer(dest_bi, 0, 0, bytes_or_entries,
+			     ih_item_body(src,
+					  ih) + ih_item_len(ih) - bytes_or_entries,
+			     0);
+	return 1;
+}
+
+/*
+ * copy cpy_mun items from buffer src to buffer dest
+ * last_first == FIRST_TO_LAST means, that we copy cpy_num items beginning
+ *                             from first-th item in src to tail of dest
+ * last_first == LAST_TO_FIRST means,
that we copy cpy_num items beginning
 * from first-th item in src to head of dest
 */
/*
 * dest_bi    - buffer info of the destination leaf: bi_bh is the leaf,
 *              bi_parent/bi_position locate its disk_child slot (if any)
 * src        - source leaf
 * last_first - LAST_TO_FIRST: new items are inserted before item 0 of dest;
 *              FIRST_TO_LAST: new items are appended after the last item
 * first      - index in src of the first item to copy
 * cpy_num    - number of whole items to copy; 0 is a no-op
 *
 * Copies item headers and bodies, recomputes the location of every header
 * from the insertion point onwards, updates the block head (item count and
 * free space), marks the leaf dirty and, when dest has a parent, grows the
 * parent's disk_child size by the number of bytes added.
 */
static void leaf_copy_items_entirely(struct buffer_info *dest_bi,
				     struct buffer_head *src, int last_first,
				     int first, int cpy_num)
{
	struct buffer_head *dest;
	int nr, free_space;
	int dest_before;
	int last_loc, last_inserted_loc, location;
	int i, j;
	struct block_head *blkh;
	struct item_head *ih;

	RFALSE(last_first != LAST_TO_FIRST && last_first != FIRST_TO_LAST,
	       "vs-10090: bad last_first parameter %d", last_first);
	RFALSE(B_NR_ITEMS(src) - first < cpy_num,
	       "vs-10100: too few items in source %d, required %d from %d",
	       B_NR_ITEMS(src), cpy_num, first);
	RFALSE(cpy_num < 0, "vs-10110: can not copy negative amount of items");
	/* the two checks below guard against NULL pointers, say so */
	RFALSE(!dest_bi, "vs-10120: destination buffer info is NULL");

	dest = dest_bi->bi_bh;

	RFALSE(!dest, "vs-10130: destination buffer is NULL");

	if (cpy_num == 0)
		return;

	blkh = B_BLK_HEAD(dest);
	nr = blkh_nr_item(blkh);
	free_space = blkh_free_space(blkh);

	/*
	 * we will insert items before 0-th or nr-th item in dest buffer.
	 * It depends on the last_first parameter
	 */
	dest_before = (last_first == LAST_TO_FIRST) ? 0 : nr;

	/* location of head of first new item */
	ih = item_head(dest, dest_before);

	RFALSE(blkh_free_space(blkh) < cpy_num * IH_SIZE,
	       "vs-10140: not enough free space for headers %d (needed %d)",
	       B_FREE_SPACE(dest), cpy_num * IH_SIZE);

	/* prepare space for headers */
	memmove(ih + cpy_num, ih, (nr - dest_before) * IH_SIZE);

	/* copy item headers */
	memcpy(ih, item_head(src, first), cpy_num * IH_SIZE);

	free_space -= (IH_SIZE * cpy_num);
	set_blkh_free_space(blkh, free_space);

	/*
	 * location of unmovable item: j keeps the upper bound of the area
	 * being laid out, location walks downwards from it as each header
	 * from the insertion point onwards gets its new body location
	 */
	j = location = (dest_before == 0) ? dest->b_size : ih_location(ih - 1);
	for (i = dest_before; i < nr + cpy_num; i++) {
		location -= ih_item_len(ih + i - dest_before);
		put_ih_location(ih + i - dest_before, location);
	}

	/* prepare space for items */
	last_loc = ih_location(&ih[nr + cpy_num - 1 - dest_before]);
	last_inserted_loc = ih_location(&ih[cpy_num - 1]);

	/* check free space */
	RFALSE(free_space < j - last_inserted_loc,
	       "vs-10150: not enough free space for items %d (needed %d)",
	       free_space, j - last_inserted_loc);

	/*
	 * move the bodies of the old items that follow the insertion point
	 * down by the total length of the new bodies (j - last_inserted_loc)
	 */
	memmove(dest->b_data + last_loc,
		dest->b_data + last_loc + j - last_inserted_loc,
		last_inserted_loc - last_loc);

	/* copy items */
	memcpy(dest->b_data + last_inserted_loc,
	       item_body(src, (first + cpy_num - 1)),
	       j - last_inserted_loc);

	/* sizes, item number */
	set_blkh_nr_item(blkh, nr + cpy_num);
	set_blkh_free_space(blkh, free_space - (j - last_inserted_loc));

	do_balance_mark_leaf_dirty(dest_bi->tb, dest, 0);

	if (dest_bi->bi_parent) {
		struct disk_child *t_dc;
		t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position);
		RFALSE(dc_block_number(t_dc) != dest->b_blocknr,
		       "vs-10160: block number in bh does not match to field in disk_child structure %lu and %lu",
		       (long unsigned)dest->b_blocknr,
		       (long unsigned)dc_block_number(t_dc));
		/* the child grew by the new bodies plus the new headers */
		put_dc_size(t_dc,
			    dc_size(t_dc) + (j - last_inserted_loc +
					     IH_SIZE * cpy_num));

		do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent,
					       0);
	}
}

/*
 * This function splits the (liquid) item into two items (useful when
 * shifting part of an item into another node.)
+ */ +static void leaf_item_bottle(struct buffer_info *dest_bi, + struct buffer_head *src, int last_first, + int item_num, int cpy_bytes) +{ + struct buffer_head *dest = dest_bi->bi_bh; + struct item_head *ih; + + RFALSE(cpy_bytes == -1, + "vs-10170: bytes == - 1 means: do not split item"); + + if (last_first == FIRST_TO_LAST) { + /* + * if ( if item in position item_num in buffer SOURCE + * is directory item ) + */ + ih = item_head(src, item_num); + if (is_direntry_le_ih(ih)) + leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST, + item_num, 0, cpy_bytes); + else { + struct item_head n_ih; + + /* + * copy part of the body of the item number 'item_num' + * of SOURCE to the end of the DEST part defined by + * 'cpy_bytes'; create new item header; change old + * item_header (????); n_ih = new item_header; + */ + memcpy(&n_ih, ih, IH_SIZE); + put_ih_item_len(&n_ih, cpy_bytes); + if (is_indirect_le_ih(ih)) { + RFALSE(cpy_bytes == ih_item_len(ih) + && get_ih_free_space(ih), + "vs-10180: when whole indirect item is bottle to left neighbor, it must have free_space==0 (not %lu)", + (long unsigned)get_ih_free_space(ih)); + set_ih_free_space(&n_ih, 0); + } + + RFALSE(op_is_left_mergeable(&ih->ih_key, src->b_size), + "vs-10190: bad mergeability of item %h", ih); + n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */ + leaf_insert_into_buf(dest_bi, B_NR_ITEMS(dest), &n_ih, + item_body(src, item_num), 0); + } + } else { + /* + * if ( if item in position item_num in buffer + * SOURCE is directory item ) + */ + ih = item_head(src, item_num); + if (is_direntry_le_ih(ih)) + leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST, + item_num, + ih_entry_count(ih) - cpy_bytes, + cpy_bytes); + else { + struct item_head n_ih; + + /* + * copy part of the body of the item number 'item_num' + * of SOURCE to the begin of the DEST part defined by + * 'cpy_bytes'; create new item header; + * n_ih = new item_header; + */ + memcpy(&n_ih, ih, SHORT_KEY_SIZE); + + /* Endian safe, both le */ + 
n_ih.ih_version = ih->ih_version; + + if (is_direct_le_ih(ih)) { + set_le_ih_k_offset(&n_ih, + le_ih_k_offset(ih) + + ih_item_len(ih) - cpy_bytes); + set_le_ih_k_type(&n_ih, TYPE_DIRECT); + set_ih_free_space(&n_ih, MAX_US_INT); + } else { + /* indirect item */ + RFALSE(!cpy_bytes && get_ih_free_space(ih), + "vs-10200: ih->ih_free_space must be 0 when indirect item will be appended"); + set_le_ih_k_offset(&n_ih, + le_ih_k_offset(ih) + + (ih_item_len(ih) - + cpy_bytes) / UNFM_P_SIZE * + dest->b_size); + set_le_ih_k_type(&n_ih, TYPE_INDIRECT); + set_ih_free_space(&n_ih, get_ih_free_space(ih)); + } + + /* set item length */ + put_ih_item_len(&n_ih, cpy_bytes); + + /* Endian safe, both le */ + n_ih.ih_version = ih->ih_version; + + leaf_insert_into_buf(dest_bi, 0, &n_ih, + item_body(src, item_num) + + ih_item_len(ih) - cpy_bytes, 0); + } + } +} + +/* + * If cpy_bytes equals minus one than copy cpy_num whole items from SOURCE + * to DEST. If cpy_bytes not equal to minus one than copy cpy_num-1 whole + * items from SOURCE to DEST. From last item copy cpy_num bytes for regular + * item and cpy_num directory entries for directory item. + */ +static int leaf_copy_items(struct buffer_info *dest_bi, struct buffer_head *src, + int last_first, int cpy_num, int cpy_bytes) +{ + struct buffer_head *dest; + int pos, i, src_nr_item, bytes; + + dest = dest_bi->bi_bh; + RFALSE(!dest || !src, "vs-10210: !dest || !src"); + RFALSE(last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST, + "vs-10220:last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST"); + RFALSE(B_NR_ITEMS(src) < cpy_num, + "vs-10230: No enough items: %d, req. 
%d", B_NR_ITEMS(src), + cpy_num); + RFALSE(cpy_num < 0, "vs-10240: cpy_num < 0 (%d)", cpy_num); + + if (cpy_num == 0) + return 0; + + if (last_first == FIRST_TO_LAST) { + /* copy items to left */ + pos = 0; + if (cpy_num == 1) + bytes = cpy_bytes; + else + bytes = -1; + + /* + * copy the first item or it part or nothing to the end of + * the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,0,bytes)) + */ + i = leaf_copy_boundary_item(dest_bi, src, FIRST_TO_LAST, bytes); + cpy_num -= i; + if (cpy_num == 0) + return i; + pos += i; + if (cpy_bytes == -1) + /* + * copy first cpy_num items starting from position + * 'pos' of SOURCE to end of DEST + */ + leaf_copy_items_entirely(dest_bi, src, FIRST_TO_LAST, + pos, cpy_num); + else { + /* + * copy first cpy_num-1 items starting from position + * 'pos-1' of the SOURCE to the end of the DEST + */ + leaf_copy_items_entirely(dest_bi, src, FIRST_TO_LAST, + pos, cpy_num - 1); + + /* + * copy part of the item which number is + * cpy_num+pos-1 to the end of the DEST + */ + leaf_item_bottle(dest_bi, src, FIRST_TO_LAST, + cpy_num + pos - 1, cpy_bytes); + } + } else { + /* copy items to right */ + src_nr_item = B_NR_ITEMS(src); + if (cpy_num == 1) + bytes = cpy_bytes; + else + bytes = -1; + + /* + * copy the last item or it part or nothing to the + * begin of the DEST + * (i = leaf_copy_boundary_item(DEST,SOURCE,1,bytes)); + */ + i = leaf_copy_boundary_item(dest_bi, src, LAST_TO_FIRST, bytes); + + cpy_num -= i; + if (cpy_num == 0) + return i; + + pos = src_nr_item - cpy_num - i; + if (cpy_bytes == -1) { + /* + * starting from position 'pos' copy last cpy_num + * items of SOURCE to begin of DEST + */ + leaf_copy_items_entirely(dest_bi, src, LAST_TO_FIRST, + pos, cpy_num); + } else { + /* + * copy last cpy_num-1 items starting from position + * 'pos+1' of the SOURCE to the begin of the DEST; + */ + leaf_copy_items_entirely(dest_bi, src, LAST_TO_FIRST, + pos + 1, cpy_num - 1); + + /* + * copy part of the item which number is pos to + * 
the begin of the DEST + */ + leaf_item_bottle(dest_bi, src, LAST_TO_FIRST, pos, + cpy_bytes); + } + } + return i; +} + +/* + * there are several kinds of copying: S[0] to L[0], S[0] to R[0], R[0] to + * L[0], L[0] to R[0] and S[0] to S_new. for each of these we have to define + * the parent and positions of the destination and source buffers + */ +static void leaf_define_dest_src_infos(int shift_mode, struct tree_balance *tb, + struct buffer_info *dest_bi, + struct buffer_info *src_bi, + int *first_last, + struct buffer_head *Snew) +{ + memset(dest_bi, 0, sizeof(struct buffer_info)); + memset(src_bi, 0, sizeof(struct buffer_info)); + + /* define dest, src, dest parent, dest position */ + switch (shift_mode) { + case LEAF_FROM_S_TO_L: /* it is used in leaf_shift_left */ + src_bi->tb = tb; + src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path); + src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0); + + /* src->b_item_order */ + src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0); + dest_bi->tb = tb; + dest_bi->bi_bh = tb->L[0]; + dest_bi->bi_parent = tb->FL[0]; + dest_bi->bi_position = get_left_neighbor_position(tb, 0); + *first_last = FIRST_TO_LAST; + break; + + case LEAF_FROM_S_TO_R: /* it is used in leaf_shift_right */ + src_bi->tb = tb; + src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path); + src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0); + src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0); + dest_bi->tb = tb; + dest_bi->bi_bh = tb->R[0]; + dest_bi->bi_parent = tb->FR[0]; + dest_bi->bi_position = get_right_neighbor_position(tb, 0); + *first_last = LAST_TO_FIRST; + break; + + case LEAF_FROM_R_TO_L: /* it is used in balance_leaf_when_delete */ + src_bi->tb = tb; + src_bi->bi_bh = tb->R[0]; + src_bi->bi_parent = tb->FR[0]; + src_bi->bi_position = get_right_neighbor_position(tb, 0); + dest_bi->tb = tb; + dest_bi->bi_bh = tb->L[0]; + dest_bi->bi_parent = tb->FL[0]; + dest_bi->bi_position = get_left_neighbor_position(tb, 0); + *first_last = FIRST_TO_LAST; + break; + + case LEAF_FROM_L_TO_R: /* 
it is used in balance_leaf_when_delete */ + src_bi->tb = tb; + src_bi->bi_bh = tb->L[0]; + src_bi->bi_parent = tb->FL[0]; + src_bi->bi_position = get_left_neighbor_position(tb, 0); + dest_bi->tb = tb; + dest_bi->bi_bh = tb->R[0]; + dest_bi->bi_parent = tb->FR[0]; + dest_bi->bi_position = get_right_neighbor_position(tb, 0); + *first_last = LAST_TO_FIRST; + break; + + case LEAF_FROM_S_TO_SNEW: + src_bi->tb = tb; + src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path); + src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0); + src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0); + dest_bi->tb = tb; + dest_bi->bi_bh = Snew; + dest_bi->bi_parent = NULL; + dest_bi->bi_position = 0; + *first_last = LAST_TO_FIRST; + break; + + default: + reiserfs_panic(sb_from_bi(src_bi), "vs-10250", + "shift type is unknown (%d)", shift_mode); + } + RFALSE(!src_bi->bi_bh || !dest_bi->bi_bh, + "vs-10260: mode==%d, source (%p) or dest (%p) buffer is initialized incorrectly", + shift_mode, src_bi->bi_bh, dest_bi->bi_bh); +} + +/* + * copy mov_num items and mov_bytes of the (mov_num-1)th item to + * neighbor. Delete them from source + */ +int leaf_move_items(int shift_mode, struct tree_balance *tb, int mov_num, + int mov_bytes, struct buffer_head *Snew) +{ + int ret_value; + struct buffer_info dest_bi, src_bi; + int first_last; + + leaf_define_dest_src_infos(shift_mode, tb, &dest_bi, &src_bi, + &first_last, Snew); + + ret_value = + leaf_copy_items(&dest_bi, src_bi.bi_bh, first_last, mov_num, + mov_bytes); + + leaf_delete_items(&src_bi, first_last, + (first_last == + FIRST_TO_LAST) ? 
0 : (B_NR_ITEMS(src_bi.bi_bh) - + mov_num), mov_num, mov_bytes); + + return ret_value; +} + +/* + * Shift shift_num items (and shift_bytes of last shifted item if + * shift_bytes != -1) from S[0] to L[0] and replace the delimiting key + */ +int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes) +{ + struct buffer_head *S0 = PATH_PLAST_BUFFER(tb->tb_path); + int i; + + /* + * move shift_num (and shift_bytes bytes) items from S[0] + * to left neighbor L[0] + */ + i = leaf_move_items(LEAF_FROM_S_TO_L, tb, shift_num, shift_bytes, NULL); + + if (shift_num) { + /* number of items in S[0] == 0 */ + if (B_NR_ITEMS(S0) == 0) { + + RFALSE(shift_bytes != -1, + "vs-10270: S0 is empty now, but shift_bytes != -1 (%d)", + shift_bytes); +#ifdef CONFIG_REISERFS_CHECK + if (tb->tb_mode == M_PASTE || tb->tb_mode == M_INSERT) { + print_cur_tb("vs-10275"); + reiserfs_panic(tb->tb_sb, "vs-10275", + "balance condition corrupted " + "(%c)", tb->tb_mode); + } +#endif + + if (PATH_H_POSITION(tb->tb_path, 1) == 0) + replace_key(tb, tb->CFL[0], tb->lkey[0], + PATH_H_PPARENT(tb->tb_path, 0), 0); + + } else { + /* replace lkey in CFL[0] by 0-th key from S[0]; */ + replace_key(tb, tb->CFL[0], tb->lkey[0], S0, 0); + + RFALSE((shift_bytes != -1 && + !(is_direntry_le_ih(item_head(S0, 0)) + && !ih_entry_count(item_head(S0, 0)))) && + (!op_is_left_mergeable + (leaf_key(S0, 0), S0->b_size)), + "vs-10280: item must be mergeable"); + } + } + + return i; +} + +/* CLEANING STOPPED HERE */ + +/* + * Shift shift_num (shift_bytes) items from S[0] to the right neighbor, + * and replace the delimiting key + */ +int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes) +{ + int ret_value; + + /* + * move shift_num (and shift_bytes) items from S[0] to + * right neighbor R[0] + */ + ret_value = + leaf_move_items(LEAF_FROM_S_TO_R, tb, shift_num, shift_bytes, NULL); + + /* replace rkey in CFR[0] by the 0-th key from R[0] */ + if (shift_num) { + replace_key(tb, tb->CFR[0], 
tb->rkey[0], tb->R[0], 0); + + } + + return ret_value; +} + +static void leaf_delete_items_entirely(struct buffer_info *bi, + int first, int del_num); +/* + * If del_bytes == -1, starting from position 'first' delete del_num + * items in whole in buffer CUR. + * If not. + * If last_first == 0. Starting from position 'first' delete del_num-1 + * items in whole. Delete part of body of the first item. Part defined by + * del_bytes. Don't delete first item header + * If last_first == 1. Starting from position 'first+1' delete del_num-1 + * items in whole. Delete part of body of the last item . Part defined by + * del_bytes. Don't delete last item header. +*/ +void leaf_delete_items(struct buffer_info *cur_bi, int last_first, + int first, int del_num, int del_bytes) +{ + struct buffer_head *bh; + int item_amount = B_NR_ITEMS(bh = cur_bi->bi_bh); + + RFALSE(!bh, "10155: bh is not defined"); + RFALSE(del_num < 0, "10160: del_num can not be < 0. del_num==%d", + del_num); + RFALSE(first < 0 + || first + del_num > item_amount, + "10165: invalid number of first item to be deleted (%d) or " + "no so much items (%d) to delete (only %d)", first, + first + del_num, item_amount); + + if (del_num == 0) + return; + + if (first == 0 && del_num == item_amount && del_bytes == -1) { + make_empty_node(cur_bi); + do_balance_mark_leaf_dirty(cur_bi->tb, bh, 0); + return; + } + + if (del_bytes == -1) + /* delete del_num items beginning from item in position first */ + leaf_delete_items_entirely(cur_bi, first, del_num); + else { + if (last_first == FIRST_TO_LAST) { + /* + * delete del_num-1 items beginning from + * item in position first + */ + leaf_delete_items_entirely(cur_bi, first, del_num - 1); + + /* + * delete the part of the first item of the bh + * do not delete item header + */ + leaf_cut_from_buffer(cur_bi, 0, 0, del_bytes); + } else { + struct item_head *ih; + int len; + + /* + * delete del_num-1 items beginning from + * item in position first+1 + */ + 
leaf_delete_items_entirely(cur_bi, first + 1, + del_num - 1); + + ih = item_head(bh, B_NR_ITEMS(bh) - 1); + if (is_direntry_le_ih(ih)) + /* the last item is directory */ + /* + * len = numbers of directory entries + * in this item + */ + len = ih_entry_count(ih); + else + /* len = body len of item */ + len = ih_item_len(ih); + + /* + * delete the part of the last item of the bh + * do not delete item header + */ + leaf_cut_from_buffer(cur_bi, B_NR_ITEMS(bh) - 1, + len - del_bytes, del_bytes); + } + } +} + +/* insert item into the leaf node in position before */ +void leaf_insert_into_buf(struct buffer_info *bi, int before, + struct item_head * const inserted_item_ih, + const char * const inserted_item_body, + int zeros_number) +{ + struct buffer_head *bh = bi->bi_bh; + int nr, free_space; + struct block_head *blkh; + struct item_head *ih; + int i; + int last_loc, unmoved_loc; + char *to; + + blkh = B_BLK_HEAD(bh); + nr = blkh_nr_item(blkh); + free_space = blkh_free_space(blkh); + + /* check free space */ + RFALSE(free_space < ih_item_len(inserted_item_ih) + IH_SIZE, + "vs-10170: not enough free space in block %z, new item %h", + bh, inserted_item_ih); + RFALSE(zeros_number > ih_item_len(inserted_item_ih), + "vs-10172: zero number == %d, item length == %d", + zeros_number, ih_item_len(inserted_item_ih)); + + /* get item new item must be inserted before */ + ih = item_head(bh, before); + + /* prepare space for the body of new item */ + last_loc = nr ? ih_location(&ih[nr - before - 1]) : bh->b_size; + unmoved_loc = before ? 
ih_location(ih - 1) : bh->b_size; + + memmove(bh->b_data + last_loc - ih_item_len(inserted_item_ih), + bh->b_data + last_loc, unmoved_loc - last_loc); + + to = bh->b_data + unmoved_loc - ih_item_len(inserted_item_ih); + memset(to, 0, zeros_number); + to += zeros_number; + + /* copy body to prepared space */ + if (inserted_item_body) + memmove(to, inserted_item_body, + ih_item_len(inserted_item_ih) - zeros_number); + else + memset(to, '\0', ih_item_len(inserted_item_ih) - zeros_number); + + /* insert item header */ + memmove(ih + 1, ih, IH_SIZE * (nr - before)); + memmove(ih, inserted_item_ih, IH_SIZE); + + /* change locations */ + for (i = before; i < nr + 1; i++) { + unmoved_loc -= ih_item_len(&ih[i - before]); + put_ih_location(&ih[i - before], unmoved_loc); + } + + /* sizes, free space, item number */ + set_blkh_nr_item(blkh, blkh_nr_item(blkh) + 1); + set_blkh_free_space(blkh, + free_space - (IH_SIZE + + ih_item_len(inserted_item_ih))); + do_balance_mark_leaf_dirty(bi->tb, bh, 1); + + if (bi->bi_parent) { + struct disk_child *t_dc; + t_dc = B_N_CHILD(bi->bi_parent, bi->bi_position); + put_dc_size(t_dc, + dc_size(t_dc) + (IH_SIZE + + ih_item_len(inserted_item_ih))); + do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0); + } +} + +/* + * paste paste_size bytes to affected_item_num-th item. 
+ * When item is a directory, this only prepare space for new entries + */ +void leaf_paste_in_buffer(struct buffer_info *bi, int affected_item_num, + int pos_in_item, int paste_size, + const char *body, int zeros_number) +{ + struct buffer_head *bh = bi->bi_bh; + int nr, free_space; + struct block_head *blkh; + struct item_head *ih; + int i; + int last_loc, unmoved_loc; + + blkh = B_BLK_HEAD(bh); + nr = blkh_nr_item(blkh); + free_space = blkh_free_space(blkh); + + /* check free space */ + RFALSE(free_space < paste_size, + "vs-10175: not enough free space: needed %d, available %d", + paste_size, free_space); + +#ifdef CONFIG_REISERFS_CHECK + if (zeros_number > paste_size) { + struct super_block *sb = NULL; + if (bi && bi->tb) + sb = bi->tb->tb_sb; + print_cur_tb("10177"); + reiserfs_panic(sb, "vs-10177", + "zeros_number == %d, paste_size == %d", + zeros_number, paste_size); + } +#endif /* CONFIG_REISERFS_CHECK */ + + /* item to be appended */ + ih = item_head(bh, affected_item_num); + + last_loc = ih_location(&ih[nr - affected_item_num - 1]); + unmoved_loc = affected_item_num ? 
ih_location(ih - 1) : bh->b_size; + + /* prepare space */ + memmove(bh->b_data + last_loc - paste_size, bh->b_data + last_loc, + unmoved_loc - last_loc); + + /* change locations */ + for (i = affected_item_num; i < nr; i++) + put_ih_location(&ih[i - affected_item_num], + ih_location(&ih[i - affected_item_num]) - + paste_size); + + if (body) { + if (!is_direntry_le_ih(ih)) { + if (!pos_in_item) { + /* shift data to right */ + memmove(bh->b_data + ih_location(ih) + + paste_size, + bh->b_data + ih_location(ih), + ih_item_len(ih)); + /* paste data in the head of item */ + memset(bh->b_data + ih_location(ih), 0, + zeros_number); + memcpy(bh->b_data + ih_location(ih) + + zeros_number, body, + paste_size - zeros_number); + } else { + memset(bh->b_data + unmoved_loc - paste_size, 0, + zeros_number); + memcpy(bh->b_data + unmoved_loc - paste_size + + zeros_number, body, + paste_size - zeros_number); + } + } + } else + memset(bh->b_data + unmoved_loc - paste_size, '\0', paste_size); + + put_ih_item_len(ih, ih_item_len(ih) + paste_size); + + /* change free space */ + set_blkh_free_space(blkh, free_space - paste_size); + + do_balance_mark_leaf_dirty(bi->tb, bh, 0); + + if (bi->bi_parent) { + struct disk_child *t_dc = + B_N_CHILD(bi->bi_parent, bi->bi_position); + put_dc_size(t_dc, dc_size(t_dc) + paste_size); + do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0); + } +} + +/* + * cuts DEL_COUNT entries beginning from FROM-th entry. Directory item + * does not have free space, so it moves DEHs and remaining records as + * necessary. Return value is size of removed part of directory item + * in bytes. 
+ */ +static int leaf_cut_entries(struct buffer_head *bh, + struct item_head *ih, int from, int del_count) +{ + char *item; + struct reiserfs_de_head *deh; + int prev_record_offset; /* offset of record, that is (from-1)th */ + char *prev_record; /* */ + int cut_records_len; /* length of all removed records */ + int i; + + /* + * make sure that item is directory and there are enough entries to + * remove + */ + RFALSE(!is_direntry_le_ih(ih), "10180: item is not directory item"); + RFALSE(ih_entry_count(ih) < from + del_count, + "10185: item contains not enough entries: entry_count = %d, from = %d, to delete = %d", + ih_entry_count(ih), from, del_count); + + if (del_count == 0) + return 0; + + /* first byte of item */ + item = bh->b_data + ih_location(ih); + + /* entry head array */ + deh = B_I_DEH(bh, ih); + + /* + * first byte of remaining entries, those are BEFORE cut entries + * (prev_record) and length of all removed records (cut_records_len) + */ + prev_record_offset = + (from ? deh_location(&deh[from - 1]) : ih_item_len(ih)); + cut_records_len = prev_record_offset /*from_record */ - + deh_location(&deh[from + del_count - 1]); + prev_record = item + prev_record_offset; + + /* adjust locations of remaining entries */ + for (i = ih_entry_count(ih) - 1; i > from + del_count - 1; i--) + put_deh_location(&deh[i], + deh_location(&deh[i]) - + (DEH_SIZE * del_count)); + + for (i = 0; i < from; i++) + put_deh_location(&deh[i], + deh_location(&deh[i]) - (DEH_SIZE * del_count + + cut_records_len)); + + put_ih_entry_count(ih, ih_entry_count(ih) - del_count); + + /* shift entry head array and entries those are AFTER removed entries */ + memmove((char *)(deh + from), + deh + from + del_count, + prev_record - cut_records_len - (char *)(deh + from + + del_count)); + + /* shift records, those are BEFORE removed entries */ + memmove(prev_record - cut_records_len - DEH_SIZE * del_count, + prev_record, item + ih_item_len(ih) - prev_record); + + return DEH_SIZE * del_count + 
cut_records_len; +} + +/* + * when cut item is part of regular file + * pos_in_item - first byte that must be cut + * cut_size - number of bytes to be cut beginning from pos_in_item + * + * when cut item is part of directory + * pos_in_item - number of first deleted entry + * cut_size - count of deleted entries + */ +void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num, + int pos_in_item, int cut_size) +{ + int nr; + struct buffer_head *bh = bi->bi_bh; + struct block_head *blkh; + struct item_head *ih; + int last_loc, unmoved_loc; + int i; + + blkh = B_BLK_HEAD(bh); + nr = blkh_nr_item(blkh); + + /* item head of truncated item */ + ih = item_head(bh, cut_item_num); + + if (is_direntry_le_ih(ih)) { + /* first cut entry () */ + cut_size = leaf_cut_entries(bh, ih, pos_in_item, cut_size); + if (pos_in_item == 0) { + /* change key */ + RFALSE(cut_item_num, + "when 0-th enrty of item is cut, that item must be first in the node, not %d-th", + cut_item_num); + /* change item key by key of first entry in the item */ + set_le_ih_k_offset(ih, deh_offset(B_I_DEH(bh, ih))); + } + } else { + /* item is direct or indirect */ + RFALSE(is_statdata_le_ih(ih), "10195: item is stat data"); + RFALSE(pos_in_item && pos_in_item + cut_size != ih_item_len(ih), + "10200: invalid offset (%lu) or trunc_size (%lu) or ih_item_len (%lu)", + (long unsigned)pos_in_item, (long unsigned)cut_size, + (long unsigned)ih_item_len(ih)); + + /* shift item body to left if cut is from the head of item */ + if (pos_in_item == 0) { + memmove(bh->b_data + ih_location(ih), + bh->b_data + ih_location(ih) + cut_size, + ih_item_len(ih) - cut_size); + + /* change key of item */ + if (is_direct_le_ih(ih)) + set_le_ih_k_offset(ih, + le_ih_k_offset(ih) + + cut_size); + else { + set_le_ih_k_offset(ih, + le_ih_k_offset(ih) + + (cut_size / UNFM_P_SIZE) * + bh->b_size); + RFALSE(ih_item_len(ih) == cut_size + && get_ih_free_space(ih), + "10205: invalid ih_free_space (%h)", ih); + } + } + } + + /* location of 
the last item */ + last_loc = ih_location(&ih[nr - cut_item_num - 1]); + + /* location of the item, which is remaining at the same place */ + unmoved_loc = cut_item_num ? ih_location(ih - 1) : bh->b_size; + + /* shift */ + memmove(bh->b_data + last_loc + cut_size, bh->b_data + last_loc, + unmoved_loc - last_loc - cut_size); + + /* change item length */ + put_ih_item_len(ih, ih_item_len(ih) - cut_size); + + if (is_indirect_le_ih(ih)) { + if (pos_in_item) + set_ih_free_space(ih, 0); + } + + /* change locations */ + for (i = cut_item_num; i < nr; i++) + put_ih_location(&ih[i - cut_item_num], + ih_location(&ih[i - cut_item_num]) + cut_size); + + /* size, free space */ + set_blkh_free_space(blkh, blkh_free_space(blkh) + cut_size); + + do_balance_mark_leaf_dirty(bi->tb, bh, 0); + + if (bi->bi_parent) { + struct disk_child *t_dc; + t_dc = B_N_CHILD(bi->bi_parent, bi->bi_position); + put_dc_size(t_dc, dc_size(t_dc) - cut_size); + do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0); + } +} + +/* delete del_num items from buffer starting from the first'th item */ +static void leaf_delete_items_entirely(struct buffer_info *bi, + int first, int del_num) +{ + struct buffer_head *bh = bi->bi_bh; + int nr; + int i, j; + int last_loc, last_removed_loc; + struct block_head *blkh; + struct item_head *ih; + + RFALSE(bh == NULL, "10210: buffer is 0"); + RFALSE(del_num < 0, "10215: del_num less than 0 (%d)", del_num); + + if (del_num == 0) + return; + + blkh = B_BLK_HEAD(bh); + nr = blkh_nr_item(blkh); + + RFALSE(first < 0 || first + del_num > nr, + "10220: first=%d, number=%d, there is %d items", first, del_num, + nr); + + if (first == 0 && del_num == nr) { + /* this does not work */ + make_empty_node(bi); + + do_balance_mark_leaf_dirty(bi->tb, bh, 0); + return; + } + + ih = item_head(bh, first); + + /* location of unmovable item */ + j = (first == 0) ? 
bh->b_size : ih_location(ih - 1); + + /* delete items */ + last_loc = ih_location(&ih[nr - 1 - first]); + last_removed_loc = ih_location(&ih[del_num - 1]); + + memmove(bh->b_data + last_loc + j - last_removed_loc, + bh->b_data + last_loc, last_removed_loc - last_loc); + + /* delete item headers */ + memmove(ih, ih + del_num, (nr - first - del_num) * IH_SIZE); + + /* change item location */ + for (i = first; i < nr - del_num; i++) + put_ih_location(&ih[i - first], + ih_location(&ih[i - first]) + (j - + last_removed_loc)); + + /* sizes, item number */ + set_blkh_nr_item(blkh, blkh_nr_item(blkh) - del_num); + set_blkh_free_space(blkh, + blkh_free_space(blkh) + (j - last_removed_loc + + IH_SIZE * del_num)); + + do_balance_mark_leaf_dirty(bi->tb, bh, 0); + + if (bi->bi_parent) { + struct disk_child *t_dc = + B_N_CHILD(bi->bi_parent, bi->bi_position); + put_dc_size(t_dc, + dc_size(t_dc) - (j - last_removed_loc + + IH_SIZE * del_num)); + do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0); + } +} + +/* + * paste new_entry_count entries (new_dehs, records) into position + * before to item_num-th item + */ +void leaf_paste_entries(struct buffer_info *bi, + int item_num, + int before, + int new_entry_count, + struct reiserfs_de_head *new_dehs, + const char *records, int paste_size) +{ + struct item_head *ih; + char *item; + struct reiserfs_de_head *deh; + char *insert_point; + int i, old_entry_num; + struct buffer_head *bh = bi->bi_bh; + + if (new_entry_count == 0) + return; + + ih = item_head(bh, item_num); + + /* + * make sure, that item is directory, and there are enough + * records in it + */ + RFALSE(!is_direntry_le_ih(ih), "10225: item is not directory item"); + RFALSE(ih_entry_count(ih) < before, + "10230: there are no entry we paste entries before. 
entry_count = %d, before = %d", + ih_entry_count(ih), before); + + /* first byte of dest item */ + item = bh->b_data + ih_location(ih); + + /* entry head array */ + deh = B_I_DEH(bh, ih); + + /* new records will be pasted at this point */ + insert_point = + item + + (before ? deh_location(&deh[before - 1]) + : (ih_item_len(ih) - paste_size)); + + /* adjust locations of records that will be AFTER new records */ + for (i = ih_entry_count(ih) - 1; i >= before; i--) + put_deh_location(&deh[i], + deh_location(&deh[i]) + + (DEH_SIZE * new_entry_count)); + + /* adjust locations of records that will be BEFORE new records */ + for (i = 0; i < before; i++) + put_deh_location(&deh[i], + deh_location(&deh[i]) + paste_size); + + old_entry_num = ih_entry_count(ih); + put_ih_entry_count(ih, ih_entry_count(ih) + new_entry_count); + + /* prepare space for pasted records */ + memmove(insert_point + paste_size, insert_point, + item + (ih_item_len(ih) - paste_size) - insert_point); + + /* copy new records */ + memcpy(insert_point + DEH_SIZE * new_entry_count, records, + paste_size - DEH_SIZE * new_entry_count); + + /* prepare space for new entry heads */ + deh += before; + memmove((char *)(deh + new_entry_count), deh, + insert_point - (char *)deh); + + /* copy new entry heads */ + deh = (struct reiserfs_de_head *)((char *)deh); + memcpy(deh, new_dehs, DEH_SIZE * new_entry_count); + + /* set locations of new records */ + for (i = 0; i < new_entry_count; i++) { + put_deh_location(&deh[i], + deh_location(&deh[i]) + + (-deh_location + (&new_dehs[new_entry_count - 1]) + + insert_point + DEH_SIZE * new_entry_count - + item)); + } + + /* change item key if necessary (when we paste before 0-th entry */ + if (!before) { + set_le_ih_k_offset(ih, deh_offset(new_dehs)); + } +#ifdef CONFIG_REISERFS_CHECK + { + int prev, next; + /* check record locations */ + deh = B_I_DEH(bh, ih); + for (i = 0; i < ih_entry_count(ih); i++) { + next = + (i < + ih_entry_count(ih) - + 1) ? 
deh_location(&deh[i + 1]) : 0; + prev = (i != 0) ? deh_location(&deh[i - 1]) : 0; + + if (prev && prev <= deh_location(&deh[i])) + reiserfs_error(sb_from_bi(bi), "vs-10240", + "directory item (%h) " + "corrupted (prev %a, " + "cur(%d) %a)", + ih, deh + i - 1, i, deh + i); + if (next && next >= deh_location(&deh[i])) + reiserfs_error(sb_from_bi(bi), "vs-10250", + "directory item (%h) " + "corrupted (cur(%d) %a, " + "next %a)", + ih, i, deh + i, deh + i + 1); + } + } +#endif + +} diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c new file mode 100644 index 000000000..045b83ef9 --- /dev/null +++ b/fs/reiserfs/lock.c @@ -0,0 +1,100 @@ +#include "reiserfs.h" +#include + +/* + * The previous reiserfs locking scheme was heavily based on + * the tricky properties of the Bkl: + * + * - it was acquired recursively by a same task + * - the performances relied on the release-while-schedule() property + * + * Now that we replace it by a mutex, we still want to keep the same + * recursive property to avoid big changes in the code structure. + * We use our own lock_owner here because the owner field on a mutex + * is only available in SMP or mutex debugging, also we only need this field + * for this mutex, no need for a system wide mutex facility. + * + * Also this lock is often released before a call that could block because + * reiserfs performances were partially based on the release while schedule() + * property of the Bkl. + */ +void reiserfs_write_lock(struct super_block *s) +{ + struct reiserfs_sb_info *sb_i = REISERFS_SB(s); + + if (sb_i->lock_owner != current) { + mutex_lock(&sb_i->lock); + sb_i->lock_owner = current; + } + + /* No need to protect it, only the current task touches it */ + sb_i->lock_depth++; +} + +void reiserfs_write_unlock(struct super_block *s) +{ + struct reiserfs_sb_info *sb_i = REISERFS_SB(s); + + /* + * Are we unlocking without even holding the lock? + * Such a situation must raise a BUG() if we don't want + * to corrupt the data. 
/*
 * Sanity check for code paths that expect the superblock write lock to
 * be held.
 *
 * Emits a WARN_ON() (it does not BUG) when the recorded lock depth is
 * negative; the unlock helpers in this file use -1 as the "not held"
 * depth.  Only the depth is examined - the lock owner is not compared
 * against the current task.
 *
 * @sb:     superblock whose reiserfs lock state is checked
 * @caller: name of the calling function; currently unused by this
 *          implementation
 */
void reiserfs_check_lock_depth(struct super_block *sb, char *caller)
{
	struct reiserfs_sb_info *sb_i = REISERFS_SB(sb);

	WARN_ON(sb_i->lock_depth < 0);
}
+ * + * NO WARRANTY + */ + +#include +#include +#include +#include "reiserfs.h" +#include "acl.h" +#include "xattr.h" +#include + +#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) set_nlink(i, 1); } +#define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) drop_nlink(i); + +/* + * directory item contains array of entry headers. This performs + * binary search through that array + */ +static int bin_search_in_dir_item(struct reiserfs_dir_entry *de, loff_t off) +{ + struct item_head *ih = de->de_ih; + struct reiserfs_de_head *deh = de->de_deh; + int rbound, lbound, j; + + lbound = 0; + rbound = ih_entry_count(ih) - 1; + + for (j = (rbound + lbound) / 2; lbound <= rbound; + j = (rbound + lbound) / 2) { + if (off < deh_offset(deh + j)) { + rbound = j - 1; + continue; + } + if (off > deh_offset(deh + j)) { + lbound = j + 1; + continue; + } + /* this is not name found, but matched third key component */ + de->de_entry_num = j; + return NAME_FOUND; + } + + de->de_entry_num = lbound; + return NAME_NOT_FOUND; +} + +/* + * comment? maybe something like set de to point to what the path points to? + */ +static inline void set_de_item_location(struct reiserfs_dir_entry *de, + struct treepath *path) +{ + de->de_bh = get_last_bh(path); + de->de_ih = tp_item_head(path); + de->de_deh = B_I_DEH(de->de_bh, de->de_ih); + de->de_item_num = PATH_LAST_POSITION(path); +} + +/* + * de_bh, de_ih, de_deh (points to first element of array), de_item_num is set + */ +inline void set_de_name_and_namelen(struct reiserfs_dir_entry *de) +{ + struct reiserfs_de_head *deh = de->de_deh + de->de_entry_num; + + BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih)); + + de->de_entrylen = entry_length(de->de_bh, de->de_ih, de->de_entry_num); + de->de_namelen = de->de_entrylen - (de_with_sd(deh) ? 
SD_SIZE : 0); + de->de_name = ih_item_body(de->de_bh, de->de_ih) + deh_location(deh); + if (de->de_name[de->de_namelen - 1] == 0) + de->de_namelen = strlen(de->de_name); +} + +/* what entry points to */ +static inline void set_de_object_key(struct reiserfs_dir_entry *de) +{ + BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih)); + de->de_dir_id = deh_dir_id(&de->de_deh[de->de_entry_num]); + de->de_objectid = deh_objectid(&de->de_deh[de->de_entry_num]); +} + +static inline void store_de_entry_key(struct reiserfs_dir_entry *de) +{ + struct reiserfs_de_head *deh = de->de_deh + de->de_entry_num; + + BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih)); + + /* store key of the found entry */ + de->de_entry_key.version = KEY_FORMAT_3_5; + de->de_entry_key.on_disk_key.k_dir_id = + le32_to_cpu(de->de_ih->ih_key.k_dir_id); + de->de_entry_key.on_disk_key.k_objectid = + le32_to_cpu(de->de_ih->ih_key.k_objectid); + set_cpu_key_k_offset(&de->de_entry_key, deh_offset(deh)); + set_cpu_key_k_type(&de->de_entry_key, TYPE_DIRENTRY); +} + +/* + * We assign a key to each directory item, and place multiple entries in a + * single directory item. A directory item has a key equal to the key of + * the first directory entry in it. + + * This function first calls search_by_key, then, if item whose first entry + * matches is not found it looks for the entry inside directory item found + * by search_by_key. Fills the path to the entry, and to the entry position + * in the item + */ +/* The function is NOT SCHEDULE-SAFE! 
/*
 * Locate the directory item that should contain @key and the position
 * of the entry inside that item.
 *
 * search_item() is called first.  When it reports ITEM_NOT_FOUND the
 * path is stepped back by one position, so the item examined is the one
 * preceding the position search_item() stopped at.  de's item location
 * fields are then filled from the path and the entry-head array is
 * binary searched by the offset of @key.
 *
 * Returns IO_ERROR on failure, otherwise the result of
 * bin_search_in_dir_item() (NAME_FOUND or NAME_NOT_FOUND).
 * path->pos_in_item is set to de->de_entry_num in the non-error case.
 *
 * The function is NOT SCHEDULE-SAFE!
 */
int search_by_entry_key(struct super_block *sb, const struct cpu_key *key,
			struct treepath *path, struct reiserfs_dir_entry *de)
{
	int retval;

	retval = search_item(sb, key, path);
	switch (retval) {
	case ITEM_NOT_FOUND:
		/* position 0 has no preceding item to step back to */
		if (!PATH_LAST_POSITION(path)) {
			reiserfs_error(sb, "vs-7000", "search_by_key "
				       "returned item position == 0");
			pathrelse(path);
			return IO_ERROR;
		}
		PATH_LAST_POSITION(path)--;
		/* fall through - the preceding item is treated as found */

	case ITEM_FOUND:
		break;

	case IO_ERROR:
		/*
		 * NOTE(review): the path is not released here; presumably
		 * search_item() has already done so - confirm.
		 */
		return retval;

	default:
		pathrelse(path);
		reiserfs_error(sb, "vs-7002", "no path to here");
		return IO_ERROR;
	}

	set_de_item_location(de, path);

#ifdef CONFIG_REISERFS_CHECK
	if (!is_direntry_le_ih(de->de_ih) ||
	    COMP_SHORT_KEYS(&de->de_ih->ih_key, key)) {
		print_block(de->de_bh, 0, -1, -1);
		reiserfs_panic(sb, "vs-7005", "found item %h is not directory "
			       "item or does not belong to the same directory "
			       "as key %K", de->de_ih, key);
	}
#endif				/* CONFIG_REISERFS_CHECK */

	/*
	 * binary search in directory item by third component of the
	 * key. sets de->de_entry_num of de
	 */
	retval = bin_search_in_dir_item(de, cpu_key_k_offset(key));
	path->pos_in_item = de->de_entry_num;
	if (retval != NAME_NOT_FOUND) {
		/*
		 * ugly, but rename needs de_bh, de_deh, de_name,
		 * de_namelen, de_objectid set
		 */
		set_de_name_and_namelen(de);
		set_de_object_key(de);
	}
	return retval;
}
&& name[1] == '.') + return DOT_DOT_OFFSET; + + res = REISERFS_SB(s)->s_hash_function(name, len); + + /* take bits from 7-th to 30-th including both bounds */ + res = GET_HASH_VALUE(res); + if (res == 0) + /* + * needed to have no names before "." and ".." those have hash + * value == 0 and generation conters 1 and 2 accordingly + */ + res = 128; + return res + MAX_GENERATION_NUMBER; +} + +static int reiserfs_match(struct reiserfs_dir_entry *de, + const char *name, int namelen) +{ + int retval = NAME_NOT_FOUND; + + if ((namelen == de->de_namelen) && + !memcmp(de->de_name, name, de->de_namelen)) + retval = + (de_visible(de->de_deh + de->de_entry_num) ? NAME_FOUND : + NAME_FOUND_INVISIBLE); + + return retval; +} + +/* de's de_bh, de_ih, de_deh, de_item_num, de_entry_num are set already */ + +/* used when hash collisions exist */ + +static int linear_search_in_dir_item(struct cpu_key *key, + struct reiserfs_dir_entry *de, + const char *name, int namelen) +{ + struct reiserfs_de_head *deh = de->de_deh; + int retval; + int i; + + i = de->de_entry_num; + + if (i == ih_entry_count(de->de_ih) || + GET_HASH_VALUE(deh_offset(deh + i)) != + GET_HASH_VALUE(cpu_key_k_offset(key))) { + i--; + } + + RFALSE(de->de_deh != B_I_DEH(de->de_bh, de->de_ih), + "vs-7010: array of entry headers not found"); + + deh += i; + + for (; i >= 0; i--, deh--) { + /* hash value does not match, no need to check whole name */ + if (GET_HASH_VALUE(deh_offset(deh)) != + GET_HASH_VALUE(cpu_key_k_offset(key))) { + return NAME_NOT_FOUND; + } + + /* mark that this generation number is used */ + if (de->de_gen_number_bit_string) + set_bit(GET_GENERATION_NUMBER(deh_offset(deh)), + de->de_gen_number_bit_string); + + /* calculate pointer to name and namelen */ + de->de_entry_num = i; + set_de_name_and_namelen(de); + + /* + * de's de_name, de_namelen, de_recordlen are set. + * Fill the rest. 
+ */ + if ((retval = + reiserfs_match(de, name, namelen)) != NAME_NOT_FOUND) { + + /* key of pointed object */ + set_de_object_key(de); + + store_de_entry_key(de); + + /* retval can be NAME_FOUND or NAME_FOUND_INVISIBLE */ + return retval; + } + } + + if (GET_GENERATION_NUMBER(le_ih_k_offset(de->de_ih)) == 0) + /* + * we have reached left most entry in the node. In common we + * have to go to the left neighbor, but if generation counter + * is 0 already, we know for sure, that there is no name with + * the same hash value + */ + /* + * FIXME: this work correctly only because hash value can not + * be 0. Btw, in case of Yura's hash it is probably possible, + * so, this is a bug + */ + return NAME_NOT_FOUND; + + RFALSE(de->de_item_num, + "vs-7015: two diritems of the same directory in one node?"); + + return GOTO_PREVIOUS_ITEM; +} + +/* + * may return NAME_FOUND, NAME_FOUND_INVISIBLE, NAME_NOT_FOUND + * FIXME: should add something like IOERROR + */ +static int reiserfs_find_entry(struct inode *dir, const char *name, int namelen, + struct treepath *path_to_entry, + struct reiserfs_dir_entry *de) +{ + struct cpu_key key_to_search; + int retval; + + if (namelen > REISERFS_MAX_NAME(dir->i_sb->s_blocksize)) + return NAME_NOT_FOUND; + + /* we will search for this key in the tree */ + make_cpu_key(&key_to_search, dir, + get_third_component(dir->i_sb, name, namelen), + TYPE_DIRENTRY, 3); + + while (1) { + retval = + search_by_entry_key(dir->i_sb, &key_to_search, + path_to_entry, de); + if (retval == IO_ERROR) { + reiserfs_error(dir->i_sb, "zam-7001", "io error"); + return IO_ERROR; + } + + /* compare names for all entries having given hash value */ + retval = + linear_search_in_dir_item(&key_to_search, de, name, + namelen); + /* + * there is no need to scan directory anymore. 
+ * Given entry found or does not exist + */ + if (retval != GOTO_PREVIOUS_ITEM) { + path_to_entry->pos_in_item = de->de_entry_num; + return retval; + } + + /* + * there is left neighboring item of this directory + * and given entry can be there + */ + set_cpu_key_k_offset(&key_to_search, + le_ih_k_offset(de->de_ih) - 1); + pathrelse(path_to_entry); + + } /* while (1) */ +} + +static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + int retval; + struct inode *inode = NULL; + struct reiserfs_dir_entry de; + INITIALIZE_PATH(path_to_entry); + + if (REISERFS_MAX_NAME(dir->i_sb->s_blocksize) < dentry->d_name.len) + return ERR_PTR(-ENAMETOOLONG); + + reiserfs_write_lock(dir->i_sb); + + de.de_gen_number_bit_string = NULL; + retval = + reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, + &path_to_entry, &de); + pathrelse(&path_to_entry); + if (retval == NAME_FOUND) { + inode = reiserfs_iget(dir->i_sb, + (struct cpu_key *)&de.de_dir_id); + if (!inode || IS_ERR(inode)) { + reiserfs_write_unlock(dir->i_sb); + return ERR_PTR(-EACCES); + } + + /* + * Propagate the private flag so we know we're + * in the priv tree + */ + if (IS_PRIVATE(dir)) + inode->i_flags |= S_PRIVATE; + } + reiserfs_write_unlock(dir->i_sb); + if (retval == IO_ERROR) { + return ERR_PTR(-EIO); + } + + return d_splice_alias(inode, dentry); +} + +/* + * looks up the dentry of the parent directory for child. 
+ * taken from ext2_get_parent + */ +struct dentry *reiserfs_get_parent(struct dentry *child) +{ + int retval; + struct inode *inode = NULL; + struct reiserfs_dir_entry de; + INITIALIZE_PATH(path_to_entry); + struct inode *dir = d_inode(child); + + if (dir->i_nlink == 0) { + return ERR_PTR(-ENOENT); + } + de.de_gen_number_bit_string = NULL; + + reiserfs_write_lock(dir->i_sb); + retval = reiserfs_find_entry(dir, "..", 2, &path_to_entry, &de); + pathrelse(&path_to_entry); + if (retval != NAME_FOUND) { + reiserfs_write_unlock(dir->i_sb); + return ERR_PTR(-ENOENT); + } + inode = reiserfs_iget(dir->i_sb, (struct cpu_key *)&de.de_dir_id); + reiserfs_write_unlock(dir->i_sb); + + return d_obtain_alias(inode); +} + +/* add entry to the directory (entry can be hidden). + +insert definition of when hidden directories are used here -Hans + + Does not mark dir inode dirty, do it after successesfull call to it */ + +static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, + struct inode *dir, const char *name, int namelen, + struct inode *inode, int visible) +{ + struct cpu_key entry_key; + struct reiserfs_de_head *deh; + INITIALIZE_PATH(path); + struct reiserfs_dir_entry de; + DECLARE_BITMAP(bit_string, MAX_GENERATION_NUMBER + 1); + int gen_number; + + /* + * 48 bytes now and we avoid kmalloc if we + * create file with short name + */ + char small_buf[32 + DEH_SIZE]; + + char *buffer; + int buflen, paste_size; + int retval; + + BUG_ON(!th->t_trans_id); + + /* cannot allow items to be added into a busy deleted directory */ + if (!namelen) + return -EINVAL; + + if (namelen > REISERFS_MAX_NAME(dir->i_sb->s_blocksize)) + return -ENAMETOOLONG; + + /* each entry has unique key. 
compose it */ + make_cpu_key(&entry_key, dir, + get_third_component(dir->i_sb, name, namelen), + TYPE_DIRENTRY, 3); + + /* get memory for composing the entry */ + buflen = DEH_SIZE + ROUND_UP(namelen); + if (buflen > sizeof(small_buf)) { + buffer = kmalloc(buflen, GFP_NOFS); + if (!buffer) + return -ENOMEM; + } else + buffer = small_buf; + + paste_size = + (get_inode_sd_version(dir) == + STAT_DATA_V1) ? (DEH_SIZE + namelen) : buflen; + + /* + * fill buffer : directory entry head, name[, dir objectid | , + * stat data | ,stat data, dir objectid ] + */ + deh = (struct reiserfs_de_head *)buffer; + deh->deh_location = 0; /* JDM Endian safe if 0 */ + put_deh_offset(deh, cpu_key_k_offset(&entry_key)); + deh->deh_state = 0; /* JDM Endian safe if 0 */ + /* put key (ino analog) to de */ + + /* safe: k_dir_id is le */ + deh->deh_dir_id = INODE_PKEY(inode)->k_dir_id; + /* safe: k_objectid is le */ + deh->deh_objectid = INODE_PKEY(inode)->k_objectid; + + /* copy name */ + memcpy((char *)(deh + 1), name, namelen); + /* padd by 0s to the 4 byte boundary */ + padd_item((char *)(deh + 1), ROUND_UP(namelen), namelen); + + /* + * entry is ready to be pasted into tree, set 'visibility' + * and 'stat data in entry' attributes + */ + mark_de_without_sd(deh); + visible ? 
mark_de_visible(deh) : mark_de_hidden(deh); + + /* find the proper place for the new entry */ + memset(bit_string, 0, sizeof(bit_string)); + de.de_gen_number_bit_string = bit_string; + retval = reiserfs_find_entry(dir, name, namelen, &path, &de); + if (retval != NAME_NOT_FOUND) { + if (buffer != small_buf) + kfree(buffer); + pathrelse(&path); + + if (retval == IO_ERROR) { + return -EIO; + } + + if (retval != NAME_FOUND) { + reiserfs_error(dir->i_sb, "zam-7002", + "reiserfs_find_entry() returned " + "unexpected value (%d)", retval); + } + + return -EEXIST; + } + + gen_number = + find_first_zero_bit(bit_string, + MAX_GENERATION_NUMBER + 1); + if (gen_number > MAX_GENERATION_NUMBER) { + /* there is no free generation number */ + reiserfs_warning(dir->i_sb, "reiserfs-7010", + "Congratulations! we have got hash function " + "screwed up"); + if (buffer != small_buf) + kfree(buffer); + pathrelse(&path); + return -EBUSY; + } + /* adjust offset of directory enrty */ + put_deh_offset(deh, SET_GENERATION_NUMBER(deh_offset(deh), gen_number)); + set_cpu_key_k_offset(&entry_key, deh_offset(deh)); + + /* update max-hash-collisions counter in reiserfs_sb_info */ + PROC_INFO_MAX(th->t_super, max_hash_collisions, gen_number); + + /* we need to re-search for the insertion point */ + if (gen_number != 0) { + if (search_by_entry_key(dir->i_sb, &entry_key, &path, &de) != + NAME_NOT_FOUND) { + reiserfs_warning(dir->i_sb, "vs-7032", + "entry with this key (%K) already " + "exists", &entry_key); + + if (buffer != small_buf) + kfree(buffer); + pathrelse(&path); + return -EBUSY; + } + } + + /* perform the insertion of the entry that we have prepared */ + retval = + reiserfs_paste_into_item(th, &path, &entry_key, dir, buffer, + paste_size); + if (buffer != small_buf) + kfree(buffer); + if (retval) { + reiserfs_check_path(&path); + return retval; + } + + dir->i_size += paste_size; + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + if (!S_ISDIR(inode->i_mode) && visible) + /* reiserfs_mkdir or 
/*
 * utility function that does setup for reiserfs_new_inode.
 * dquot_initialize needs lots of credits so it's better to have it
 * outside of a transaction, so we had to pull some bits of
 * reiserfs_new_inode out into this func.
 *
 * @inode: newly allocated inode to prepare
 * @dir:   directory the inode is being created in; passed to
 *         inode_init_owner() together with @mode
 * @mode:  mode bits for the new inode
 *
 * Always returns 0; the callers in this file ignore the return value.
 */
static int new_inode_init(struct inode *inode, struct inode *dir, umode_t mode)
{
	/*
	 * Make inode invalid - just in case we are going to drop it before
	 * the initialization happens
	 */
	INODE_PKEY(inode)->k_objectid = 0;

	/*
	 * the quota init calls have to know who to charge the quota to, so
	 * we have to set uid and gid here
	 */
	inode_init_owner(inode, dir, mode);
	dquot_initialize(inode);
	return 0;
}
/*
 * Create a special file (device node, fifo, socket) named by @dentry in
 * directory @dir.
 *
 * Sequence: validate @rdev, allocate and pre-initialise the inode
 * outside of any transaction, take the reiserfs write lock, open a
 * transaction, create the on-disk object with reiserfs_new_inode(),
 * turn it into a special inode and finally add the directory entry.
 *
 * Returns 0 on success or a negative errno: -EINVAL for an rdev
 * rejected by new_valid_dev(), -ENOMEM if no inode could be allocated,
 * otherwise whatever the failing helper returned.
 */
static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
			  dev_t rdev)
{
	int retval;
	struct inode *inode;
	struct reiserfs_transaction_handle th;
	struct reiserfs_security_handle security;
	/*
	 * We need blocks for transaction + (user+group)*(quotas
	 * for new inode + update of quota for directory owner)
	 */
	int jbegin_count =
	    JOURNAL_PER_BALANCE_CNT * 3 +
	    2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
		 REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));

	if (!new_valid_dev(rdev))
		return -EINVAL;

	dquot_initialize(dir);

	if (!(inode = new_inode(dir->i_sb))) {
		return -ENOMEM;
	}
	new_inode_init(inode, dir, mode);

	/* both helpers return extra counts that are added to the reservation */
	jbegin_count += reiserfs_cache_default_acl(dir);
	retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
	if (retval < 0) {
		drop_new_inode(inode);
		return retval;
	}
	jbegin_count += retval;
	reiserfs_write_lock(dir->i_sb);

	retval = journal_begin(&th, dir->i_sb, jbegin_count);
	if (retval) {
		drop_new_inode(inode);
		goto out_failed;
	}

	retval =
	    reiserfs_new_inode(&th, dir, mode, NULL, 0 /*i_size */ , dentry,
			       inode, &security);
	/*
	 * NOTE(review): no drop_new_inode() on this failure path;
	 * reiserfs_symlink carries a comment saying reiserfs_new_inode
	 * iputs the inode itself on failure - confirm.
	 */
	if (retval) {
		goto out_failed;
	}

	inode->i_op = &reiserfs_special_inode_operations;
	init_special_inode(inode, inode->i_mode, rdev);

	/* FIXME: needed for block and char devices only */
	reiserfs_update_sd(&th, inode);

	reiserfs_update_inode_transaction(inode);
	reiserfs_update_inode_transaction(dir);

	retval =
	    reiserfs_add_entry(&th, dir, dentry->d_name.name,
			       dentry->d_name.len, inode, 1 /*visible */ );
	if (retval) {
		/*
		 * The entry could not be added: drop the link taken by the
		 * new object, write its stat data back, close the
		 * transaction and release the inode.  A journal_end()
		 * failure overrides the add_entry error.
		 */
		int err;
		drop_nlink(inode);
		reiserfs_update_sd(&th, inode);
		err = journal_end(&th);
		if (err)
			retval = err;
		unlock_new_inode(inode);
		iput(inode);
		goto out_failed;
	}

	unlock_new_inode(inode);
	d_instantiate(dentry, inode);
	retval = journal_end(&th);

out_failed:
	/* every path that took the write lock leaves through here */
	reiserfs_write_unlock(dir->i_sb);
	return retval;
}
drop_new_inode(inode); + return retval; + } + jbegin_count += retval; + reiserfs_write_lock(dir->i_sb); + + retval = journal_begin(&th, dir->i_sb, jbegin_count); + if (retval) { + drop_new_inode(inode); + goto out_failed; + } + + /* + * inc the link count now, so another writer doesn't overflow + * it while we sleep later on. + */ + INC_DIR_INODE_NLINK(dir) + + retval = reiserfs_new_inode(&th, dir, mode, NULL /*symlink */ , + old_format_only(dir->i_sb) ? + EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE, + dentry, inode, &security); + if (retval) { + DEC_DIR_INODE_NLINK(dir) + goto out_failed; + } + + reiserfs_update_inode_transaction(inode); + reiserfs_update_inode_transaction(dir); + + inode->i_op = &reiserfs_dir_inode_operations; + inode->i_fop = &reiserfs_dir_operations; + + /* note, _this_ add_entry will not update dir's stat data */ + retval = + reiserfs_add_entry(&th, dir, dentry->d_name.name, + dentry->d_name.len, inode, 1 /*visible */ ); + if (retval) { + int err; + clear_nlink(inode); + DEC_DIR_INODE_NLINK(dir); + reiserfs_update_sd(&th, inode); + err = journal_end(&th); + if (err) + retval = err; + unlock_new_inode(inode); + iput(inode); + goto out_failed; + } + /* the above add_entry did not update dir's stat data */ + reiserfs_update_sd(&th, dir); + + unlock_new_inode(inode); + d_instantiate(dentry, inode); + retval = journal_end(&th); +out_failed: + reiserfs_write_unlock(dir->i_sb); + return retval; +} + +static inline int reiserfs_empty_dir(struct inode *inode) +{ + /* + * we can cheat because an old format dir cannot have + * EMPTY_DIR_SIZE, and a new format dir cannot have + * EMPTY_DIR_SIZE_V1. So, if the inode is either size, + * regardless of disk format version, the directory is empty. 
+ */ + if (inode->i_size != EMPTY_DIR_SIZE && + inode->i_size != EMPTY_DIR_SIZE_V1) { + return 0; + } + return 1; +} + +static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + int retval, err; + struct inode *inode; + struct reiserfs_transaction_handle th; + int jbegin_count; + INITIALIZE_PATH(path); + struct reiserfs_dir_entry de; + + /* + * we will be doing 2 balancings and update 2 stat data, we + * change quotas of the owner of the directory and of the owner + * of the parent directory. The quota structure is possibly + * deleted only on last iput => outside of this transaction + */ + jbegin_count = + JOURNAL_PER_BALANCE_CNT * 2 + 2 + + 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); + + dquot_initialize(dir); + + reiserfs_write_lock(dir->i_sb); + retval = journal_begin(&th, dir->i_sb, jbegin_count); + if (retval) + goto out_rmdir; + + de.de_gen_number_bit_string = NULL; + if ((retval = + reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, + &path, &de)) == NAME_NOT_FOUND) { + retval = -ENOENT; + goto end_rmdir; + } else if (retval == IO_ERROR) { + retval = -EIO; + goto end_rmdir; + } + + inode = d_inode(dentry); + + reiserfs_update_inode_transaction(inode); + reiserfs_update_inode_transaction(dir); + + if (de.de_objectid != inode->i_ino) { + /* + * FIXME: compare key of an object and a key found in the entry + */ + retval = -EIO; + goto end_rmdir; + } + if (!reiserfs_empty_dir(inode)) { + retval = -ENOTEMPTY; + goto end_rmdir; + } + + /* cut entry from dir directory */ + retval = reiserfs_cut_from_item(&th, &path, &de.de_entry_key, + dir, NULL, /* page */ + 0 /*new file size - not used here */ ); + if (retval < 0) + goto end_rmdir; + + if (inode->i_nlink != 2 && inode->i_nlink != 1) + reiserfs_error(inode->i_sb, "reiserfs-7040", + "empty directory has nlink != 2 (%d)", + inode->i_nlink); + + clear_nlink(inode); + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + reiserfs_update_sd(&th, inode); + + 
DEC_DIR_INODE_NLINK(dir) + dir->i_size -= (DEH_SIZE + de.de_entrylen); + reiserfs_update_sd(&th, dir); + + /* prevent empty directory from getting lost */ + add_save_link(&th, inode, 0 /* not truncate */ ); + + retval = journal_end(&th); + reiserfs_check_path(&path); +out_rmdir: + reiserfs_write_unlock(dir->i_sb); + return retval; + +end_rmdir: + /* + * we must release path, because we did not call + * reiserfs_cut_from_item, or reiserfs_cut_from_item does not + * release path if operation was not complete + */ + pathrelse(&path); + err = journal_end(&th); + reiserfs_write_unlock(dir->i_sb); + return err ? err : retval; +} + +static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) +{ + int retval, err; + struct inode *inode; + struct reiserfs_dir_entry de; + INITIALIZE_PATH(path); + struct reiserfs_transaction_handle th; + int jbegin_count; + unsigned long savelink; + + dquot_initialize(dir); + + inode = d_inode(dentry); + + /* + * in this transaction we can be doing at max two balancings and + * update two stat datas, we change quotas of the owner of the + * directory and of the owner of the parent directory. 
The quota + * structure is possibly deleted only on iput => outside of + * this transaction + */ + jbegin_count = + JOURNAL_PER_BALANCE_CNT * 2 + 2 + + 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); + + reiserfs_write_lock(dir->i_sb); + retval = journal_begin(&th, dir->i_sb, jbegin_count); + if (retval) + goto out_unlink; + + de.de_gen_number_bit_string = NULL; + if ((retval = + reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, + &path, &de)) == NAME_NOT_FOUND) { + retval = -ENOENT; + goto end_unlink; + } else if (retval == IO_ERROR) { + retval = -EIO; + goto end_unlink; + } + + reiserfs_update_inode_transaction(inode); + reiserfs_update_inode_transaction(dir); + + if (de.de_objectid != inode->i_ino) { + /* + * FIXME: compare key of an object and a key found in the entry + */ + retval = -EIO; + goto end_unlink; + } + + if (!inode->i_nlink) { + reiserfs_warning(inode->i_sb, "reiserfs-7042", + "deleting nonexistent file (%lu), %d", + inode->i_ino, inode->i_nlink); + set_nlink(inode, 1); + } + + drop_nlink(inode); + + /* + * we schedule before doing the add_save_link call, save the link + * count so we don't race + */ + savelink = inode->i_nlink; + + retval = + reiserfs_cut_from_item(&th, &path, &de.de_entry_key, dir, NULL, + 0); + if (retval < 0) { + inc_nlink(inode); + goto end_unlink; + } + inode->i_ctime = CURRENT_TIME_SEC; + reiserfs_update_sd(&th, inode); + + dir->i_size -= (de.de_entrylen + DEH_SIZE); + dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + reiserfs_update_sd(&th, dir); + + if (!savelink) + /* prevent file from getting lost */ + add_save_link(&th, inode, 0 /* not truncate */ ); + + retval = journal_end(&th); + reiserfs_check_path(&path); + reiserfs_write_unlock(dir->i_sb); + return retval; + +end_unlink: + pathrelse(&path); + err = journal_end(&th); + reiserfs_check_path(&path); + if (err) + retval = err; +out_unlink: + reiserfs_write_unlock(dir->i_sb); + return retval; +} + +static int reiserfs_symlink(struct inode *parent_dir, + 
struct dentry *dentry, const char *symname) +{ + int retval; + struct inode *inode; + char *name; + int item_len; + struct reiserfs_transaction_handle th; + struct reiserfs_security_handle security; + int mode = S_IFLNK | S_IRWXUGO; + /* + * We need blocks for transaction + (user+group)*(quotas for + * new inode + update of quota for directory owner) + */ + int jbegin_count = + JOURNAL_PER_BALANCE_CNT * 3 + + 2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) + + REISERFS_QUOTA_TRANS_BLOCKS(parent_dir->i_sb)); + + dquot_initialize(parent_dir); + + if (!(inode = new_inode(parent_dir->i_sb))) { + return -ENOMEM; + } + new_inode_init(inode, parent_dir, mode); + + retval = reiserfs_security_init(parent_dir, inode, &dentry->d_name, + &security); + if (retval < 0) { + drop_new_inode(inode); + return retval; + } + jbegin_count += retval; + + reiserfs_write_lock(parent_dir->i_sb); + item_len = ROUND_UP(strlen(symname)); + if (item_len > MAX_DIRECT_ITEM_LEN(parent_dir->i_sb->s_blocksize)) { + retval = -ENAMETOOLONG; + drop_new_inode(inode); + goto out_failed; + } + + name = kmalloc(item_len, GFP_NOFS); + if (!name) { + drop_new_inode(inode); + retval = -ENOMEM; + goto out_failed; + } + memcpy(name, symname, strlen(symname)); + padd_item(name, item_len, strlen(symname)); + + retval = journal_begin(&th, parent_dir->i_sb, jbegin_count); + if (retval) { + drop_new_inode(inode); + kfree(name); + goto out_failed; + } + + retval = + reiserfs_new_inode(&th, parent_dir, mode, name, strlen(symname), + dentry, inode, &security); + kfree(name); + if (retval) { /* reiserfs_new_inode iputs for us */ + goto out_failed; + } + + reiserfs_update_inode_transaction(inode); + reiserfs_update_inode_transaction(parent_dir); + + inode->i_op = &reiserfs_symlink_inode_operations; + inode->i_mapping->a_ops = &reiserfs_address_space_operations; + + retval = reiserfs_add_entry(&th, parent_dir, dentry->d_name.name, + dentry->d_name.len, inode, 1 /*visible */ ); + if (retval) { + int err; + 
drop_nlink(inode); + reiserfs_update_sd(&th, inode); + err = journal_end(&th); + if (err) + retval = err; + unlock_new_inode(inode); + iput(inode); + goto out_failed; + } + + unlock_new_inode(inode); + d_instantiate(dentry, inode); + retval = journal_end(&th); +out_failed: + reiserfs_write_unlock(parent_dir->i_sb); + return retval; +} + +static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + int retval; + struct inode *inode = d_inode(old_dentry); + struct reiserfs_transaction_handle th; + /* + * We need blocks for transaction + update of quotas for + * the owners of the directory + */ + int jbegin_count = + JOURNAL_PER_BALANCE_CNT * 3 + + 2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); + + dquot_initialize(dir); + + reiserfs_write_lock(dir->i_sb); + if (inode->i_nlink >= REISERFS_LINK_MAX) { + /* FIXME: sd_nlink is 32 bit for new files */ + reiserfs_write_unlock(dir->i_sb); + return -EMLINK; + } + + /* inc before scheduling so reiserfs_unlink knows we are here */ + inc_nlink(inode); + + retval = journal_begin(&th, dir->i_sb, jbegin_count); + if (retval) { + drop_nlink(inode); + reiserfs_write_unlock(dir->i_sb); + return retval; + } + + /* create new entry */ + retval = + reiserfs_add_entry(&th, dir, dentry->d_name.name, + dentry->d_name.len, inode, 1 /*visible */ ); + + reiserfs_update_inode_transaction(inode); + reiserfs_update_inode_transaction(dir); + + if (retval) { + int err; + drop_nlink(inode); + err = journal_end(&th); + reiserfs_write_unlock(dir->i_sb); + return err ? 
err : retval; + } + + inode->i_ctime = CURRENT_TIME_SEC; + reiserfs_update_sd(&th, inode); + + ihold(inode); + d_instantiate(dentry, inode); + retval = journal_end(&th); + reiserfs_write_unlock(dir->i_sb); + return retval; +} + +/* + * de contains information pointing to an entry which is expected to + * carry the given name. The name pointer and length are recomputed on + * a copy of de; returns 1 if the length equals len and the first len + * bytes match name, 0 otherwise. + * NOTE(review): the bytes are compared through de->de_name, not the + * recomputed tmp.de_name - confirm that this is intended. + */ +static int de_still_valid(const char *name, int len, + struct reiserfs_dir_entry *de) +{ + struct reiserfs_dir_entry tmp = *de; + + /* recalculate pointer to name and name length */ + set_de_name_and_namelen(&tmp); + /* FIXME: could check more */ + if (tmp.de_namelen != len || memcmp(name, de->de_name, len)) + return 0; + return 1; +} + +/* + * Returns 1 if de still carries the given name and refers to the + * expected object, 0 otherwise. With an inode, the entry must be + * visible and its objectid must equal inode->i_ino. Without an inode + * the entry must be hidden. A wrong visibility state panics. + */ +static int entry_points_to_object(const char *name, int len, + struct reiserfs_dir_entry *de, + struct inode *inode) +{ + if (!de_still_valid(name, len, de)) + return 0; + + if (inode) { + if (!de_visible(de->de_deh + de->de_entry_num)) + reiserfs_panic(inode->i_sb, "vs-7042", + "entry must be visible"); + return (de->de_objectid == inode->i_ino) ? 1 : 0; + } + + /* this must be added hidden entry */ + if (de_visible(de->de_deh + de->de_entry_num)) + reiserfs_panic(NULL, "vs-7043", "entry must be hidden"); + + return 1; +} + +/* sets key of objectid the entry has to point to */ +static void set_ino_in_dir_entry(struct reiserfs_dir_entry *de, + struct reiserfs_key *key) +{ + /* JDM These operations are endian safe - both are le */ + de->de_deh[de->de_entry_num].deh_dir_id = key->k_dir_id; + de->de_deh[de->de_entry_num].deh_objectid = key->k_objectid; +} + +/* + * process, that is going to call fix_nodes/do_balance must hold only + * one path.
If it holds 2 or more, it can get into endless waiting in + * get_empty_nodes or its clones + */ +static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + int retval; + INITIALIZE_PATH(old_entry_path); + INITIALIZE_PATH(new_entry_path); + INITIALIZE_PATH(dot_dot_entry_path); + struct item_head new_entry_ih, old_entry_ih, dot_dot_ih; + struct reiserfs_dir_entry old_de, new_de, dot_dot_de; + struct inode *old_inode, *new_dentry_inode; + struct reiserfs_transaction_handle th; + int jbegin_count; + umode_t old_inode_mode; + unsigned long savelink = 1; + struct timespec ctime; + + /* + * three balancings: (1) old name removal, (2) new name insertion + * and (3) maybe "save" link insertion + * stat data updates: (1) old directory, + * (2) new directory and (3) maybe old object stat data (when it is + * directory) and (4) maybe stat data of object to which new entry + * pointed initially and (5) maybe block containing ".." 
of + * renamed directory + * quota updates: two parent directories + */ + jbegin_count = + JOURNAL_PER_BALANCE_CNT * 3 + 5 + + 4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb); + + dquot_initialize(old_dir); + dquot_initialize(new_dir); + + old_inode = d_inode(old_dentry); + new_dentry_inode = d_inode(new_dentry); + + /* + * make sure that oldname still exists and points to an object we + * are going to rename + */ + old_de.de_gen_number_bit_string = NULL; + reiserfs_write_lock(old_dir->i_sb); + retval = + reiserfs_find_entry(old_dir, old_dentry->d_name.name, + old_dentry->d_name.len, &old_entry_path, + &old_de); + pathrelse(&old_entry_path); + if (retval == IO_ERROR) { + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } + + if (retval != NAME_FOUND || old_de.de_objectid != old_inode->i_ino) { + reiserfs_write_unlock(old_dir->i_sb); + return -ENOENT; + } + + old_inode_mode = old_inode->i_mode; + if (S_ISDIR(old_inode_mode)) { + /* + * make sure that directory being renamed has correct ".." + * and that its new parent directory has not too many links + * already + */ + if (new_dentry_inode) { + if (!reiserfs_empty_dir(new_dentry_inode)) { + reiserfs_write_unlock(old_dir->i_sb); + return -ENOTEMPTY; + } + } + + /* + * directory is renamed, its parent directory will be changed, + * so find ".." entry + */ + dot_dot_de.de_gen_number_bit_string = NULL; + retval = + reiserfs_find_entry(old_inode, "..", 2, &dot_dot_entry_path, + &dot_dot_de); + pathrelse(&dot_dot_entry_path); + if (retval != NAME_FOUND) { + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } + + /* inode number of .. 
must equal old_dir->i_ino */ + if (dot_dot_de.de_objectid != old_dir->i_ino) { + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } + } + + retval = journal_begin(&th, old_dir->i_sb, jbegin_count); + if (retval) { + reiserfs_write_unlock(old_dir->i_sb); + return retval; + } + + /* add new entry (or find the existing one) */ + retval = + reiserfs_add_entry(&th, new_dir, new_dentry->d_name.name, + new_dentry->d_name.len, old_inode, 0); + if (retval == -EEXIST) { + if (!new_dentry_inode) { + reiserfs_panic(old_dir->i_sb, "vs-7050", + "new entry is found, new inode == 0"); + } + } else if (retval) { + int err = journal_end(&th); + reiserfs_write_unlock(old_dir->i_sb); + return err ? err : retval; + } + + reiserfs_update_inode_transaction(old_dir); + reiserfs_update_inode_transaction(new_dir); + + /* + * this makes it so an fsync on an open fd for the old name will + * commit the rename operation + */ + reiserfs_update_inode_transaction(old_inode); + + if (new_dentry_inode) + reiserfs_update_inode_transaction(new_dentry_inode); + + while (1) { + /* + * look for old name using corresponding entry key + * (found by reiserfs_find_entry) + */ + if ((retval = + search_by_entry_key(new_dir->i_sb, &old_de.de_entry_key, + &old_entry_path, + &old_de)) != NAME_FOUND) { + pathrelse(&old_entry_path); + journal_end(&th); + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } + + copy_item_head(&old_entry_ih, tp_item_head(&old_entry_path)); + + reiserfs_prepare_for_journal(old_inode->i_sb, old_de.de_bh, 1); + + /* look for new name by reiserfs_find_entry */ + new_de.de_gen_number_bit_string = NULL; + retval = + reiserfs_find_entry(new_dir, new_dentry->d_name.name, + new_dentry->d_name.len, &new_entry_path, + &new_de); + /* + * reiserfs_add_entry should not return IO_ERROR, + * because it is called with essentially same parameters from + * reiserfs_add_entry above, and we'll catch any i/o errors + * before we get here. 
+ */ + if (retval != NAME_FOUND_INVISIBLE && retval != NAME_FOUND) { + pathrelse(&new_entry_path); + pathrelse(&old_entry_path); + journal_end(&th); + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } + + copy_item_head(&new_entry_ih, tp_item_head(&new_entry_path)); + + reiserfs_prepare_for_journal(old_inode->i_sb, new_de.de_bh, 1); + + if (S_ISDIR(old_inode->i_mode)) { + if ((retval = + search_by_entry_key(new_dir->i_sb, + &dot_dot_de.de_entry_key, + &dot_dot_entry_path, + &dot_dot_de)) != NAME_FOUND) { + pathrelse(&dot_dot_entry_path); + pathrelse(&new_entry_path); + pathrelse(&old_entry_path); + journal_end(&th); + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } + copy_item_head(&dot_dot_ih, + tp_item_head(&dot_dot_entry_path)); + /* node containing ".." gets into transaction */ + reiserfs_prepare_for_journal(old_inode->i_sb, + dot_dot_de.de_bh, 1); + } + /* + * we should check seals here, not do + * this stuff, yes? Then, having + * gathered everything into RAM we + * should lock the buffers, yes? -Hans + */ + /* + * probably. our rename needs to hold more + * than one path at once. The seals would + * have to be written to deal with multi-path + * issues -chris + */ + /* + * sanity checking before doing the rename - avoid races many + * of the above checks could have scheduled. We have to be + * sure our items haven't been shifted by another process. + */ + if (item_moved(&new_entry_ih, &new_entry_path) || + !entry_points_to_object(new_dentry->d_name.name, + new_dentry->d_name.len, + &new_de, new_dentry_inode) || + item_moved(&old_entry_ih, &old_entry_path) || + !entry_points_to_object(old_dentry->d_name.name, + old_dentry->d_name.len, + &old_de, old_inode)) { + reiserfs_restore_prepared_buffer(old_inode->i_sb, + new_de.de_bh); + reiserfs_restore_prepared_buffer(old_inode->i_sb, + old_de.de_bh); + if (S_ISDIR(old_inode_mode)) + reiserfs_restore_prepared_buffer(old_inode-> + i_sb, + dot_dot_de. 
+ de_bh); + continue; + } + if (S_ISDIR(old_inode_mode)) { + if (item_moved(&dot_dot_ih, &dot_dot_entry_path) || + !entry_points_to_object("..", 2, &dot_dot_de, + old_dir)) { + reiserfs_restore_prepared_buffer(old_inode-> + i_sb, + old_de.de_bh); + reiserfs_restore_prepared_buffer(old_inode-> + i_sb, + new_de.de_bh); + reiserfs_restore_prepared_buffer(old_inode-> + i_sb, + dot_dot_de. + de_bh); + continue; + } + } + + RFALSE(S_ISDIR(old_inode_mode) && + !buffer_journal_prepared(dot_dot_de.de_bh), ""); + + break; + } + + /* + * ok, all the changes can be done in one fell swoop when we + * have claimed all the buffers needed. + */ + + mark_de_visible(new_de.de_deh + new_de.de_entry_num); + set_ino_in_dir_entry(&new_de, INODE_PKEY(old_inode)); + journal_mark_dirty(&th, new_de.de_bh); + + mark_de_hidden(old_de.de_deh + old_de.de_entry_num); + journal_mark_dirty(&th, old_de.de_bh); + ctime = CURRENT_TIME_SEC; + old_dir->i_ctime = old_dir->i_mtime = ctime; + new_dir->i_ctime = new_dir->i_mtime = ctime; + /* + * thanks to Alex Adriaanse for patch + * which adds ctime update of renamed object + */ + old_inode->i_ctime = ctime; + + if (new_dentry_inode) { + /* adjust link number of the victim */ + if (S_ISDIR(new_dentry_inode->i_mode)) { + clear_nlink(new_dentry_inode); + } else { + drop_nlink(new_dentry_inode); + } + new_dentry_inode->i_ctime = ctime; + savelink = new_dentry_inode->i_nlink; + } + + if (S_ISDIR(old_inode_mode)) { + /* adjust ".." of renamed directory */ + set_ino_in_dir_entry(&dot_dot_de, INODE_PKEY(new_dir)); + journal_mark_dirty(&th, dot_dot_de.de_bh); + + /* + * there (in new_dir) was no directory, so it got new link + * (".." of renamed directory) + */ + if (!new_dentry_inode) + INC_DIR_INODE_NLINK(new_dir); + + /* old directory lost one link - ".. " of renamed directory */ + DEC_DIR_INODE_NLINK(old_dir); + } + /* + * looks like in 2.3.99pre3 brelse is atomic. 
+ * so we can use pathrelse + */ + pathrelse(&new_entry_path); + pathrelse(&dot_dot_entry_path); + + /* + * FIXME: this reiserfs_cut_from_item's return value may screw up + * anybody, but it will panic if will not be able to find the + * entry. This needs one more clean up + */ + if (reiserfs_cut_from_item + (&th, &old_entry_path, &old_de.de_entry_key, old_dir, NULL, + 0) < 0) + reiserfs_error(old_dir->i_sb, "vs-7060", + "couldn't not cut old name. Fsck later?"); + + old_dir->i_size -= DEH_SIZE + old_de.de_entrylen; + + reiserfs_update_sd(&th, old_dir); + reiserfs_update_sd(&th, new_dir); + reiserfs_update_sd(&th, old_inode); + + if (new_dentry_inode) { + if (savelink == 0) + add_save_link(&th, new_dentry_inode, + 0 /* not truncate */ ); + reiserfs_update_sd(&th, new_dentry_inode); + } + + retval = journal_end(&th); + reiserfs_write_unlock(old_dir->i_sb); + return retval; +} + +/* directories can handle most operations... */ +const struct inode_operations reiserfs_dir_inode_operations = { + .create = reiserfs_create, + .lookup = reiserfs_lookup, + .link = reiserfs_link, + .unlink = reiserfs_unlink, + .symlink = reiserfs_symlink, + .mkdir = reiserfs_mkdir, + .rmdir = reiserfs_rmdir, + .mknod = reiserfs_mknod, + .rename = reiserfs_rename, + .setattr = reiserfs_setattr, + .setxattr = reiserfs_setxattr, + .getxattr = reiserfs_getxattr, + .listxattr = reiserfs_listxattr, + .removexattr = reiserfs_removexattr, + .permission = reiserfs_permission, + .get_acl = reiserfs_get_acl, + .set_acl = reiserfs_set_acl, +}; + +/* + * symlink operations.. 
same as page_symlink_inode_operations, with xattr + * stuff added + */ +const struct inode_operations reiserfs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .setattr = reiserfs_setattr, + .setxattr = reiserfs_setxattr, + .getxattr = reiserfs_getxattr, + .listxattr = reiserfs_listxattr, + .removexattr = reiserfs_removexattr, + .permission = reiserfs_permission, +}; + +/* + * special file operations.. just xattr/acl stuff + */ +const struct inode_operations reiserfs_special_inode_operations = { + .setattr = reiserfs_setattr, + .setxattr = reiserfs_setxattr, + .getxattr = reiserfs_getxattr, + .listxattr = reiserfs_listxattr, + .removexattr = reiserfs_removexattr, + .permission = reiserfs_permission, + .get_acl = reiserfs_get_acl, + .set_acl = reiserfs_set_acl, +}; diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c new file mode 100644 index 000000000..99a5d5dae --- /dev/null +++ b/fs/reiserfs/objectid.c @@ -0,0 +1,217 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include +#include +#include +#include "reiserfs.h" + +/* find where objectid map starts */ +#define objectid_map(s,rs) (old_format_only (s) ? \ + (__le32 *)((struct reiserfs_super_block_v1 *)(rs) + 1) :\ + (__le32 *)((rs) + 1)) + +#ifdef CONFIG_REISERFS_CHECK + +static void check_objectid_map(struct super_block *s, __le32 * map) +{ + if (le32_to_cpu(map[0]) != 1) + reiserfs_panic(s, "vs-15010", "map corrupted: %lx", + (long unsigned int)le32_to_cpu(map[0])); + + /* FIXME: add something else here */ +} + +#else +static void check_objectid_map(struct super_block *s, __le32 * map) +{; +} +#endif + +/* + * When we allocate objectids we allocate the first unused objectid. + * Each sequence of objectids in use (the odd sequences) is followed + * by a sequence of objectids not in use (the even sequences). 
We + * only need to record the last objectid in each of these sequences + * (both the odd and even sequences) in order to fully define the + * boundaries of the sequences. A consequence of allocating the first + * objectid not in use is that under most conditions this scheme is + * extremely compact. The exception is immediately after a sequence + * of operations which deletes a large number of objects of + * non-sequential objectids, and even then it will become compact + * again as soon as more objects are created. Note that many + * interesting optimizations of layout could result from complicating + * objectid assignment, but we have deferred making them for now. + */ + +/* get unique object identifier */ +__u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th) +{ + struct super_block *s = th->t_super; + struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s); + __le32 *map = objectid_map(s, rs); + __u32 unused_objectid; + + BUG_ON(!th->t_trans_id); + + check_objectid_map(s, map); + + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); + /* comment needed -Hans */ + unused_objectid = le32_to_cpu(map[1]); + if (unused_objectid == U32_MAX) { + reiserfs_warning(s, "reiserfs-15100", "no more object ids"); + reiserfs_restore_prepared_buffer(s, SB_BUFFER_WITH_SB(s)); + return 0; + } + + /* + * This incrementation allocates the first unused objectid. That + * is to say, the first entry on the objectid map is the first + * unused objectid, and by incrementing it we use it. See below + * where we check to see if we eliminated a sequence of unused + * objectids.... + */ + map[1] = cpu_to_le32(unused_objectid + 1); + + /* + * Now we check to see if we eliminated the last remaining member of + * the first even sequence (and can eliminate the sequence by + * eliminating its last objectid from oids), and can collapse the + * first two odd sequences into one sequence. If so, then the net + * result is to eliminate a pair of objectids from oids. 
We do this + * by shifting the entire map to the left. + */ + if (sb_oid_cursize(rs) > 2 && map[1] == map[2]) { + memmove(map + 1, map + 3, + (sb_oid_cursize(rs) - 3) * sizeof(__u32)); + set_sb_oid_cursize(rs, sb_oid_cursize(rs) - 2); + } + + journal_mark_dirty(th, SB_BUFFER_WITH_SB(s)); + return unused_objectid; +} + +/* makes object identifier unused */ +void reiserfs_release_objectid(struct reiserfs_transaction_handle *th, + __u32 objectid_to_release) +{ + struct super_block *s = th->t_super; + struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s); + __le32 *map = objectid_map(s, rs); + int i = 0; + + BUG_ON(!th->t_trans_id); + /*return; */ + check_objectid_map(s, map); + + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); + journal_mark_dirty(th, SB_BUFFER_WITH_SB(s)); + + /* + * start at the beginning of the objectid map (i = 0) and go to + * the end of it (i = disk_sb->s_oid_cursize). Linear search is + * what we use, though it is possible that binary search would be + * more efficient after performing lots of deletions (which is + * when oids is large.) We only check even i's. + */ + while (i < sb_oid_cursize(rs)) { + if (objectid_to_release == le32_to_cpu(map[i])) { + /* This incrementation unallocates the objectid. */ + le32_add_cpu(&map[i], 1); + + /* + * Did we unallocate the last member of an + * odd sequence, and can shrink oids? 
+ */ + if (map[i] == map[i + 1]) { + /* shrink objectid map */ + memmove(map + i, map + i + 2, + (sb_oid_cursize(rs) - i - + 2) * sizeof(__u32)); + set_sb_oid_cursize(rs, sb_oid_cursize(rs) - 2); + + RFALSE(sb_oid_cursize(rs) < 2 || + sb_oid_cursize(rs) > sb_oid_maxsize(rs), + "vs-15005: objectid map corrupted cur_size == %d (max == %d)", + sb_oid_cursize(rs), sb_oid_maxsize(rs)); + } + return; + } + + if (objectid_to_release > le32_to_cpu(map[i]) && + objectid_to_release < le32_to_cpu(map[i + 1])) { + /* size of objectid map is not changed */ + if (objectid_to_release + 1 == le32_to_cpu(map[i + 1])) { + le32_add_cpu(&map[i + 1], -1); + return; + } + + /* + * JDM comparing two little-endian values for + * equality -- safe + */ + /* + * objectid map must be expanded, but + * there is no space + */ + if (sb_oid_cursize(rs) == sb_oid_maxsize(rs)) { + PROC_INFO_INC(s, leaked_oid); + return; + } + + /* expand the objectid map */ + memmove(map + i + 3, map + i + 1, + (sb_oid_cursize(rs) - i - 1) * sizeof(__u32)); + map[i + 1] = cpu_to_le32(objectid_to_release); + map[i + 2] = cpu_to_le32(objectid_to_release + 1); + set_sb_oid_cursize(rs, sb_oid_cursize(rs) + 2); + return; + } + i += 2; + } + + reiserfs_error(s, "vs-15011", "tried to free free object id (%lu)", + (long unsigned)objectid_to_release); +} + +int reiserfs_convert_objectid_map_v1(struct super_block *s) +{ + struct reiserfs_super_block *disk_sb = SB_DISK_SUPER_BLOCK(s); + int cur_size = sb_oid_cursize(disk_sb); + int new_size = (s->s_blocksize - SB_SIZE) / sizeof(__u32) / 2 * 2; + int old_max = sb_oid_maxsize(disk_sb); + struct reiserfs_super_block_v1 *disk_sb_v1; + __le32 *objectid_map, *new_objectid_map; + int i; + + disk_sb_v1 = + (struct reiserfs_super_block_v1 *)(SB_BUFFER_WITH_SB(s)->b_data); + objectid_map = (__le32 *) (disk_sb_v1 + 1); + new_objectid_map = (__le32 *) (disk_sb + 1); + + if (cur_size > new_size) { + /* + * mark everyone used that was listed as free at + * the end of the objectid map + */ 
+ objectid_map[new_size - 1] = objectid_map[cur_size - 1]; + set_sb_oid_cursize(disk_sb, new_size); + } + /* move the smaller objectid map past the end of the new super */ + for (i = new_size - 1; i >= 0; i--) { + objectid_map[i + (old_max - new_size)] = objectid_map[i]; + } + + /* set the max size so we don't overflow later */ + set_sb_oid_maxsize(disk_sb, new_size); + + /* Zero out label and generate random UUID */ + memset(disk_sb->s_label, 0, sizeof(disk_sb->s_label)); + generate_random_uuid(disk_sb->s_uuid); + + /* finally, zero out the unused chunk of the new super */ + memset(disk_sb->s_unused, 0, sizeof(disk_sb->s_unused)); + return 0; +} diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c new file mode 100644 index 000000000..ae1dc841d --- /dev/null +++ b/fs/reiserfs/prints.c @@ -0,0 +1,777 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#include +#include +#include "reiserfs.h" +#include +#include + +#include + +static char error_buf[1024]; +static char fmt_buf[1024]; +static char off_buf[80]; + +static char *reiserfs_cpu_offset(struct cpu_key *key) +{ + if (cpu_key_k_type(key) == TYPE_DIRENTRY) + sprintf(off_buf, "%llu(%llu)", + (unsigned long long) + GET_HASH_VALUE(cpu_key_k_offset(key)), + (unsigned long long) + GET_GENERATION_NUMBER(cpu_key_k_offset(key))); + else + sprintf(off_buf, "0x%Lx", + (unsigned long long)cpu_key_k_offset(key)); + return off_buf; +} + +static char *le_offset(struct reiserfs_key *key) +{ + int version; + + version = le_key_version(key); + if (le_key_k_type(version, key) == TYPE_DIRENTRY) + sprintf(off_buf, "%llu(%llu)", + (unsigned long long) + GET_HASH_VALUE(le_key_k_offset(version, key)), + (unsigned long long) + GET_GENERATION_NUMBER(le_key_k_offset(version, key))); + else + sprintf(off_buf, "0x%Lx", + (unsigned long long)le_key_k_offset(version, key)); + return off_buf; +} + +static char *cpu_type(struct cpu_key *key) +{ + if (cpu_key_k_type(key) == TYPE_STAT_DATA) + return "SD"; 
+ if (cpu_key_k_type(key) == TYPE_DIRENTRY) + return "DIR"; + if (cpu_key_k_type(key) == TYPE_DIRECT) + return "DIRECT"; + if (cpu_key_k_type(key) == TYPE_INDIRECT) + return "IND"; + return "UNKNOWN"; +} + +static char *le_type(struct reiserfs_key *key) +{ + int version; + + version = le_key_version(key); + + if (le_key_k_type(version, key) == TYPE_STAT_DATA) + return "SD"; + if (le_key_k_type(version, key) == TYPE_DIRENTRY) + return "DIR"; + if (le_key_k_type(version, key) == TYPE_DIRECT) + return "DIRECT"; + if (le_key_k_type(version, key) == TYPE_INDIRECT) + return "IND"; + return "UNKNOWN"; +} + +/* %k */ +static void sprintf_le_key(char *buf, struct reiserfs_key *key) +{ + if (key) + sprintf(buf, "[%d %d %s %s]", le32_to_cpu(key->k_dir_id), + le32_to_cpu(key->k_objectid), le_offset(key), + le_type(key)); + else + sprintf(buf, "[NULL]"); +} + +/* %K */ +static void sprintf_cpu_key(char *buf, struct cpu_key *key) +{ + if (key) + sprintf(buf, "[%d %d %s %s]", key->on_disk_key.k_dir_id, + key->on_disk_key.k_objectid, reiserfs_cpu_offset(key), + cpu_type(key)); + else + sprintf(buf, "[NULL]"); +} + +static void sprintf_de_head(char *buf, struct reiserfs_de_head *deh) +{ + if (deh) + sprintf(buf, + "[offset=%d dir_id=%d objectid=%d location=%d state=%04x]", + deh_offset(deh), deh_dir_id(deh), deh_objectid(deh), + deh_location(deh), deh_state(deh)); + else + sprintf(buf, "[NULL]"); + +} + +static void sprintf_item_head(char *buf, struct item_head *ih) +{ + if (ih) { + strcpy(buf, + (ih_version(ih) == KEY_FORMAT_3_6) ? "*3.6* " : "*3.5*"); + sprintf_le_key(buf + strlen(buf), &(ih->ih_key)); + sprintf(buf + strlen(buf), ", item_len %d, item_location %d, " + "free_space(entry_count) %d", + ih_item_len(ih), ih_location(ih), ih_free_space(ih)); + } else + sprintf(buf, "[NULL]"); +} + +static void sprintf_direntry(char *buf, struct reiserfs_dir_entry *de) +{ + char name[20]; + + memcpy(name, de->de_name, de->de_namelen > 19 ? 
19 : de->de_namelen); + name[de->de_namelen > 19 ? 19 : de->de_namelen] = 0; + sprintf(buf, "\"%s\"==>[%d %d]", name, de->de_dir_id, de->de_objectid); +} + +static void sprintf_block_head(char *buf, struct buffer_head *bh) +{ + sprintf(buf, "level=%d, nr_items=%d, free_space=%d rdkey ", + B_LEVEL(bh), B_NR_ITEMS(bh), B_FREE_SPACE(bh)); +} + +static void sprintf_buffer_head(char *buf, struct buffer_head *bh) +{ + char b[BDEVNAME_SIZE]; + + sprintf(buf, + "dev %s, size %zd, blocknr %llu, count %d, state 0x%lx, page %p, (%s, %s, %s)", + bdevname(bh->b_bdev, b), bh->b_size, + (unsigned long long)bh->b_blocknr, atomic_read(&(bh->b_count)), + bh->b_state, bh->b_page, + buffer_uptodate(bh) ? "UPTODATE" : "!UPTODATE", + buffer_dirty(bh) ? "DIRTY" : "CLEAN", + buffer_locked(bh) ? "LOCKED" : "UNLOCKED"); +} + +static void sprintf_disk_child(char *buf, struct disk_child *dc) +{ + sprintf(buf, "[dc_number=%d, dc_size=%u]", dc_block_number(dc), + dc_size(dc)); +} + +static char *is_there_reiserfs_struct(char *fmt, int *what) +{ + char *k = fmt; + + while ((k = strchr(k, '%')) != NULL) { + if (k[1] == 'k' || k[1] == 'K' || k[1] == 'h' || k[1] == 't' || + k[1] == 'z' || k[1] == 'b' || k[1] == 'y' || k[1] == 'a') { + *what = k[1]; + break; + } + k++; + } + return k; +} + +/* + * debugging reiserfs we used to print out a lot of different + * variables, like keys, item headers, buffer heads etc. Values of + * most fields matter. So it took a long time just to write + * an appropriate printk. With this reiserfs_warning you can use format + * specification for complex structures like you used to do with + * printfs for integers, doubles and pointers.
For instance, to print + * out key structure you have to write just: + * reiserfs_warning ("bad key %k", key); + * instead of + * printk ("bad key %lu %lu %lu %lu", key->k_dir_id, key->k_objectid, + * key->k_offset, key->k_uniqueness); + */ +static DEFINE_SPINLOCK(error_lock); +static void prepare_error_buf(const char *fmt, va_list args) +{ + char *fmt1 = fmt_buf; + char *k; + char *p = error_buf; + int what; + + spin_lock(&error_lock); + + strcpy(fmt1, fmt); + + while ((k = is_there_reiserfs_struct(fmt1, &what)) != NULL) { + *k = 0; + + p += vsprintf(p, fmt1, args); + + switch (what) { + case 'k': + sprintf_le_key(p, va_arg(args, struct reiserfs_key *)); + break; + case 'K': + sprintf_cpu_key(p, va_arg(args, struct cpu_key *)); + break; + case 'h': + sprintf_item_head(p, va_arg(args, struct item_head *)); + break; + case 't': + sprintf_direntry(p, + va_arg(args, + struct reiserfs_dir_entry *)); + break; + case 'y': + sprintf_disk_child(p, + va_arg(args, struct disk_child *)); + break; + case 'z': + sprintf_block_head(p, + va_arg(args, struct buffer_head *)); + break; + case 'b': + sprintf_buffer_head(p, + va_arg(args, struct buffer_head *)); + break; + case 'a': + sprintf_de_head(p, + va_arg(args, + struct reiserfs_de_head *)); + break; + } + + p += strlen(p); + fmt1 = k + 2; + } + vsprintf(p, fmt1, args); + spin_unlock(&error_lock); + +} + +/* + * in addition to usual conversion specifiers this accepts reiserfs + * specific conversion specifiers: + * %k to print little endian key, + * %K to print cpu key, + * %h to print item_head, + * %t to print directory entry, + * %y to print disk_child, + * %z to print block head (arg must be struct buffer_head *), + * %b to print buffer_head, + * %a to print directory entry head (struct reiserfs_de_head *) + */ + +#define do_reiserfs_warning(fmt)\ +{\ + va_list args;\ + va_start( args, fmt );\ + prepare_error_buf( fmt, args );\ + va_end( args );\ +} + +void __reiserfs_warning(struct super_block *sb, const char *id, + const char *function, const char *fmt, ...)
+{ + do_reiserfs_warning(fmt); + if (sb) + printk(KERN_WARNING "REISERFS warning (device %s): %s%s%s: " + "%s\n", sb->s_id, id ? id : "", id ? " " : "", + function, error_buf); + else + printk(KERN_WARNING "REISERFS warning: %s%s%s: %s\n", + id ? id : "", id ? " " : "", function, error_buf); +} + +/* No newline.. reiserfs_info calls can be followed by printk's */ +void reiserfs_info(struct super_block *sb, const char *fmt, ...) +{ + do_reiserfs_warning(fmt); + if (sb) + printk(KERN_NOTICE "REISERFS (device %s): %s", + sb->s_id, error_buf); + else + printk(KERN_NOTICE "REISERFS %s:", error_buf); +} + +/* + * No newline.. reiserfs_printk calls can be followed by printk's. + * fmt and its arguments are expanded into error_buf first; the result + * is then printed through "%s" so that a '%' contained in the already + * formatted text is not interpreted as a conversion specifier again. + */ +static void reiserfs_printk(const char *fmt, ...) +{ + do_reiserfs_warning(fmt); + printk("%s", error_buf); +} + +void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...) +{ +#ifdef CONFIG_REISERFS_CHECK + do_reiserfs_warning(fmt); + if (s) + printk(KERN_DEBUG "REISERFS debug (device %s): %s\n", + s->s_id, error_buf); + else + printk(KERN_DEBUG "REISERFS debug: %s\n", error_buf); +#endif +} + +/* + * The format: + * + * maintainer-errorid: [function-name:] message + * + * where errorid is unique to the maintainer and function-name is + * optional, is recommended, so that anyone can easily find the bug + * with a simple grep for the short to type string + * maintainer-errorid. Don't bother with reusing errorids, there are + * lots of numbers out there. + * + * Example: + * + * reiserfs_panic( + * p_sb, "reiser-29: reiserfs_new_blocknrs: " + * "one of search_start or rn(%d) is equal to MAX_B_NUM," + * "which means that we are optimizing location based on the " + * "bogus location of a temp buffer (%p).", + * rn, bh + * ); + * + * Regular panic()s sometimes clear the screen before the message can + * be read, thus the need for the while loop.
+ * + * Numbering scheme for panic used by Vladimir and Anatoly (Hans completely + * ignores this scheme, and considers it pointless complexity): + * + * panics in reiserfs_fs.h have numbers from 1000 to 1999 + * super.c 2000 to 2999 + * preserve.c (unused) 3000 to 3999 + * bitmap.c 4000 to 4999 + * stree.c 5000 to 5999 + * prints.c 6000 to 6999 + * namei.c 7000 to 7999 + * fix_node.c 8000 to 8999 + * dir.c 9000 to 9999 + * lbalance.c 10000 to 10999 + * ibalance.c 11000 to 11999 not ready + * do_balan.c 12000 to 12999 + * inode.c 13000 to 13999 + * file.c 14000 to 14999 + * objectid.c 15000 - 15999 + * buffer.c 16000 - 16999 + * symlink.c 17000 - 17999 + * + * . */ + +void __reiserfs_panic(struct super_block *sb, const char *id, + const char *function, const char *fmt, ...) +{ + do_reiserfs_warning(fmt); + +#ifdef CONFIG_REISERFS_CHECK + dump_stack(); +#endif + if (sb) + printk(KERN_WARNING "REISERFS panic (device %s): %s%s%s: %s\n", + sb->s_id, id ? id : "", id ? " " : "", + function, error_buf); + else + printk(KERN_WARNING "REISERFS panic: %s%s%s: %s\n", + id ? id : "", id ? " " : "", function, error_buf); + BUG(); +} + +void __reiserfs_error(struct super_block *sb, const char *id, + const char *function, const char *fmt, ...) +{ + do_reiserfs_warning(fmt); + + BUG_ON(sb == NULL); + + if (reiserfs_error_panic(sb)) + __reiserfs_panic(sb, id, function, error_buf); + + if (id && id[0]) + printk(KERN_CRIT "REISERFS error (device %s): %s %s: %s\n", + sb->s_id, id, function, error_buf); + else + printk(KERN_CRIT "REISERFS error (device %s): %s: %s\n", + sb->s_id, function, error_buf); + + if (sb->s_flags & MS_RDONLY) + return; + + reiserfs_info(sb, "Remounting filesystem read-only\n"); + sb->s_flags |= MS_RDONLY; + reiserfs_abort_journal(sb, -EIO); +} + +void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...)
+{ + do_reiserfs_warning(fmt); + + if (reiserfs_error_panic(sb)) { + panic(KERN_CRIT "REISERFS panic (device %s): %s\n", sb->s_id, + error_buf); + } + + if (reiserfs_is_journal_aborted(SB_JOURNAL(sb))) + return; + + printk(KERN_CRIT "REISERFS abort (device %s): %s\n", sb->s_id, + error_buf); + + sb->s_flags |= MS_RDONLY; + reiserfs_abort_journal(sb, errno); +} + +/* + * this prints internal nodes (4 keys/items in line) (dc_number, + * dc_size)[k_dirid, k_objectid, k_offset, k_uniqueness](dc_number, + * dc_size)... + */ +static int print_internal(struct buffer_head *bh, int first, int last) +{ + struct reiserfs_key *key; + struct disk_child *dc; + int i; + int from, to; + + if (!B_IS_KEYS_LEVEL(bh)) + return 1; + + check_internal(bh); + + if (first == -1) { + from = 0; + to = B_NR_ITEMS(bh); + } else { + from = first; + to = last < B_NR_ITEMS(bh) ? last : B_NR_ITEMS(bh); + } + + reiserfs_printk("INTERNAL NODE (%ld) contains %z\n", bh->b_blocknr, bh); + + dc = B_N_CHILD(bh, from); + reiserfs_printk("PTR %d: %y ", from, dc); + + for (i = from, key = internal_key(bh, from), dc++; i < to; + i++, key++, dc++) { + reiserfs_printk("KEY %d: %k PTR %d: %y ", i, key, i + 1, dc); + if (i && i % 4 == 0) + printk("\n"); + } + printk("\n"); + return 0; +} + +static int print_leaf(struct buffer_head *bh, int print_mode, int first, + int last) +{ + struct block_head *blkh; + struct item_head *ih; + int i, nr; + int from, to; + + if (!B_IS_ITEMS_LEVEL(bh)) + return 1; + + check_leaf(bh); + + blkh = B_BLK_HEAD(bh); + ih = item_head(bh, 0); + nr = blkh_nr_item(blkh); + + printk + ("\n===================================================================\n"); + reiserfs_printk("LEAF NODE (%ld) contains %z\n", bh->b_blocknr, bh); + + if (!(print_mode & PRINT_LEAF_ITEMS)) { + reiserfs_printk("FIRST ITEM_KEY: %k, LAST ITEM KEY: %k\n", + &(ih->ih_key), &((ih + nr - 1)->ih_key)); + return 0; + } + + if (first < 0 || first > nr - 1) + from = 0; + else + from = first; + + if (last < 0 || last > 
nr) + to = nr; + else + to = last; + + ih += from; + printk + ("-------------------------------------------------------------------------------\n"); + printk + ("|##| type | key | ilen | free_space | version | loc |\n"); + for (i = from; i < to; i++, ih++) { + printk + ("-------------------------------------------------------------------------------\n"); + reiserfs_printk("|%2d| %h |\n", i, ih); + if (print_mode & PRINT_LEAF_ITEMS) + op_print_item(ih, ih_item_body(bh, ih)); + } + + printk + ("===================================================================\n"); + + return 0; +} + +char *reiserfs_hashname(int code) +{ + if (code == YURA_HASH) + return "rupasov"; + if (code == TEA_HASH) + return "tea"; + if (code == R5_HASH) + return "r5"; + + return "unknown"; +} + +/* return 1 if this is not super block */ +static int print_super_block(struct buffer_head *bh) +{ + struct reiserfs_super_block *rs = + (struct reiserfs_super_block *)(bh->b_data); + int skipped, data_blocks; + char *version; + char b[BDEVNAME_SIZE]; + + if (is_reiserfs_3_5(rs)) { + version = "3.5"; + } else if (is_reiserfs_3_6(rs)) { + version = "3.6"; + } else if (is_reiserfs_jr(rs)) { + version = ((sb_version(rs) == REISERFS_VERSION_2) ? + "3.6" : "3.5"); + } else { + return 1; + } + + printk("%s\'s super block is in block %llu\n", bdevname(bh->b_bdev, b), + (unsigned long long)bh->b_blocknr); + printk("Reiserfs version %s\n", version); + printk("Block count %u\n", sb_block_count(rs)); + printk("Blocksize %d\n", sb_blocksize(rs)); + printk("Free blocks %u\n", sb_free_blocks(rs)); + /* + * FIXME: this would be confusing if + * someone stores reiserfs super block in some data block ;) +// skipped = (bh->b_blocknr * bh->b_size) / sb_blocksize(rs); + */ + skipped = bh->b_blocknr; + data_blocks = sb_block_count(rs) - skipped - 1 - sb_bmap_nr(rs) - + (!is_reiserfs_jr(rs) ? 
sb_jp_journal_size(rs) + + 1 : sb_reserved_for_journal(rs)) - sb_free_blocks(rs); + printk + ("Busy blocks (skipped %d, bitmaps - %d, journal (or reserved) blocks - %d\n" + "1 super block, %d data blocks\n", skipped, sb_bmap_nr(rs), + (!is_reiserfs_jr(rs) ? (sb_jp_journal_size(rs) + 1) : + sb_reserved_for_journal(rs)), data_blocks); + printk("Root block %u\n", sb_root_block(rs)); + printk("Journal block (first) %d\n", sb_jp_journal_1st_block(rs)); + printk("Journal dev %d\n", sb_jp_journal_dev(rs)); + printk("Journal orig size %d\n", sb_jp_journal_size(rs)); + printk("FS state %d\n", sb_fs_state(rs)); + printk("Hash function \"%s\"\n", + reiserfs_hashname(sb_hash_function_code(rs))); + + printk("Tree height %d\n", sb_tree_height(rs)); + return 0; +} + +static int print_desc_block(struct buffer_head *bh) +{ + struct reiserfs_journal_desc *desc; + + if (memcmp(get_journal_desc_magic(bh), JOURNAL_DESC_MAGIC, 8)) + return 1; + + desc = (struct reiserfs_journal_desc *)(bh->b_data); + printk("Desc block %llu (j_trans_id %d, j_mount_id %d, j_len %d)", + (unsigned long long)bh->b_blocknr, get_desc_trans_id(desc), + get_desc_mount_id(desc), get_desc_trans_len(desc)); + + return 0; +} +/* ..., int print_mode, int first, int last) */ +void print_block(struct buffer_head *bh, ...) 
+{ + va_list args; + int mode, first, last; + + if (!bh) { + printk("print_block: buffer is NULL\n"); + return; + } + + va_start(args, bh); + + mode = va_arg(args, int); + first = va_arg(args, int); + last = va_arg(args, int); + if (print_leaf(bh, mode, first, last)) + if (print_internal(bh, first, last)) + if (print_super_block(bh)) + if (print_desc_block(bh)) + printk + ("Block %llu contains unformatted data\n", + (unsigned long long)bh->b_blocknr); + + va_end(args); +} + +static char print_tb_buf[2048]; + +/* this stores initial state of tree balance in the print_tb_buf */ +void store_print_tb(struct tree_balance *tb) +{ + int h = 0; + int i; + struct buffer_head *tbSh, *tbFh; + + if (!tb) + return; + + sprintf(print_tb_buf, "\n" + "BALANCING %d\n" + "MODE=%c, ITEM_POS=%d POS_IN_ITEM=%d\n" + "=====================================================================\n" + "* h * S * L * R * F * FL * FR * CFL * CFR *\n", + REISERFS_SB(tb->tb_sb)->s_do_balance, + tb->tb_mode, PATH_LAST_POSITION(tb->tb_path), + tb->tb_path->pos_in_item); + + for (h = 0; h < ARRAY_SIZE(tb->insert_size); h++) { + if (PATH_H_PATH_OFFSET(tb->tb_path, h) <= + tb->tb_path->path_length + && PATH_H_PATH_OFFSET(tb->tb_path, + h) > ILLEGAL_PATH_ELEMENT_OFFSET) { + tbSh = PATH_H_PBUFFER(tb->tb_path, h); + tbFh = PATH_H_PPARENT(tb->tb_path, h); + } else { + tbSh = NULL; + tbFh = NULL; + } + sprintf(print_tb_buf + strlen(print_tb_buf), + "* %d * %3lld(%2d) * %3lld(%2d) * %3lld(%2d) * %5lld * %5lld * %5lld * %5lld * %5lld *\n", + h, + (tbSh) ? (long long)(tbSh->b_blocknr) : (-1LL), + (tbSh) ? atomic_read(&tbSh->b_count) : -1, + (tb->L[h]) ? (long long)(tb->L[h]->b_blocknr) : (-1LL), + (tb->L[h]) ? atomic_read(&tb->L[h]->b_count) : -1, + (tb->R[h]) ? (long long)(tb->R[h]->b_blocknr) : (-1LL), + (tb->R[h]) ? atomic_read(&tb->R[h]->b_count) : -1, + (tbFh) ? (long long)(tbFh->b_blocknr) : (-1LL), + (tb->FL[h]) ? (long long)(tb->FL[h]-> + b_blocknr) : (-1LL), + (tb->FR[h]) ? 
(long long)(tb->FR[h]-> + b_blocknr) : (-1LL), + (tb->CFL[h]) ? (long long)(tb->CFL[h]-> + b_blocknr) : (-1LL), + (tb->CFR[h]) ? (long long)(tb->CFR[h]-> + b_blocknr) : (-1LL)); + } + + sprintf(print_tb_buf + strlen(print_tb_buf), + "=====================================================================\n" + "* h * size * ln * lb * rn * rb * blkn * s0 * s1 * s1b * s2 * s2b * curb * lk * rk *\n" + "* 0 * %4d * %2d * %2d * %2d * %2d * %4d * %2d * %2d * %3d * %2d * %3d * %4d * %2d * %2d *\n", + tb->insert_size[0], tb->lnum[0], tb->lbytes, tb->rnum[0], + tb->rbytes, tb->blknum[0], tb->s0num, tb->snum[0], + tb->sbytes[0], tb->snum[1], tb->sbytes[1], + tb->cur_blknum, tb->lkey[0], tb->rkey[0]); + + /* this prints balance parameters for non-leaf levels */ + h = 0; + do { + h++; + sprintf(print_tb_buf + strlen(print_tb_buf), + "* %d * %4d * %2d * * %2d * * %2d *\n", + h, tb->insert_size[h], tb->lnum[h], tb->rnum[h], + tb->blknum[h]); + } while (tb->insert_size[h]); + + sprintf(print_tb_buf + strlen(print_tb_buf), + "=====================================================================\n" + "FEB list: "); + + /* print FEB list (list of buffers in form (bh (b_blocknr, b_count), that will be used for new nodes) */ + h = 0; + for (i = 0; i < ARRAY_SIZE(tb->FEB); i++) + sprintf(print_tb_buf + strlen(print_tb_buf), + "%p (%llu %d)%s", tb->FEB[i], + tb->FEB[i] ? (unsigned long long)tb->FEB[i]-> + b_blocknr : 0ULL, + tb->FEB[i] ? atomic_read(&tb->FEB[i]->b_count) : 0, + (i == ARRAY_SIZE(tb->FEB) - 1) ? 
"\n" : ", "); + + sprintf(print_tb_buf + strlen(print_tb_buf), + "======================== the end ====================================\n"); +} + +void print_cur_tb(char *mes) +{ + printk("%s\n%s", mes, print_tb_buf); +} + +static void check_leaf_block_head(struct buffer_head *bh) +{ + struct block_head *blkh; + int nr; + + blkh = B_BLK_HEAD(bh); + nr = blkh_nr_item(blkh); + if (nr > (bh->b_size - BLKH_SIZE) / IH_SIZE) + reiserfs_panic(NULL, "vs-6010", "invalid item number %z", + bh); + if (blkh_free_space(blkh) > bh->b_size - BLKH_SIZE - IH_SIZE * nr) + reiserfs_panic(NULL, "vs-6020", "invalid free space %z", + bh); + +} + +static void check_internal_block_head(struct buffer_head *bh) +{ + struct block_head *blkh; + + blkh = B_BLK_HEAD(bh); + if (!(B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL && B_LEVEL(bh) <= MAX_HEIGHT)) + reiserfs_panic(NULL, "vs-6025", "invalid level %z", bh); + + if (B_NR_ITEMS(bh) > (bh->b_size - BLKH_SIZE) / IH_SIZE) + reiserfs_panic(NULL, "vs-6030", "invalid item number %z", bh); + + if (B_FREE_SPACE(bh) != + bh->b_size - BLKH_SIZE - KEY_SIZE * B_NR_ITEMS(bh) - + DC_SIZE * (B_NR_ITEMS(bh) + 1)) + reiserfs_panic(NULL, "vs-6040", "invalid free space %z", bh); + +} + +void check_leaf(struct buffer_head *bh) +{ + int i; + struct item_head *ih; + + if (!bh) + return; + check_leaf_block_head(bh); + for (i = 0, ih = item_head(bh, 0); i < B_NR_ITEMS(bh); i++, ih++) + op_check_item(ih, ih_item_body(bh, ih)); +} + +void check_internal(struct buffer_head *bh) +{ + if (!bh) + return; + check_internal_block_head(bh); +} + +void print_statistics(struct super_block *s) +{ + + /* + printk ("reiserfs_put_super: session statistics: balances %d, fix_nodes %d, \ + bmap with search %d, without %d, dir2ind %d, ind2dir %d\n", + REISERFS_SB(s)->s_do_balance, REISERFS_SB(s)->s_fix_nodes, + REISERFS_SB(s)->s_bmaps, REISERFS_SB(s)->s_bmaps_without_search, + REISERFS_SB(s)->s_direct2indirect, REISERFS_SB(s)->s_indirect2direct); + */ + +} diff --git a/fs/reiserfs/procfs.c 
b/fs/reiserfs/procfs.c
new file mode 100644
index 000000000..621b9f381
--- /dev/null
+++ b/fs/reiserfs/procfs.c
@@ -0,0 +1,508 @@
+/* -*- linux-c -*- */
+
+/* fs/reiserfs/procfs.c */
+
+/*
+ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
+ */
+
+/* proc info support a la one created by Sizif@Botik.RU for PGC */
+
+#include
+#include
+#include
+#include
+#include "reiserfs.h"
+#include
+#include
+
+/*
+ * LOCKING:
+ *
+ * These guys are evicted from procfs as the very first step in ->kill_sb().
+ *
+ */
+/*
+ * show_version() - "version" file: prints the on-disk format ("3.6", "3.5"
+ * or "unknown", taken from s_properties) and whether the module was built
+ * with CONFIG_REISERFS_CHECK.
+ *
+ * Like every show_*() routine in this file it takes the super block from
+ * m->private and always returns 0.
+ */
+static int show_version(struct seq_file *m, void *unused)
+{
+	struct super_block *sb = m->private;
+	char *format;
+
+	if (REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_6)) {
+		format = "3.6";
+	} else if (REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_5)) {
+		format = "3.5";
+	} else {
+		format = "unknown";
+	}
+
+	seq_printf(m, "%s format\twith checks %s\n", format,
+#if defined( CONFIG_REISERFS_CHECK )
+		   "on"
+#else
+		   "off"
+#endif
+	    );
+	return 0;
+}
+/*
+ * Field-access shorthands for the show_*() routines below.  They are not
+ * self-contained: each one expands to an expression that uses a local
+ * variable of the calling function by name.
+ *
+ *   SF/SFP/SFPF/SFPJ/JF  need "r"     (struct reiserfs_sb_info *)
+ *   SFPL                 needs "r" and "level"
+ *   DF/DFL/DJF           need "rs"    (struct reiserfs_super_block *)
+ *   MAP                  needs "sb" and "rs"
+ *   DJP                  needs "jp"   (struct journal_params *)
+ *   DJV                  needs "s_v1"; it is not used in the code visible
+ *                        here
+ *
+ * D2C/D4C convert little-endian 16/32-bit on-disk values to CPU order.
+ */
+#define SF( x ) ( r -> x )
+#define SFP( x ) SF( s_proc_info_data.x )
+#define SFPL( x ) SFP( x[ level ] )
+#define SFPF( x ) SFP( scan_bitmap.x )
+#define SFPJ( x ) SFP( journal.x )
+
+#define D2C( x ) le16_to_cpu( x )
+#define D4C( x ) le32_to_cpu( x )
+#define DF( x ) D2C( rs -> s_v1.x )
+#define DFL( x ) D4C( rs -> s_v1.x )
+/*
+ * objectid_map() yields the __le32 array that directly follows the super
+ * block: after the v1 structure for old-format filesystems, after the full
+ * structure otherwise.  MAP(i) is entry i of that array in CPU byte order.
+ */
+#define objectid_map( s, rs ) (old_format_only (s) ? \
+                         (__le32 *)((struct reiserfs_super_block_v1 *)rs + 1) : \
+			 (__le32 *)(rs + 1))
+#define MAP( i ) D4C( objectid_map( sb, rs )[ i ] )
+
+#define DJF( x ) le32_to_cpu( rs -> x )
+#define DJV( x ) le32_to_cpu( s_v1 -> x )
+#define DJP( x ) le32_to_cpu( jp -> x )
+#define JF( x ) ( r -> s_journal -> x )
+/*
+ * show_super() - "super" file: mount state, active mount options and the
+ * in-core counters kept in struct reiserfs_sb_info and its
+ * s_proc_info_data member.
+ */
+static int show_super(struct seq_file *m, void *unused)
+{
+	struct super_block *sb = m->private;
+	struct reiserfs_sb_info *r = REISERFS_SB(sb);
+
+	seq_printf(m, "state: \t%s\n"
+		   "mount options: \t%s%s%s%s%s%s%s%s%s%s%s\n"
+		   "gen. counter: \t%i\n"
+		   "s_disk_reads: \t%i\n"
+		   "s_disk_writes: \t%i\n"
+		   "s_fix_nodes: \t%i\n"
+		   "s_do_balance: \t%i\n"
+		   "s_unneeded_left_neighbor: \t%i\n"
+		   "s_good_search_by_key_reada: \t%i\n"
+		   "s_bmaps: \t%i\n"
+		   "s_bmaps_without_search: \t%i\n"
+		   "s_direct2indirect: \t%i\n"
+		   "s_indirect2direct: \t%i\n"
+		   "\n"
+		   "max_hash_collisions: \t%i\n"
+		   "breads: \t%lu\n"
+		   "bread_misses: \t%lu\n"
+		   "search_by_key: \t%lu\n"
+		   "search_by_key_fs_changed: \t%lu\n"
+		   "search_by_key_restarted: \t%lu\n"
+		   "insert_item_restarted: \t%lu\n"
+		   "paste_into_item_restarted: \t%lu\n"
+		   "cut_from_item_restarted: \t%lu\n"
+		   "delete_solid_item_restarted: \t%lu\n"
+		   "delete_item_restarted: \t%lu\n"
+		   "leaked_oid: \t%lu\n"
+		   "leaves_removable: \t%lu\n",
+		   SF(s_mount_state) == REISERFS_VALID_FS ?
+		   "REISERFS_VALID_FS" : "REISERFS_ERROR_FS",
+		   reiserfs_r5_hash(sb) ? "FORCE_R5 " : "",
+		   reiserfs_rupasov_hash(sb) ? "FORCE_RUPASOV " : "",
+		   reiserfs_tea_hash(sb) ? "FORCE_TEA " : "",
+		   reiserfs_hash_detect(sb) ? "DETECT_HASH " : "",
+		   reiserfs_no_border(sb) ? "NO_BORDER " : "BORDER ",
+		   reiserfs_no_unhashed_relocation(sb) ?
+		   "NO_UNHASHED_RELOCATION " : "",
+		   reiserfs_hashed_relocation(sb) ? "UNHASHED_RELOCATION " : "",	/* NOTE(review): label says UNHASHED for the hashed_relocation option - looks like a copy/paste slip, but it is visible /proc output; confirm before changing */
+		   reiserfs_test4(sb) ? "TEST4 " : "",
+		   have_large_tails(sb) ? "TAILS " : have_small_tails(sb) ?
+		   "SMALL_TAILS " : "NO_TAILS ",
+		   replay_only(sb) ? "REPLAY_ONLY " : "",
+		   convert_reiserfs(sb) ? "CONV " : "",
+		   atomic_read(&r->s_generation_counter),
+		   SF(s_disk_reads), SF(s_disk_writes), SF(s_fix_nodes),
+		   SF(s_do_balance), SF(s_unneeded_left_neighbor),
+		   SF(s_good_search_by_key_reada), SF(s_bmaps),
+		   SF(s_bmaps_without_search), SF(s_direct2indirect),
+		   SF(s_indirect2direct), SFP(max_hash_collisions), SFP(breads),
+		   SFP(bread_miss), SFP(search_by_key),
+		   SFP(search_by_key_fs_changed), SFP(search_by_key_restarted),
+		   SFP(insert_item_restarted), SFP(paste_into_item_restarted),
+		   SFP(cut_from_item_restarted),
+		   SFP(delete_solid_item_restarted), SFP(delete_item_restarted),
+		   SFP(leaked_oid), SFP(leaves_removable));
+
+	return 0;
+}
+/*
+ * show_per_level() - "per-level" file: one header line, then one row of
+ * s_proc_info_data counters for each tree level 0 .. MAX_HEIGHT - 1.
+ */
+static int show_per_level(struct seq_file *m, void *unused)
+{
+	struct super_block *sb = m->private;
+	struct reiserfs_sb_info *r = REISERFS_SB(sb);
+	int level;
+
+	seq_printf(m, "level\t"
+		   " balances"
+		   " [sbk: reads"
+		   " fs_changed"
+		   " restarted]"
+		   " free space"
+		   " items"
+		   " can_remove"
+		   " lnum"
+		   " rnum"
+		   " lbytes"
+		   " rbytes"
+		   " get_neig"
+		   " get_neig_res" " need_l_neig" " need_r_neig" "\n");
+
+	for (level = 0; level < MAX_HEIGHT; ++level) {
+		seq_printf(m, "%i\t"
+			   " %12lu"
+			   " %12lu"
+			   " %12lu"
+			   " %12lu"
+			   " %12lu"
+			   " %12lu"
+			   " %12lu"
+			   " %12li"
+			   " %12li"
+			   " %12li"
+			   " %12li"
+			   " %12lu"
+			   " %12lu"
+			   " %12lu"
+			   " %12lu"
+			   "\n",
+			   level,
+			   SFPL(balance_at),
+			   SFPL(sbk_read_at),
+			   SFPL(sbk_fs_changed),
+			   SFPL(sbk_restarted),
+			   SFPL(free_at),
+			   SFPL(items_at),
+			   SFPL(can_node_be_removed),
+			   SFPL(lnum),
+			   SFPL(rnum),
+			   SFPL(lbytes),
+			   SFPL(rbytes),
+			   SFPL(get_neighbors),
+			   SFPL(get_neighbors_restart),
+			   SFPL(need_l_neighbor), SFPL(need_r_neighbor)
+		    );
+	}
+	return 0;
+}
+/*
+ * show_bitmap() - "bitmap" file: the free_block counter followed by one
+ * header row and one value row of scan_bitmap counters.
+ *
+ * NOTE(review): seven values are printed (call, wait, bmap, retry, stolen,
+ * in_journal_hint, in_journal_nohint) but the header names only six
+ * columns - there is no heading for "call".  Left as is because the text
+ * is user-visible output; confirm before changing.
+ */
+static int show_bitmap(struct seq_file *m, void *unused)
+{
+	struct super_block *sb = m->private;
+	struct reiserfs_sb_info *r = REISERFS_SB(sb);
+
+	seq_printf(m, "free_block: %lu\n"
+		   " scan_bitmap:"
+		   " wait"
+		   " bmap"
+		   " retry"
+		   " stolen"
+		   " journal_hint"
+		   "journal_nohint"
+		   "\n"
+		   " %14lu"
+		   " %14lu"
+		   " %14lu"
+		   " %14lu"
+		   " %14lu"
+		   " %14lu"
+		   " %14lu"
+		   "\n",
+		   SFP(free_block),
+		   SFPF(call),
+		   SFPF(wait),
+		   SFPF(bmap),
+		   SFPF(retry),
+		   SFPF(stolen),
+		   SFPF(in_journal_hint), SFPF(in_journal_nohint));
+
+	return 0;
+}
+/*
+ * show_on_disk_super() - "on-disk-super" file: fields of the super block
+ * copy reachable through sb_info->s_rs, converted from little-endian.
+ */
+static int show_on_disk_super(struct seq_file *m, void *unused)
+{
+	struct super_block *sb = m->private;
+	struct reiserfs_sb_info *sb_info = REISERFS_SB(sb);
+	struct reiserfs_super_block *rs = sb_info->s_rs;
+	int hash_code = DFL(s_hash_function_code);
+	__u32 flags = DJF(s_flags);
+
+	seq_printf(m, "block_count: \t%i\n"
+		   "free_blocks: \t%i\n"
+		   "root_block: \t%i\n"
+		   "blocksize: \t%i\n"
+		   "oid_maxsize: \t%i\n"
+		   "oid_cursize: \t%i\n"
+		   "umount_state: \t%i\n"
+		   "magic: \t%10.10s\n"
+		   "fs_state: \t%i\n"
+		   "hash: \t%s\n"
+		   "tree_height: \t%i\n"
+		   "bmap_nr: \t%i\n"
+		   "version: \t%i\n"
+		   "flags: \t%x[%s]\n"
+		   "reserved_for_journal: \t%i\n",
+		   DFL(s_block_count),
+		   DFL(s_free_blocks),
+		   DFL(s_root_block),
+		   DF(s_blocksize),
+		   DF(s_oid_maxsize),
+		   DF(s_oid_cursize),
+		   DF(s_umount_state),
+		   rs->s_v1.s_magic,
+		   DF(s_fs_state),
+		   hash_code == TEA_HASH ? "tea" :
+		   (hash_code == YURA_HASH) ? "rupasov" :
+		   (hash_code == R5_HASH) ? "r5" :
+		   (hash_code == UNSET_HASH) ? "unset" : "unknown",
+		   DF(s_tree_height),
+		   DF(s_bmap_nr),
+		   DF(s_version), flags, (flags & reiserfs_attrs_cleared)
+		   ? "attrs_cleared" : "", DF(s_reserved_for_journal));
+
+	return 0;
+}
+/*
+ * show_oidmap() - "oidmap" file: prints the object id map as a list of
+ * half-open intervals.
+ *
+ * Entry i is the left bound of interval i; the right bound is entry i + 1,
+ * or MAX_KEY_OBJECTID for the last entry.  Even-numbered intervals are
+ * reported as "used" and summed into total_used, odd-numbered ones as
+ * "free".
+ */
+static int show_oidmap(struct seq_file *m, void *unused)
+{
+	struct super_block *sb = m->private;
+	struct reiserfs_sb_info *sb_info = REISERFS_SB(sb);
+	struct reiserfs_super_block *rs = sb_info->s_rs;
+	unsigned int mapsize = le16_to_cpu(rs->s_v1.s_oid_cursize);
+	unsigned long total_used = 0;
+	int i;
+
+	for (i = 0; i < mapsize; ++i) {
+		__u32 right;
+
+		right = (i == mapsize - 1) ? MAX_KEY_OBJECTID : MAP(i + 1);
+		seq_printf(m, "%s: [ %x .. %x )\n",
+			   (i & 1) ? "free" : "used", MAP(i), right);
+		if (!(i & 1)) {
+			total_used += right - MAP(i);
+		}
+	}
+#if defined( REISERFS_USE_OIDMAPF )
+	if (sb_info->oidmap.use_file && (sb_info->oidmap.mapf != NULL)) {
+		loff_t size = file_inode(sb_info->oidmap.mapf)->i_size;
+		total_used += size / sizeof(reiserfs_oidinterval_d_t);
+	}
+#endif
+	seq_printf(m, "total: \t%i [%i/%i] used: %lu [exact]\n",
+		   mapsize,
+		   mapsize, le16_to_cpu(rs->s_v1.s_oid_maxsize), total_used);
+	return 0;
+}
+/*
+ * show_journal() - "journal" file: on-disk journal parameters (jp_*),
+ * in-core journal state (j_*) and the journal counters from
+ * s_proc_info_data.
+ */
+static int show_journal(struct seq_file *m, void *unused)
+{
+	struct super_block *sb = m->private;
+	struct reiserfs_sb_info *r = REISERFS_SB(sb);
+	struct reiserfs_super_block *rs = r->s_rs;
+	struct journal_params *jp = &rs->s_v1.s_journal;
+	char b[BDEVNAME_SIZE];
+
+	seq_printf(m,		/* on-disk fields */
+		   "jp_journal_1st_block: \t%i\n"
+		   "jp_journal_dev: \t%s[%x]\n"
+		   "jp_journal_size: \t%i\n"
+		   "jp_journal_trans_max: \t%i\n"
+		   "jp_journal_magic: \t%i\n"
+		   "jp_journal_max_batch: \t%i\n"
+		   "jp_journal_max_commit_age: \t%i\n"
+		   "jp_journal_max_trans_age: \t%i\n"
+		   /* incore fields */
+		   "j_1st_reserved_block: \t%i\n"
+		   "j_state: \t%li\n"
+		   "j_trans_id: \t%u\n"
+		   "j_mount_id: \t%lu\n"
+		   "j_start: \t%lu\n"
+		   "j_len: \t%lu\n"
+		   "j_len_alloc: \t%lu\n"
+		   "j_wcount: \t%i\n"
+		   "j_bcount: \t%lu\n"
+		   "j_first_unflushed_offset: \t%lu\n"
+		   "j_last_flush_trans_id: \t%u\n"
+		   "j_trans_start_time: \t%li\n"
+		   "j_list_bitmap_index: \t%i\n"
+		   "j_must_wait: \t%i\n"
+		   "j_next_full_flush: \t%i\n"
+		   "j_next_async_flush: \t%i\n"
+		   "j_cnode_used: \t%i\n" "j_cnode_free: \t%i\n" "\n"
+		   /* reiserfs_proc_info_data_t.journal fields */
+		   "in_journal: \t%12lu\n"
+		   "in_journal_bitmap: \t%12lu\n"
+		   "in_journal_reusable: \t%12lu\n"
+		   "lock_journal: \t%12lu\n"
+		   "lock_journal_wait: \t%12lu\n"
+		   "journal_begin: \t%12lu\n"
+		   "journal_relock_writers: \t%12lu\n"
+		   "journal_relock_wcount: \t%12lu\n"
+		   "mark_dirty: \t%12lu\n"
+		   "mark_dirty_already: \t%12lu\n"
+		   "mark_dirty_notjournal: \t%12lu\n"
+		   "restore_prepared: \t%12lu\n"
+		   "prepare: \t%12lu\n"
+		   "prepare_retry: \t%12lu\n",
+		   DJP(jp_journal_1st_block),
+		   bdevname(SB_JOURNAL(sb)->j_dev_bd, b),
+		   DJP(jp_journal_dev),
+		   DJP(jp_journal_size),
+		   DJP(jp_journal_trans_max),
+		   DJP(jp_journal_magic),
+		   DJP(jp_journal_max_batch),
+		   SB_JOURNAL(sb)->j_max_commit_age,	/* in-core value, printed under the jp_journal_max_commit_age label */
+		   DJP(jp_journal_max_trans_age),
+		   JF(j_1st_reserved_block),
+		   JF(j_state),
+		   JF(j_trans_id),
+		   JF(j_mount_id),
+		   JF(j_start),
+		   JF(j_len),
+		   JF(j_len_alloc),
+		   atomic_read(&r->s_journal->j_wcount),
+		   JF(j_bcount),
+		   JF(j_first_unflushed_offset),
+		   JF(j_last_flush_trans_id),
+		   JF(j_trans_start_time),
+		   JF(j_list_bitmap_index),
+		   JF(j_must_wait),
+		   JF(j_next_full_flush),
+		   JF(j_next_async_flush),
+		   JF(j_cnode_used),
+		   JF(j_cnode_free),
+		   SFPJ(in_journal),
+		   SFPJ(in_journal_bitmap),
+		   SFPJ(in_journal_reusable),
+		   SFPJ(lock_journal),
+		   SFPJ(lock_journal_wait),
+		   SFPJ(journal_being),	/* field is spelled journal_being; shown as "journal_begin" */
+		   SFPJ(journal_relock_writers),
+		   SFPJ(journal_relock_wcount),
+		   SFPJ(mark_dirty),
+		   SFPJ(mark_dirty_already),
+		   SFPJ(mark_dirty_notjournal),
+		   SFPJ(restore_prepared), SFPJ(prepare), SFPJ(prepare_retry)
+	    );
+	return 0;
+}
+/*
+ * r_open() - open handler shared by all files created by add_file().
+ *
+ * The entry's own data is the show_*() routine registered by add_file();
+ * the data of the parent directory is the super block stored by
+ * reiserfs_proc_info_init().  single_open() receives the former as the
+ * show callback and the latter as the seq_file private pointer.
+ */
+static int r_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, PDE_DATA(inode), 
+				proc_get_parent_data(inode));
+}
+
+static const struct file_operations r_file_operations = {
+	.open = r_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static struct proc_dir_entry *proc_info_root = NULL;
+static const char proc_info_root_name[] = "fs/reiserfs";
+/*
+ * add_file() - create proc file @name in the per-super-block directory and
+ * attach @func as its show routine.  The result of proc_create_data() is
+ * not checked.
+ */
+static void add_file(struct super_block *sb, char *name,
+		     int (*func) (struct seq_file *, struct super_block *))
+{
+	proc_create_data(name, 0, REISERFS_SB(sb)->procdir,
+			 &r_file_operations, func);
+}
+/*
+ * reiserfs_proc_info_init() - create the per-device proc directory under
+ * proc_info_root and populate it.
+ *
+ * The directory name is sb->s_id with the first '/' replaced by '!'.
+ * Returns 0 on success; logs a warning and returns 1 if the directory
+ * could not be created.
+ *
+ * NOTE(review): only the first '/' is replaced; confirm that s_id never
+ * contains more than one.
+ */
+int reiserfs_proc_info_init(struct super_block *sb)
+{
+	char b[BDEVNAME_SIZE];
+	char *s;
+
+	/* Some block devices use /'s */
+	strlcpy(b, sb->s_id, BDEVNAME_SIZE);
+	s = strchr(b, '/');
+	if (s)
+		*s = '!';
+
+	spin_lock_init(&__PINFO(sb).lock);
+	REISERFS_SB(sb)->procdir = proc_mkdir_data(b, 0, proc_info_root, sb);
+	if (REISERFS_SB(sb)->procdir) {
+		add_file(sb, "version", show_version);
+		add_file(sb, "super", show_super);
+		add_file(sb, "per-level", show_per_level);
+		add_file(sb, "bitmap", show_bitmap);
+		add_file(sb, "on-disk-super", show_on_disk_super);
+		add_file(sb, "oidmap", show_oidmap);
+		add_file(sb, "journal", show_journal);
+		return 0;
+	}
+	reiserfs_warning(sb, "cannot create /proc/%s/%s",
+			 proc_info_root_name, b);
+	return 1;
+}
+/*
+ * reiserfs_proc_info_done() - remove the per-device proc directory created
+ * by reiserfs_proc_info_init(), if there is one.  Always returns 0.
+ */
+int reiserfs_proc_info_done(struct super_block *sb)
+{
+	struct proc_dir_entry *de = REISERFS_SB(sb)->procdir;
+	if (de) {
+		char b[BDEVNAME_SIZE];
+		char *s;
+
+		/* Some block devices use /'s */
+		strlcpy(b, sb->s_id, BDEVNAME_SIZE);
+		s = strchr(b, '/');
+		if (s)
+			*s = '!';
+
+		remove_proc_subtree(b, proc_info_root);
+		REISERFS_SB(sb)->procdir = NULL;
+	}
+	return 0;
+}
+/*
+ * reiserfs_proc_info_global_init() - create the "fs/reiserfs" proc
+ * directory once.  Returns 0 on success (or if it already exists), 1 after
+ * logging a warning on failure.
+ */
+int reiserfs_proc_info_global_init(void)
+{
+	if (proc_info_root == NULL) {
+		proc_info_root = proc_mkdir(proc_info_root_name, NULL);
+		if (!proc_info_root) {
+			reiserfs_warning(NULL, "cannot create /proc/%s",
+					 proc_info_root_name);
+			return 1;
+		}
+	}
+	return 0;
+}
+/*
+ * reiserfs_proc_info_global_done() - remove the "fs/reiserfs" proc
+ * directory if it was created.  Always returns 0.
+ */
+int reiserfs_proc_info_global_done(void)
+{
+	if (proc_info_root != NULL) {
+		proc_info_root = NULL;
+		remove_proc_entry(proc_info_root_name, NULL);
+	}
+	return 0;
+}
+/*
+ * Revision 1.1.8.2 2001/07/15 17:08:42 god
+ * . use get_super() in procfs.c
+ * . remove remove_save_link() from reiserfs_do_truncate()
+ *
+ * I accept terms and conditions stated in the Legal Agreement
+ * (available at http://www.namesys.com/legalese.html)
+ *
+ * Revision 1.1.8.1 2001/07/11 16:48:50 god
+ * proc info support
+ *
+ * I accept terms and conditions stated in the Legal Agreement
+ * (available at http://www.namesys.com/legalese.html)
+ *
+ */
+
+/*
+ * Make Linus happy.
+ * Local variables: + * c-indentation-style: "K&R" + * mode-name: "LC" + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h new file mode 100644 index 000000000..2adcde137 --- /dev/null +++ b/fs/reiserfs/reiserfs.h @@ -0,0 +1,3411 @@ +/* + * Copyright 1996, 1997, 1998 Hans Reiser, see reiserfs/README for + * licensing and copyright details + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* the 32 bit compat definitions with int argument */ +#define REISERFS_IOC32_UNPACK _IOW(0xCD, 1, int) +#define REISERFS_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define REISERFS_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define REISERFS_IOC32_GETVERSION FS_IOC32_GETVERSION +#define REISERFS_IOC32_SETVERSION FS_IOC32_SETVERSION + +struct reiserfs_journal_list; + +/* bitmasks for i_flags field in reiserfs-specific part of inode */ +typedef enum { + /* + * this says what format of key do all items (but stat data) of + * an object have. If this is set, that format is 3.6 otherwise - 3.5 + */ + i_item_key_version_mask = 0x0001, + + /* + * If this is unset, object has 3.5 stat data, otherwise, + * it has 3.6 stat data with 64bit size, 32bit nlink etc. + */ + i_stat_data_version_mask = 0x0002, + + /* file might need tail packing on close */ + i_pack_on_close_mask = 0x0004, + + /* don't pack tail of file */ + i_nopack_mask = 0x0008, + + /* + * If either of these are set, "safe link" was created for this + * file during truncate or unlink. Safe link is used to avoid + * leakage of disk space on crash with some files open, but unlinked. + */ + i_link_saved_unlink_mask = 0x0010, + i_link_saved_truncate_mask = 0x0020, + + i_has_xattr_dir = 0x0040, + i_data_log = 0x0080, +} reiserfs_inode_flags; + +struct reiserfs_inode_info { + __u32 i_key[4]; /* key is still 4 32 bit integers */ + + /* + * transient inode flags that are never stored on disk. 
Bitmasks + * for this field are defined above. + */ + __u32 i_flags; + + /* offset of first byte stored in direct item. */ + __u32 i_first_direct_byte; + + /* copy of persistent inode flags read from sd_attrs. */ + __u32 i_attrs; + + /* first unused block of a sequence of unused blocks */ + int i_prealloc_block; + int i_prealloc_count; /* length of that sequence */ + + /* per-transaction list of inodes which have preallocated blocks */ + struct list_head i_prealloc_list; + + /* + * new_packing_locality is created; new blocks for the contents + * of this directory should be displaced + */ + unsigned new_packing_locality:1; + + /* + * we use these for fsync or O_SYNC to decide which transaction + * needs to be committed in order for this inode to be properly + * flushed + */ + unsigned int i_trans_id; + + struct reiserfs_journal_list *i_jl; + atomic_t openers; + struct mutex tailpack; +#ifdef CONFIG_REISERFS_FS_XATTR + struct rw_semaphore i_xattr_sem; +#endif +#ifdef CONFIG_QUOTA + struct dquot *i_dquot[MAXQUOTAS]; +#endif + + struct inode vfs_inode; +}; + +typedef enum { + reiserfs_attrs_cleared = 0x00000001, +} reiserfs_super_block_flags; + +/* + * struct reiserfs_super_block accessors/mutators since this is a disk + * structure, it will always be in little endian format. 
+ */ +#define sb_block_count(sbp) (le32_to_cpu((sbp)->s_v1.s_block_count)) +#define set_sb_block_count(sbp,v) ((sbp)->s_v1.s_block_count = cpu_to_le32(v)) +#define sb_free_blocks(sbp) (le32_to_cpu((sbp)->s_v1.s_free_blocks)) +#define set_sb_free_blocks(sbp,v) ((sbp)->s_v1.s_free_blocks = cpu_to_le32(v)) +#define sb_root_block(sbp) (le32_to_cpu((sbp)->s_v1.s_root_block)) +#define set_sb_root_block(sbp,v) ((sbp)->s_v1.s_root_block = cpu_to_le32(v)) + +#define sb_jp_journal_1st_block(sbp) \ + (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_1st_block)) +#define set_sb_jp_journal_1st_block(sbp,v) \ + ((sbp)->s_v1.s_journal.jp_journal_1st_block = cpu_to_le32(v)) +#define sb_jp_journal_dev(sbp) \ + (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_dev)) +#define set_sb_jp_journal_dev(sbp,v) \ + ((sbp)->s_v1.s_journal.jp_journal_dev = cpu_to_le32(v)) +#define sb_jp_journal_size(sbp) \ + (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_size)) +#define set_sb_jp_journal_size(sbp,v) \ + ((sbp)->s_v1.s_journal.jp_journal_size = cpu_to_le32(v)) +#define sb_jp_journal_trans_max(sbp) \ + (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_trans_max)) +#define set_sb_jp_journal_trans_max(sbp,v) \ + ((sbp)->s_v1.s_journal.jp_journal_trans_max = cpu_to_le32(v)) +#define sb_jp_journal_magic(sbp) \ + (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_magic)) +#define set_sb_jp_journal_magic(sbp,v) \ + ((sbp)->s_v1.s_journal.jp_journal_magic = cpu_to_le32(v)) +#define sb_jp_journal_max_batch(sbp) \ + (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_max_batch)) +#define set_sb_jp_journal_max_batch(sbp,v) \ + ((sbp)->s_v1.s_journal.jp_journal_max_batch = cpu_to_le32(v)) +#define sb_jp_jourmal_max_commit_age(sbp) \ + (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_max_commit_age)) +#define set_sb_jp_journal_max_commit_age(sbp,v) \ + ((sbp)->s_v1.s_journal.jp_journal_max_commit_age = cpu_to_le32(v)) + +#define sb_blocksize(sbp) (le16_to_cpu((sbp)->s_v1.s_blocksize)) +#define set_sb_blocksize(sbp,v) 
((sbp)->s_v1.s_blocksize = cpu_to_le16(v)) +#define sb_oid_maxsize(sbp) (le16_to_cpu((sbp)->s_v1.s_oid_maxsize)) +#define set_sb_oid_maxsize(sbp,v) ((sbp)->s_v1.s_oid_maxsize = cpu_to_le16(v)) +#define sb_oid_cursize(sbp) (le16_to_cpu((sbp)->s_v1.s_oid_cursize)) +#define set_sb_oid_cursize(sbp,v) ((sbp)->s_v1.s_oid_cursize = cpu_to_le16(v)) +#define sb_umount_state(sbp) (le16_to_cpu((sbp)->s_v1.s_umount_state)) +#define set_sb_umount_state(sbp,v) ((sbp)->s_v1.s_umount_state = cpu_to_le16(v)) +#define sb_fs_state(sbp) (le16_to_cpu((sbp)->s_v1.s_fs_state)) +#define set_sb_fs_state(sbp,v) ((sbp)->s_v1.s_fs_state = cpu_to_le16(v)) +#define sb_hash_function_code(sbp) \ + (le32_to_cpu((sbp)->s_v1.s_hash_function_code)) +#define set_sb_hash_function_code(sbp,v) \ + ((sbp)->s_v1.s_hash_function_code = cpu_to_le32(v)) +#define sb_tree_height(sbp) (le16_to_cpu((sbp)->s_v1.s_tree_height)) +#define set_sb_tree_height(sbp,v) ((sbp)->s_v1.s_tree_height = cpu_to_le16(v)) +#define sb_bmap_nr(sbp) (le16_to_cpu((sbp)->s_v1.s_bmap_nr)) +#define set_sb_bmap_nr(sbp,v) ((sbp)->s_v1.s_bmap_nr = cpu_to_le16(v)) +#define sb_version(sbp) (le16_to_cpu((sbp)->s_v1.s_version)) +#define set_sb_version(sbp,v) ((sbp)->s_v1.s_version = cpu_to_le16(v)) + +#define sb_mnt_count(sbp) (le16_to_cpu((sbp)->s_mnt_count)) +#define set_sb_mnt_count(sbp, v) ((sbp)->s_mnt_count = cpu_to_le16(v)) + +#define sb_reserved_for_journal(sbp) \ + (le16_to_cpu((sbp)->s_v1.s_reserved_for_journal)) +#define set_sb_reserved_for_journal(sbp,v) \ + ((sbp)->s_v1.s_reserved_for_journal = cpu_to_le16(v)) + +/* LOGGING -- */ + +/* + * These all interelate for performance. + * + * If the journal block count is smaller than n transactions, you lose speed. + * I don't know what n is yet, I'm guessing 8-16. + * + * typical transaction size depends on the application, how often fsync is + * called, and how many metadata blocks you dirty in a 30 second period. 
+ * The more small files (<16k) you use, the larger your transactions will
+ * be.
+ *
+ * If your journal fills faster than dirty buffers get flushed to disk, it
+ * must flush them before allowing the journal to wrap, which slows things
+ * down. If you need high speed meta data updates, the journal should be
+ * big enough to prevent wrapping before dirty meta blocks get to disk.
+ *
+ * If the batch max is smaller than the transaction max, you'll waste space
+ * at the end of the journal because journal_end sets the next transaction
+ * to start at 0 if the next transaction has any chance of wrapping.
+ *
+ * The larger the batch max age, the better the speed, and the more meta
+ * data changes you'll lose after a crash.
+ */
+
+/* don't mess with these for a while */
+/* we have a node size define somewhere in reiserfs_fs.h. -Hans */
+#define JOURNAL_BLOCK_SIZE 4096	/* BUG gotta get rid of this */
+#define JOURNAL_MAX_CNODE 1500	/* max cnodes to allocate. */
+#define JOURNAL_HASH_SIZE 8192
+
+/* number of copies of the bitmaps to have floating. Must be >= 2 */
+#define JOURNAL_NUM_BITMAPS 5
+
+/*
+ * One of these for every block in every transaction
+ * Each one is in two hash tables. First, a hash of the current transaction,
+ * and after journal_end, a hash of all the in memory transactions.
+ * next and prev are used by the current transaction (journal_hash).
+ * hnext and hprev are used by journal_list_hash. If a block is in more
+ * than one transaction, the journal_list_hash links it in multiple times.
+ * This allows flush_journal_list to remove just the cnode belonging to a
+ * given transaction.
+ */ +struct reiserfs_journal_cnode { + struct buffer_head *bh; /* real buffer head */ + struct super_block *sb; /* dev of real buffer head */ + + /* block number of real buffer head, == 0 when buffer on disk */ + __u32 blocknr; + + unsigned long state; + + /* journal list this cnode lives in */ + struct reiserfs_journal_list *jlist; + + struct reiserfs_journal_cnode *next; /* next in transaction list */ + struct reiserfs_journal_cnode *prev; /* prev in transaction list */ + struct reiserfs_journal_cnode *hprev; /* prev in hash list */ + struct reiserfs_journal_cnode *hnext; /* next in hash list */ +}; + +struct reiserfs_bitmap_node { + int id; + char *data; + struct list_head list; +}; + +struct reiserfs_list_bitmap { + struct reiserfs_journal_list *journal_list; + struct reiserfs_bitmap_node **bitmaps; +}; + +/* + * one of these for each transaction. The most important part here is the + * j_realblock. this list of cnodes is used to hash all the blocks in all + * the commits, to mark all the real buffer heads dirty once all the commits + * hit the disk, and to make sure every real block in a transaction is on + * disk before allowing the log area to be overwritten + */ +struct reiserfs_journal_list { + unsigned long j_start; + unsigned long j_state; + unsigned long j_len; + atomic_t j_nonzerolen; + atomic_t j_commit_left; + + /* all commits older than this on disk */ + atomic_t j_older_commits_done; + + struct mutex j_commit_mutex; + unsigned int j_trans_id; + time_t j_timestamp; + struct reiserfs_list_bitmap *j_list_bitmap; + struct buffer_head *j_commit_bh; /* commit buffer head */ + struct reiserfs_journal_cnode *j_realblock; + struct reiserfs_journal_cnode *j_freedlist; /* list of buffers that were freed during this trans. 
free each of these on flush */ + /* time ordered list of all active transactions */ + struct list_head j_list; + + /* + * time ordered list of all transactions we haven't tried + * to flush yet + */ + struct list_head j_working_list; + + /* list of tail conversion targets in need of flush before commit */ + struct list_head j_tail_bh_list; + + /* list of data=ordered buffers in need of flush before commit */ + struct list_head j_bh_list; + int j_refcount; +}; + +struct reiserfs_journal { + struct buffer_head **j_ap_blocks; /* journal blocks on disk */ + /* newest journal block */ + struct reiserfs_journal_cnode *j_last; + + /* oldest journal block. start here for traverse */ + struct reiserfs_journal_cnode *j_first; + + struct block_device *j_dev_bd; + fmode_t j_dev_mode; + + /* first block on s_dev of reserved area journal */ + int j_1st_reserved_block; + + unsigned long j_state; + unsigned int j_trans_id; + unsigned long j_mount_id; + + /* start of current waiting commit (index into j_ap_blocks) */ + unsigned long j_start; + unsigned long j_len; /* length of current waiting commit */ + + /* number of buffers requested by journal_begin() */ + unsigned long j_len_alloc; + + atomic_t j_wcount; /* count of writers for current commit */ + + /* batch count. allows turning X transactions into 1 */ + unsigned long j_bcount; + + /* first unflushed transactions offset */ + unsigned long j_first_unflushed_offset; + + /* last fully flushed journal timestamp */ + unsigned j_last_flush_trans_id; + + struct buffer_head *j_header_bh; + + time_t j_trans_start_time; /* time this transaction started */ + struct mutex j_mutex; + struct mutex j_flush_mutex; + + /* wait for current transaction to finish before starting new one */ + wait_queue_head_t j_join_wait; + + atomic_t j_jlock; /* lock for j_join_wait */ + int j_list_bitmap_index; /* number of next list bitmap to use */ + + /* no more journal begins allowed. 
MUST sleep on j_join_wait */ + int j_must_wait; + + /* next journal_end will flush all journal list */ + int j_next_full_flush; + + /* next journal_end will flush all async commits */ + int j_next_async_flush; + + int j_cnode_used; /* number of cnodes on the used list */ + int j_cnode_free; /* number of cnodes on the free list */ + + /* max number of blocks in a transaction. */ + unsigned int j_trans_max; + + /* max number of blocks to batch into a trans */ + unsigned int j_max_batch; + + /* in seconds, how old can an async commit be */ + unsigned int j_max_commit_age; + + /* in seconds, how old can a transaction be */ + unsigned int j_max_trans_age; + + /* the default for the max commit age */ + unsigned int j_default_max_commit_age; + + struct reiserfs_journal_cnode *j_cnode_free_list; + + /* orig pointer returned from vmalloc */ + struct reiserfs_journal_cnode *j_cnode_free_orig; + + struct reiserfs_journal_list *j_current_jl; + int j_free_bitmap_nodes; + int j_used_bitmap_nodes; + + int j_num_lists; /* total number of active transactions */ + int j_num_work_lists; /* number that need attention from kreiserfsd */ + + /* debugging to make sure things are flushed in order */ + unsigned int j_last_flush_id; + + /* debugging to make sure things are committed in order */ + unsigned int j_last_commit_id; + + struct list_head j_bitmap_nodes; + struct list_head j_dirty_buffers; + spinlock_t j_dirty_buffers_lock; /* protects j_dirty_buffers */ + + /* list of all active transactions */ + struct list_head j_journal_list; + + /* lists that haven't been touched by writeback attempts */ + struct list_head j_working_list; + + /* hash table for real buffer heads in current trans */ + struct reiserfs_journal_cnode *j_hash_table[JOURNAL_HASH_SIZE]; + + /* hash table for all the real buffer heads in all the transactions */ + struct reiserfs_journal_cnode *j_list_hash_table[JOURNAL_HASH_SIZE]; + + /* array of bitmaps to record the deleted blocks */ + struct reiserfs_list_bitmap 
j_list_bitmap[JOURNAL_NUM_BITMAPS]; + + /* list of inodes which have preallocated blocks */ + struct list_head j_prealloc_list; + int j_persistent_trans; + unsigned long j_max_trans_size; + unsigned long j_max_batch_size; + + int j_errno; + + /* when flushing ordered buffers, throttle new ordered writers */ + struct delayed_work j_work; + struct super_block *j_work_sb; + atomic_t j_async_throttle; +}; + +enum journal_state_bits { + J_WRITERS_BLOCKED = 1, /* set when new writers not allowed */ + J_WRITERS_QUEUED, /* set when log is full due to too many writers */ + J_ABORTED, /* set when log is aborted */ +}; + +/* ick. magic string to find desc blocks in the journal */ +#define JOURNAL_DESC_MAGIC "ReIsErLB" + +typedef __u32(*hashf_t) (const signed char *, int); + +struct reiserfs_bitmap_info { + __u32 free_count; +}; + +struct proc_dir_entry; + +#if defined( CONFIG_PROC_FS ) && defined( CONFIG_REISERFS_PROC_INFO ) +typedef unsigned long int stat_cnt_t; +typedef struct reiserfs_proc_info_data { + spinlock_t lock; + int exiting; + int max_hash_collisions; + + stat_cnt_t breads; + stat_cnt_t bread_miss; + stat_cnt_t search_by_key; + stat_cnt_t search_by_key_fs_changed; + stat_cnt_t search_by_key_restarted; + + stat_cnt_t insert_item_restarted; + stat_cnt_t paste_into_item_restarted; + stat_cnt_t cut_from_item_restarted; + stat_cnt_t delete_solid_item_restarted; + stat_cnt_t delete_item_restarted; + + stat_cnt_t leaked_oid; + stat_cnt_t leaves_removable; + + /* + * balances per level. + * Use explicit 5 as MAX_HEIGHT is not visible yet. 
+ */ + stat_cnt_t balance_at[5]; /* XXX */ + /* sbk == search_by_key */ + stat_cnt_t sbk_read_at[5]; /* XXX */ + stat_cnt_t sbk_fs_changed[5]; + stat_cnt_t sbk_restarted[5]; + stat_cnt_t items_at[5]; /* XXX */ + stat_cnt_t free_at[5]; /* XXX */ + stat_cnt_t can_node_be_removed[5]; /* XXX */ + long int lnum[5]; /* XXX */ + long int rnum[5]; /* XXX */ + long int lbytes[5]; /* XXX */ + long int rbytes[5]; /* XXX */ + stat_cnt_t get_neighbors[5]; + stat_cnt_t get_neighbors_restart[5]; + stat_cnt_t need_l_neighbor[5]; + stat_cnt_t need_r_neighbor[5]; + + stat_cnt_t free_block; + struct __scan_bitmap_stats { + stat_cnt_t call; + stat_cnt_t wait; + stat_cnt_t bmap; + stat_cnt_t retry; + stat_cnt_t in_journal_hint; + stat_cnt_t in_journal_nohint; + stat_cnt_t stolen; + } scan_bitmap; + struct __journal_stats { + stat_cnt_t in_journal; + stat_cnt_t in_journal_bitmap; + stat_cnt_t in_journal_reusable; + stat_cnt_t lock_journal; + stat_cnt_t lock_journal_wait; + stat_cnt_t journal_being; + stat_cnt_t journal_relock_writers; + stat_cnt_t journal_relock_wcount; + stat_cnt_t mark_dirty; + stat_cnt_t mark_dirty_already; + stat_cnt_t mark_dirty_notjournal; + stat_cnt_t restore_prepared; + stat_cnt_t prepare; + stat_cnt_t prepare_retry; + } journal; +} reiserfs_proc_info_data_t; +#else +typedef struct reiserfs_proc_info_data { +} reiserfs_proc_info_data_t; +#endif + +/* Number of quota types we support */ +#define REISERFS_MAXQUOTAS 2 + +/* reiserfs union of in-core super block data */ +struct reiserfs_sb_info { + /* Buffer containing the super block */ + struct buffer_head *s_sbh; + + /* Pointer to the on-disk super block in the buffer */ + struct reiserfs_super_block *s_rs; + struct reiserfs_bitmap_info *s_ap_bitmap; + + /* pointer to journal information */ + struct reiserfs_journal *s_journal; + + unsigned short s_mount_state; /* reiserfs state (valid, invalid) */ + + /* Serialize writers access, replace the old bkl */ + struct mutex lock; + + /* Owner of the lock (can be 
recursive) */ + struct task_struct *lock_owner; + + /* Depth of the lock, start from -1 like the bkl */ + int lock_depth; + + struct workqueue_struct *commit_wq; + + /* Comment? -Hans */ + void (*end_io_handler) (struct buffer_head *, int); + + /* + * pointer to function which is used to sort names in directory. + * Set on mount + */ + hashf_t s_hash_function; + + /* reiserfs's mount options are set here */ + unsigned long s_mount_opt; + + /* This is a structure that describes block allocator options */ + struct { + /* Bitfield for enable/disable kind of options */ + unsigned long bits; + + /* + * size started from which we consider file + * to be a large one (in blocks) + */ + unsigned long large_file_size; + + int border; /* percentage of disk, border takes */ + + /* + * Minimal file size (in blocks) starting + * from which we do preallocations + */ + int preallocmin; + + /* + * Number of blocks we try to prealloc when file + * reaches preallocmin size (in blocks) or prealloc_list + is empty. + */ + int preallocsize; + } s_alloc_options; + + /* Comment? -Hans */ + wait_queue_head_t s_wait; + /* increased by one every time the tree gets re-balanced */ + atomic_t s_generation_counter; + + /* File system properties. Currently holds on-disk FS format */ + unsigned long s_properties; + + /* session statistics */ + int s_disk_reads; + int s_disk_writes; + int s_fix_nodes; + int s_do_balance; + int s_unneeded_left_neighbor; + int s_good_search_by_key_reada; + int s_bmaps; + int s_bmaps_without_search; + int s_direct2indirect; + int s_indirect2direct; + + /* + * set up when it's ok for reiserfs_read_inode2() to read from + * disk inode with nlink==0. 
Currently this is only used during + * finish_unfinished() processing at mount time + */ + int s_is_unlinked_ok; + + reiserfs_proc_info_data_t s_proc_info_data; + struct proc_dir_entry *procdir; + + /* amount of blocks reserved for further allocations */ + int reserved_blocks; + + + /* this lock on now only used to protect reserved_blocks variable */ + spinlock_t bitmap_lock; + struct dentry *priv_root; /* root of /.reiserfs_priv */ + struct dentry *xattr_root; /* root of /.reiserfs_priv/xattrs */ + int j_errno; + + int work_queued; /* non-zero delayed work is queued */ + struct delayed_work old_work; /* old transactions flush delayed work */ + spinlock_t old_work_lock; /* protects old_work and work_queued */ + +#ifdef CONFIG_QUOTA + char *s_qf_names[REISERFS_MAXQUOTAS]; + int s_jquota_fmt; +#endif + char *s_jdev; /* Stored jdev for mount option showing */ +#ifdef CONFIG_REISERFS_CHECK + + /* + * Detects whether more than one copy of tb exists per superblock + * as a means of checking whether do_balance is executing + * concurrently against another tree reader/writer on a same + * mount point. + */ + struct tree_balance *cur_tb; +#endif +}; + +/* Definitions of reiserfs on-disk properties: */ +#define REISERFS_3_5 0 +#define REISERFS_3_6 1 +#define REISERFS_OLD_FORMAT 2 + +/* Mount options */ +enum reiserfs_mount_options { + /* large tails will be created in a session */ + REISERFS_LARGETAIL, + /* + * small (for files less than block size) tails will + * be created in a session + */ + REISERFS_SMALLTAIL, + + /* replay journal and return 0. Use by fsck */ + REPLAYONLY, + + /* + * -o conv: causes conversion of old format super block to the + * new format. If not specified - old partition will be dealt + * with in a manner of 3.5.x + */ + REISERFS_CONVERT, + + /* + * -o hash={tea, rupasov, r5, detect} is meant for properly mounting + * reiserfs disks from 3.5.19 or earlier. 99% of the time, this + * option is not required. 
If the normal autodection code can't + * determine which hash to use (because both hashes had the same + * value for a file) use this option to force a specific hash. + * It won't allow you to override the existing hash on the FS, so + * if you have a tea hash disk, and mount with -o hash=rupasov, + * the mount will fail. + */ + FORCE_TEA_HASH, /* try to force tea hash on mount */ + FORCE_RUPASOV_HASH, /* try to force rupasov hash on mount */ + FORCE_R5_HASH, /* try to force r5 hash on mount */ + FORCE_HASH_DETECT, /* try to detect hash function on mount */ + + REISERFS_DATA_LOG, + REISERFS_DATA_ORDERED, + REISERFS_DATA_WRITEBACK, + + /* + * used for testing experimental features, makes benchmarking new + * features with and without more convenient, should never be used by + * users in any code shipped to users (ideally) + */ + + REISERFS_NO_BORDER, + REISERFS_NO_UNHASHED_RELOCATION, + REISERFS_HASHED_RELOCATION, + REISERFS_ATTRS, + REISERFS_XATTRS_USER, + REISERFS_POSIXACL, + REISERFS_EXPOSE_PRIVROOT, + REISERFS_BARRIER_NONE, + REISERFS_BARRIER_FLUSH, + + /* Actions on error */ + REISERFS_ERROR_PANIC, + REISERFS_ERROR_RO, + REISERFS_ERROR_CONTINUE, + + REISERFS_USRQUOTA, /* User quota option specified */ + REISERFS_GRPQUOTA, /* Group quota option specified */ + + REISERFS_TEST1, + REISERFS_TEST2, + REISERFS_TEST3, + REISERFS_TEST4, + REISERFS_UNSUPPORTED_OPT, +}; + +#define reiserfs_r5_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_R5_HASH)) +#define reiserfs_rupasov_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_RUPASOV_HASH)) +#define reiserfs_tea_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_TEA_HASH)) +#define reiserfs_hash_detect(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_HASH_DETECT)) +#define reiserfs_no_border(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_NO_BORDER)) +#define reiserfs_no_unhashed_relocation(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_NO_UNHASHED_RELOCATION)) +#define reiserfs_hashed_relocation(s)
(REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_HASHED_RELOCATION)) +#define reiserfs_test4(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_TEST4)) + +#define have_large_tails(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_LARGETAIL)) +#define have_small_tails(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_SMALLTAIL)) +#define replay_only(s) (REISERFS_SB(s)->s_mount_opt & (1 << REPLAYONLY)) +#define reiserfs_attrs(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ATTRS)) +#define old_format_only(s) (REISERFS_SB(s)->s_properties & (1 << REISERFS_3_5)) +#define convert_reiserfs(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_CONVERT)) +#define reiserfs_data_log(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_LOG)) +#define reiserfs_data_ordered(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_ORDERED)) +#define reiserfs_data_writeback(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_WRITEBACK)) +#define reiserfs_xattrs_user(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_XATTRS_USER)) +#define reiserfs_posixacl(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_POSIXACL)) +#define reiserfs_expose_privroot(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_EXPOSE_PRIVROOT)) +#define reiserfs_xattrs_optional(s) (reiserfs_xattrs_user(s) || reiserfs_posixacl(s)) +#define reiserfs_barrier_none(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_NONE)) +#define reiserfs_barrier_flush(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_FLUSH)) + +#define reiserfs_error_panic(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ERROR_PANIC)) +#define reiserfs_error_ro(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ERROR_RO)) + +void reiserfs_file_buffer(struct buffer_head *bh, int list); +extern struct file_system_type reiserfs_fs_type; +int reiserfs_resize(struct super_block *, unsigned long); + +#define CARRY_ON 0 +#define SCHEDULE_OCCURRED 1 + +#define SB_BUFFER_WITH_SB(s) (REISERFS_SB(s)->s_sbh) +#define SB_JOURNAL(s) (REISERFS_SB(s)->s_journal) 
+#define SB_JOURNAL_1st_RESERVED_BLOCK(s) (SB_JOURNAL(s)->j_1st_reserved_block) +#define SB_JOURNAL_LEN_FREE(s) (SB_JOURNAL(s)->j_journal_len_free) /* NOTE(review): struct reiserfs_journal in this header has no j_journal_len_free member, so this cannot compile if expanded - presumably unused; confirm */ +#define SB_AP_BITMAP(s) (REISERFS_SB(s)->s_ap_bitmap) +/* NOTE(review): the expansion below ends in a dangling '->' and is not a valid expression - presumably unused; confirm the intended member before relying on it */ +#define SB_DISK_JOURNAL_HEAD(s) (SB_JOURNAL(s)->j_header_bh->) + +#define reiserfs_is_journal_aborted(journal) (unlikely (__reiserfs_is_journal_aborted (journal))) +/* returns nonzero when the J_ABORTED bit is set in journal->j_state */ +static inline int __reiserfs_is_journal_aborted(struct reiserfs_journal + *journal) +{ + return test_bit(J_ABORTED, &journal->j_state); +} + +/* + * Locking primitives. The write lock is a per superblock + * special mutex that has properties close to the Big Kernel Lock + * which was used in the previous locking scheme. + */ +void reiserfs_write_lock(struct super_block *s); +void reiserfs_write_unlock(struct super_block *s); +int __must_check reiserfs_write_unlock_nested(struct super_block *s); +void reiserfs_write_lock_nested(struct super_block *s, int depth); + +#ifdef CONFIG_REISERFS_CHECK +void reiserfs_lock_check_recursive(struct super_block *s); +#else +static inline void reiserfs_lock_check_recursive(struct super_block *s) { } +#endif + +/* + * Several mutexes depend on the write lock. + * However sometimes we want to relax the write lock while we hold + * these mutexes, according to the release/reacquire on schedule() + * properties of the Bkl that were used. + * Reiserfs performances and locking were based on this scheme. + * Now that the write lock is a mutex and not the bkl anymore, doing so + * may result in a deadlock: + * + * A acquire write_lock + * A acquire j_commit_mutex + * A release write_lock and wait for something + * B acquire write_lock + * B can't acquire j_commit_mutex and sleep + * A can't acquire write lock anymore + * deadlock + * + * What we do here is avoiding such deadlock by playing the same game + * than the Bkl: if we can't acquire a mutex that depends on the write lock, + * we release the write lock, wait a bit and then retry.
+ * + * The mutexes concerned by this hack are: + * - The commit mutex of a journal list + * - The flush mutex + * - The journal lock + * - The inode mutex + */ +static inline void reiserfs_mutex_lock_safe(struct mutex *m, + struct super_block *s) +{ + int depth; + + depth = reiserfs_write_unlock_nested(s); + mutex_lock(m); + reiserfs_write_lock_nested(s, depth); +} + +static inline void +reiserfs_mutex_lock_nested_safe(struct mutex *m, unsigned int subclass, + struct super_block *s) +{ + int depth; + + depth = reiserfs_write_unlock_nested(s); + mutex_lock_nested(m, subclass); + reiserfs_write_lock_nested(s, depth); +} + +static inline void +reiserfs_down_read_safe(struct rw_semaphore *sem, struct super_block *s) +{ + int depth; + depth = reiserfs_write_unlock_nested(s); + down_read(sem); + reiserfs_write_lock_nested(s, depth); +} + +/* + * When we schedule, we usually want to also release the write lock, + * according to the previous bkl based locking scheme of reiserfs. + */ +static inline void reiserfs_cond_resched(struct super_block *s) +{ + if (need_resched()) { + int depth; + + depth = reiserfs_write_unlock_nested(s); + schedule(); + reiserfs_write_lock_nested(s, depth); + } +} + +struct fid; + +/* + * in reading the #defines, it may help to understand that they employ + * the following abbreviations: + * + * B = Buffer + * I = Item header + * H = Height within the tree (should be changed to LEV) + * N = Number of the item in the node + * STAT = stat data + * DEH = Directory Entry Header + * EC = Entry Count + * E = Entry number + * UL = Unsigned Long + * BLKH = BLocK Header + * UNFM = UNForMatted node + * DC = Disk Child + * P = Path + * + * These #defines are named by concatenating these abbreviations, + * where first comes the arguments, and last comes the return value, + * of the macro. 
+ */ + +#define USE_INODE_GENERATION_COUNTER + +#define REISERFS_PREALLOCATE +#define DISPLACE_NEW_PACKING_LOCALITIES +#define PREALLOCATION_SIZE 9 + +/* n must be power of 2 */ +#define _ROUND_UP(x,n) (((x)+(n)-1u) & ~((n)-1u)) + +/* + * to be ok for alpha and others we have to align structures to 8 byte + * boundary. + * FIXME: do not change 4 by anything else: there is code which relies on that + */ +#define ROUND_UP(x) _ROUND_UP(x,8LL) + +/* + * debug levels. Right now, CONFIG_REISERFS_CHECK means print all debug + * messages. + */ +#define REISERFS_DEBUG_CODE 5 /* extra messages to help find/debug errors */ + +void __reiserfs_warning(struct super_block *s, const char *id, + const char *func, const char *fmt, ...); +#define reiserfs_warning(s, id, fmt, args...) \ + __reiserfs_warning(s, id, __func__, fmt, ##args) +/* assertions handling */ + +/* always check a condition and panic if it's false. */ +#define __RASSERT(cond, scond, format, args...) \ +do { \ + if (!(cond)) \ + reiserfs_panic(NULL, "assertion failure", "(" #cond ") at " \ + __FILE__ ":%i:%s: " format "\n", \ + __LINE__, __func__ , ##args); \ +} while (0) + +#define RASSERT(cond, format, args...) __RASSERT(cond, #cond, format, ##args) + +#if defined( CONFIG_REISERFS_CHECK ) +#define RFALSE(cond, format, args...) __RASSERT(!(cond), "!(" #cond ")", format, ##args) +#else +#define RFALSE( cond, format, args... ) do {;} while( 0 ) +#endif + +#define CONSTF __attribute_const__ +/* + * Disk Data Structures + */ + +/*************************************************************************** + * SUPER BLOCK * + ***************************************************************************/ + +/* + * Structure of super block on disk, a version of which in RAM is often + * accessed as REISERFS_SB(s)->s_rs. The version in RAM is part of a larger + * structure containing fields never written to disk. 
+ */ +#define UNSET_HASH 0 /* Detect hash on disk */ +#define TEA_HASH 1 +#define YURA_HASH 2 +#define R5_HASH 3 +#define DEFAULT_HASH R5_HASH + +struct journal_params { + /* where does journal start from on its device */ + __le32 jp_journal_1st_block; + + /* journal device st_rdev */ + __le32 jp_journal_dev; + + /* size of the journal */ + __le32 jp_journal_size; + + /* max number of blocks in a transaction. */ + __le32 jp_journal_trans_max; + + /* + * random value made on fs creation + * (this was sb_journal_block_count) + */ + __le32 jp_journal_magic; + + /* max number of blocks to batch into a trans */ + __le32 jp_journal_max_batch; + + /* in seconds, how old can an async commit be */ + __le32 jp_journal_max_commit_age; + + /* in seconds, how old can a transaction be */ + __le32 jp_journal_max_trans_age; +}; + +/* this is the super from 3.5.X, where X >= 10 */ +struct reiserfs_super_block_v1 { + __le32 s_block_count; /* blocks count */ + __le32 s_free_blocks; /* free blocks count */ + __le32 s_root_block; /* root block number */ + struct journal_params s_journal; + __le16 s_blocksize; /* block size */ + + /* max size of object id array, see get_objectid() commentary */ + __le16 s_oid_maxsize; + __le16 s_oid_cursize; /* current size of object id array */ + + /* this is set to 1 when filesystem was umounted, to 2 - when not */ + __le16 s_umount_state; + + /* + * reiserfs magic string indicates that file system is reiserfs: + * "ReIsErFs" or "ReIsEr2Fs" or "ReIsEr3Fs" + */ + char s_magic[10]; + + /* + * used by fsck to mark which + * phase of rebuilding is done + */ + __le16 s_fs_state; + /* + * indicates which hash function is being used + * to sort names in a directory + */ + __le32 s_hash_function_code; + __le16 s_tree_height; /* height of disk tree */ + + /* + * amount of bitmap blocks needed to address + * each block of file system + */ + __le16 s_bmap_nr; + + /* + * this field is only reliable on filesystem with non-standard journal + */ +
__le16 s_version; + + /* + * size in blocks of journal area on main device, we need to + * keep after making fs with non-standard journal + */ + __le16 s_reserved_for_journal; +} __attribute__ ((__packed__)); + +#define SB_SIZE_V1 (sizeof(struct reiserfs_super_block_v1)) + +/* this is the on disk super block */ +struct reiserfs_super_block { + struct reiserfs_super_block_v1 s_v1; + __le32 s_inode_generation; + + /* Right now used only by inode-attributes, if enabled */ + __le32 s_flags; + + unsigned char s_uuid[16]; /* filesystem unique identifier */ + unsigned char s_label[16]; /* filesystem volume label */ + __le16 s_mnt_count; /* Count of mounts since last fsck */ + __le16 s_max_mnt_count; /* Maximum mounts before check */ + __le32 s_lastcheck; /* Timestamp of last fsck */ + __le32 s_check_interval; /* Interval between checks */ + + /* + * zero filled by mkreiserfs and reiserfs_convert_objectid_map_v1() + * so any additions must be updated there as well. */ + char s_unused[76]; +} __attribute__ ((__packed__)); + +#define SB_SIZE (sizeof(struct reiserfs_super_block)) + +#define REISERFS_VERSION_1 0 +#define REISERFS_VERSION_2 2 + +/* on-disk super block fields converted to cpu form; the conversion width must match the field width (s_blocksize is a __le16, so it takes le16_to_cpu) */ +#define SB_DISK_SUPER_BLOCK(s) (REISERFS_SB(s)->s_rs) +#define SB_V1_DISK_SUPER_BLOCK(s) (&(SB_DISK_SUPER_BLOCK(s)->s_v1)) +#define SB_BLOCKSIZE(s) \ + le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_blocksize)) +#define SB_BLOCK_COUNT(s) \ + le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_block_count)) +#define SB_FREE_BLOCKS(s) \ + le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_free_blocks)) +#define SB_REISERFS_MAGIC(s) \ + (SB_V1_DISK_SUPER_BLOCK(s)->s_magic) +#define SB_ROOT_BLOCK(s) \ + le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_root_block)) +#define SB_TREE_HEIGHT(s) \ + le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_tree_height)) +#define SB_REISERFS_STATE(s) \ + le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_umount_state)) +#define SB_VERSION(s) le16_to_cpu
((SB_V1_DISK_SUPER_BLOCK(s)->s_version)) +#define SB_BMAP_NR(s) le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_bmap_nr)) + +#define PUT_SB_BLOCK_COUNT(s, val) \ + do { SB_V1_DISK_SUPER_BLOCK(s)->s_block_count = cpu_to_le32(val); } while (0) +#define PUT_SB_FREE_BLOCKS(s, val) \ + do { SB_V1_DISK_SUPER_BLOCK(s)->s_free_blocks = cpu_to_le32(val); } while (0) +#define PUT_SB_ROOT_BLOCK(s, val) \ + do { SB_V1_DISK_SUPER_BLOCK(s)->s_root_block = cpu_to_le32(val); } while (0) +#define PUT_SB_TREE_HEIGHT(s, val) \ + do { SB_V1_DISK_SUPER_BLOCK(s)->s_tree_height = cpu_to_le16(val); } while (0) +#define PUT_SB_REISERFS_STATE(s, val) \ + do { SB_V1_DISK_SUPER_BLOCK(s)->s_umount_state = cpu_to_le16(val); } while (0) +#define PUT_SB_VERSION(s, val) \ + do { SB_V1_DISK_SUPER_BLOCK(s)->s_version = cpu_to_le16(val); } while (0) +#define PUT_SB_BMAP_NR(s, val) \ + do { SB_V1_DISK_SUPER_BLOCK(s)->s_bmap_nr = cpu_to_le16 (val); } while (0) + +#define SB_ONDISK_JP(s) (&SB_V1_DISK_SUPER_BLOCK(s)->s_journal) +#define SB_ONDISK_JOURNAL_SIZE(s) \ + le32_to_cpu ((SB_ONDISK_JP(s)->jp_journal_size)) +#define SB_ONDISK_JOURNAL_1st_BLOCK(s) \ + le32_to_cpu ((SB_ONDISK_JP(s)->jp_journal_1st_block)) +#define SB_ONDISK_JOURNAL_DEVICE(s) \ + le32_to_cpu ((SB_ONDISK_JP(s)->jp_journal_dev)) +#define SB_ONDISK_RESERVED_FOR_JOURNAL(s) \ + le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_reserved_for_journal)) +/* nonzero iff block is at or past SB_JOURNAL_1st_RESERVED_BLOCK(s) and below that start plus the journal size + 1 (or the reserved size on is_reiserfs_jr() filesystems); argument and result are parenthesized so the macro is safe inside larger expressions */ +#define is_block_in_log_or_reserved_area(s, block) \ + ((block) >= SB_JOURNAL_1st_RESERVED_BLOCK(s) \ + && (block) < SB_JOURNAL_1st_RESERVED_BLOCK(s) + \ + ((!is_reiserfs_jr(SB_DISK_SUPER_BLOCK(s)) ? \ + SB_ONDISK_JOURNAL_SIZE(s) + 1 : SB_ONDISK_RESERVED_FOR_JOURNAL(s)))) + +int is_reiserfs_3_5(struct reiserfs_super_block *rs); +int is_reiserfs_3_6(struct reiserfs_super_block *rs); +int is_reiserfs_jr(struct reiserfs_super_block *rs); + +/* + * ReiserFS leaves the first 64k unused, so that partition labels have + * enough space.
If someone wants to write a fancy bootloader that + * needs more than 64k, let us know, and this will be increased in size. + * This number must be larger than the largest block size on any + * platform, or code will break. -Hans + */ +#define REISERFS_DISK_OFFSET_IN_BYTES (64 * 1024) +#define REISERFS_FIRST_BLOCK unused_define +#define REISERFS_JOURNAL_OFFSET_IN_BYTES REISERFS_DISK_OFFSET_IN_BYTES + +/* the spot for the super in versions 3.5 - 3.5.10 (inclusive) */ +#define REISERFS_OLD_DISK_OFFSET_IN_BYTES (8 * 1024) + +/* reiserfs internal error code (used by search_by_key and fix_nodes) */ +#define CARRY_ON 0 +#define REPEAT_SEARCH -1 +#define IO_ERROR -2 +#define NO_DISK_SPACE -3 +#define NO_BALANCING_NEEDED (-4) +#define NO_MORE_UNUSED_CONTIGUOUS_BLOCKS (-5) +#define QUOTA_EXCEEDED -6 + +typedef __u32 b_blocknr_t; +typedef __le32 unp_t; + +struct unfm_nodeinfo { + unp_t unfm_nodenum; + unsigned short unfm_freespace; +}; + +/* there are two formats of keys: 3.5 and 3.6 */ +#define KEY_FORMAT_3_5 0 +#define KEY_FORMAT_3_6 1 + +/* there are two stat datas */ +#define STAT_DATA_V1 0 +#define STAT_DATA_V2 1 + +static inline struct reiserfs_inode_info *REISERFS_I(const struct inode *inode) +{ + return container_of(inode, struct reiserfs_inode_info, vfs_inode); +} + +static inline struct reiserfs_sb_info *REISERFS_SB(const struct super_block *sb) +{ + return sb->s_fs_info; +} + +/* + * Don't trust REISERFS_SB(sb)->s_bmap_nr, it's a u16 + * which overflows on large file systems. + */ +static inline __u32 reiserfs_bmap_count(struct super_block *sb) +{ + return (SB_BLOCK_COUNT(sb) - 1) / (sb->s_blocksize * 8) + 1; +} + +static inline int bmap_would_wrap(unsigned bmap_nr) +{ + return bmap_nr > ((1LL << 16) - 1); +} + +/* + * this says about version of key of all items (but stat data) the + * object consists of + */ +#define get_inode_item_key_version( inode ) \ + ((REISERFS_I(inode)->i_flags & i_item_key_version_mask) ?
KEY_FORMAT_3_6 : KEY_FORMAT_3_5) + +#define set_inode_item_key_version( inode, version ) \ + ({ if((version)==KEY_FORMAT_3_6) \ + REISERFS_I(inode)->i_flags |= i_item_key_version_mask; \ + else \ + REISERFS_I(inode)->i_flags &= ~i_item_key_version_mask; }) + +#define get_inode_sd_version(inode) \ + ((REISERFS_I(inode)->i_flags & i_stat_data_version_mask) ? STAT_DATA_V2 : STAT_DATA_V1) + +#define set_inode_sd_version(inode, version) \ + ({ if((version)==STAT_DATA_V2) \ + REISERFS_I(inode)->i_flags |= i_stat_data_version_mask; \ + else \ + REISERFS_I(inode)->i_flags &= ~i_stat_data_version_mask; }) + +/* + * This is an aggressive tail suppression policy, I am hoping it + * improves our benchmarks. The principle behind it is that percentage + * space saving is what matters, not absolute space saving. This is + * non-intuitive, but it helps to understand it if you consider that the + * cost to access 4 blocks is not much more than the cost to access 1 + * block, if you have to do a seek and rotate. A tail risks a + * non-linear disk access that is significant as a percentage of total + * time cost for a 4 block file and saves an amount of space that is + * less significant as a percentage of space, or so goes the hypothesis. + * -Hans + */ +#define STORE_TAIL_IN_UNFM_S1(n_file_size,n_tail_size,n_block_size) \ +(\ + (!(n_tail_size)) || \ + (((n_tail_size) > MAX_DIRECT_ITEM_LEN(n_block_size)) || \ + ( (n_file_size) >= (n_block_size) * 4 ) || \ + ( ( (n_file_size) >= (n_block_size) * 3 ) && \ + ( (n_tail_size) >= (MAX_DIRECT_ITEM_LEN(n_block_size))/4) ) || \ + ( ( (n_file_size) >= (n_block_size) * 2 ) && \ + ( (n_tail_size) >= (MAX_DIRECT_ITEM_LEN(n_block_size))/2) ) || \ + ( ( (n_file_size) >= (n_block_size) ) && \ + ( (n_tail_size) >= (MAX_DIRECT_ITEM_LEN(n_block_size) * 3)/4) ) ) \ +) + +/* + * Another strategy for tails, this one means only create a tail if all the + * file would fit into one DIRECT item. 
+ * Primary intention for this one is to increase performance by decreasing + * seeking. +*/ +#define STORE_TAIL_IN_UNFM_S2(n_file_size,n_tail_size,n_block_size) \ +(\ + (!(n_tail_size)) || \ + (((n_file_size) > MAX_DIRECT_ITEM_LEN(n_block_size)) ) \ +) + +/* + * values for s_umount_state field + */ +#define REISERFS_VALID_FS 1 +#define REISERFS_ERROR_FS 2 + +/* + * there are 5 item types currently + */ +#define TYPE_STAT_DATA 0 +#define TYPE_INDIRECT 1 +#define TYPE_DIRECT 2 +#define TYPE_DIRENTRY 3 +#define TYPE_MAXTYPE 3 +#define TYPE_ANY 15 /* FIXME: comment is required */ + +/*************************************************************************** + * KEY & ITEM HEAD * + ***************************************************************************/ + +/* * directories use this key as well as old files */ +struct offset_v1 { + __le32 k_offset; + __le32 k_uniqueness; +} __attribute__ ((__packed__)); + +struct offset_v2 { + __le64 v; +} __attribute__ ((__packed__)); + +static inline __u16 offset_v2_k_type(const struct offset_v2 *v2) +{ + __u8 type = le64_to_cpu(v2->v) >> 60; + return (type <= TYPE_MAXTYPE) ? 
type : TYPE_ANY; +} + +static inline void set_offset_v2_k_type(struct offset_v2 *v2, int type) +{ + v2->v = + (v2->v & cpu_to_le64(~0ULL >> 4)) | cpu_to_le64((__u64) type << 60); +} + +static inline loff_t offset_v2_k_offset(const struct offset_v2 *v2) +{ + return le64_to_cpu(v2->v) & (~0ULL >> 4); +} + +static inline void set_offset_v2_k_offset(struct offset_v2 *v2, loff_t offset) +{ + offset &= (~0ULL >> 4); + v2->v = (v2->v & cpu_to_le64(15ULL << 60)) | cpu_to_le64(offset); +} + +/* + * Key of an item determines its location in the S+tree, and + * is composed of 4 components + */ +struct reiserfs_key { + /* packing locality: by default parent directory object id */ + __le32 k_dir_id; + + __le32 k_objectid; /* object identifier */ + union { + struct offset_v1 k_offset_v1; + struct offset_v2 k_offset_v2; + } __attribute__ ((__packed__)) u; +} __attribute__ ((__packed__)); + +struct in_core_key { + /* packing locality: by default parent directory object id */ + __u32 k_dir_id; + __u32 k_objectid; /* object identifier */ + __u64 k_offset; + __u8 k_type; +}; + +struct cpu_key { + struct in_core_key on_disk_key; + int version; + /* 3 in all cases but direct2indirect and indirect2direct conversion */ + int key_length; +}; + +/* + * Our function for comparing keys can compare keys of different + * lengths. It takes as a parameter the length of the keys it is to + * compare. These defines are used in determining what is to be passed + * to it as that parameter. 
+ */ +#define REISERFS_FULL_KEY_LEN 4 +#define REISERFS_SHORT_KEY_LEN 2 + +/* The result of the key compare */ +#define FIRST_GREATER 1 +#define SECOND_GREATER -1 +#define KEYS_IDENTICAL 0 +#define KEY_FOUND 1 +#define KEY_NOT_FOUND 0 + +#define KEY_SIZE (sizeof(struct reiserfs_key)) +#define SHORT_KEY_SIZE (sizeof (__u32) + sizeof (__u32)) + +/* return values for search_by_key and clones */ +#define ITEM_FOUND 1 +#define ITEM_NOT_FOUND 0 +#define ENTRY_FOUND 1 +#define ENTRY_NOT_FOUND 0 +#define DIRECTORY_NOT_FOUND -1 +#define REGULAR_FILE_FOUND -2 +#define DIRECTORY_FOUND -3 +#define BYTE_FOUND 1 +#define BYTE_NOT_FOUND 0 +#define FILE_NOT_FOUND -1 + +#define POSITION_FOUND 1 +#define POSITION_NOT_FOUND 0 + +/* return values for reiserfs_find_entry and search_by_entry_key */ +#define NAME_FOUND 1 +#define NAME_NOT_FOUND 0 +#define GOTO_PREVIOUS_ITEM 2 +#define NAME_FOUND_INVISIBLE 3 + +/* + * Everything in the filesystem is stored as a set of items. The + * item head contains the key of the item, its free space (for + * indirect items) and specifies the location of the item itself + * within the block. + */ + +struct item_head { + /* + * Everything in the tree is found by searching for it based on + * its key. + */ + struct reiserfs_key ih_key; + union { + /* + * The free space in the last unformatted node of an + * indirect item if this is an indirect item. This + * equals 0xFFFF iff this is a direct item or stat data + * item. Note that the key, not this field, is used to + * determine the item type, and thus which field this + * union contains. + */ + __le16 ih_free_space_reserved; + + /* + * Iff this is a directory item, this field equals the + * number of directory entries in the directory item. + */ + __le16 ih_entry_count; + } __attribute__ ((__packed__)) u; + __le16 ih_item_len; /* total size of the item body */ + + /* an offset to the item body within the block */ + __le16 ih_item_location; + + /* + * 0 for all old items, 2 for new ones. 
Highest bit is set by fsck + * temporary, cleaned after all done + */ + __le16 ih_version; +} __attribute__ ((__packed__)); +/* size of item header */ +#define IH_SIZE (sizeof(struct item_head)) + +#define ih_free_space(ih) le16_to_cpu((ih)->u.ih_free_space_reserved) +#define ih_version(ih) le16_to_cpu((ih)->ih_version) +#define ih_entry_count(ih) le16_to_cpu((ih)->u.ih_entry_count) +#define ih_location(ih) le16_to_cpu((ih)->ih_item_location) +#define ih_item_len(ih) le16_to_cpu((ih)->ih_item_len) + +#define put_ih_free_space(ih, val) do { (ih)->u.ih_free_space_reserved = cpu_to_le16(val); } while(0) +#define put_ih_version(ih, val) do { (ih)->ih_version = cpu_to_le16(val); } while (0) +#define put_ih_entry_count(ih, val) do { (ih)->u.ih_entry_count = cpu_to_le16(val); } while (0) +#define put_ih_location(ih, val) do { (ih)->ih_item_location = cpu_to_le16(val); } while (0) +#define put_ih_item_len(ih, val) do { (ih)->ih_item_len = cpu_to_le16(val); } while (0) + +#define unreachable_item(ih) (ih_version(ih) & (1 << 15)) + +#define get_ih_free_space(ih) (ih_version (ih) == KEY_FORMAT_3_6 ? 0 : ih_free_space (ih)) +#define set_ih_free_space(ih,val) put_ih_free_space((ih), ((ih_version(ih) == KEY_FORMAT_3_6) ? 0 : (val))) + +/* + * these operate on indirect items, where you've got an array of ints + * at a possibly unaligned location. These are a noop on ia32 + * + * p is the array of __u32, i is the index into the array, v is the value + * to store there. 
+ */ +#define get_block_num(p, i) get_unaligned_le32((p) + (i)) +#define put_block_num(p, i, v) put_unaligned_le32((v), (p) + (i)) + +/* * in old version uniqueness field shows key type */ +#define V1_SD_UNIQUENESS 0 +#define V1_INDIRECT_UNIQUENESS 0xfffffffe +#define V1_DIRECT_UNIQUENESS 0xffffffff +#define V1_DIRENTRY_UNIQUENESS 500 +#define V1_ANY_UNIQUENESS 555 /* FIXME: comment is required */ + +/* here are conversion routines */ +static inline int uniqueness2type(__u32 uniqueness) CONSTF; +static inline int uniqueness2type(__u32 uniqueness) +{ + switch ((int)uniqueness) { + case V1_SD_UNIQUENESS: + return TYPE_STAT_DATA; + case V1_INDIRECT_UNIQUENESS: + return TYPE_INDIRECT; + case V1_DIRECT_UNIQUENESS: + return TYPE_DIRECT; + case V1_DIRENTRY_UNIQUENESS: + return TYPE_DIRENTRY; + case V1_ANY_UNIQUENESS: + default: + return TYPE_ANY; + } +} + +static inline __u32 type2uniqueness(int type) CONSTF; +static inline __u32 type2uniqueness(int type) +{ + switch (type) { + case TYPE_STAT_DATA: + return V1_SD_UNIQUENESS; + case TYPE_INDIRECT: + return V1_INDIRECT_UNIQUENESS; + case TYPE_DIRECT: + return V1_DIRECT_UNIQUENESS; + case TYPE_DIRENTRY: + return V1_DIRENTRY_UNIQUENESS; + case TYPE_ANY: + default: + return V1_ANY_UNIQUENESS; + } +} + +/* + * key is pointer to on disk key which is stored in le, result is cpu, + * there is no way to get version of object from key, so, provide + * version to these defines + */ +static inline loff_t le_key_k_offset(int version, + const struct reiserfs_key *key) +{ + return (version == KEY_FORMAT_3_5) ? 
+ le32_to_cpu(key->u.k_offset_v1.k_offset) : + offset_v2_k_offset(&(key->u.k_offset_v2)); +} + +static inline loff_t le_ih_k_offset(const struct item_head *ih) +{ + return le_key_k_offset(ih_version(ih), &(ih->ih_key)); +} + +static inline loff_t le_key_k_type(int version, const struct reiserfs_key *key) +{ + if (version == KEY_FORMAT_3_5) { + loff_t val = le32_to_cpu(key->u.k_offset_v1.k_uniqueness); + return uniqueness2type(val); + } else + return offset_v2_k_type(&(key->u.k_offset_v2)); +} + +static inline loff_t le_ih_k_type(const struct item_head *ih) +{ + return le_key_k_type(ih_version(ih), &(ih->ih_key)); +} + +static inline void set_le_key_k_offset(int version, struct reiserfs_key *key, + loff_t offset) +{ + if (version == KEY_FORMAT_3_5) + key->u.k_offset_v1.k_offset = cpu_to_le32(offset); + else + set_offset_v2_k_offset(&key->u.k_offset_v2, offset); +} + +static inline void add_le_key_k_offset(int version, struct reiserfs_key *key, + loff_t offset) +{ + set_le_key_k_offset(version, key, + le_key_k_offset(version, key) + offset); +} + +static inline void add_le_ih_k_offset(struct item_head *ih, loff_t offset) +{ + add_le_key_k_offset(ih_version(ih), &(ih->ih_key), offset); +} + +static inline void set_le_ih_k_offset(struct item_head *ih, loff_t offset) +{ + set_le_key_k_offset(ih_version(ih), &(ih->ih_key), offset); +} + +static inline void set_le_key_k_type(int version, struct reiserfs_key *key, + int type) +{ + if (version == KEY_FORMAT_3_5) { + type = type2uniqueness(type); + key->u.k_offset_v1.k_uniqueness = cpu_to_le32(type); + } else + set_offset_v2_k_type(&key->u.k_offset_v2, type); +} + +static inline void set_le_ih_k_type(struct item_head *ih, int type) +{ + set_le_key_k_type(ih_version(ih), &(ih->ih_key), type); +} + +static inline int is_direntry_le_key(int version, struct reiserfs_key *key) +{ + return le_key_k_type(version, key) == TYPE_DIRENTRY; +} + +static inline int is_direct_le_key(int version, struct reiserfs_key *key) +{ + return 
le_key_k_type(version, key) == TYPE_DIRECT; +} + +static inline int is_indirect_le_key(int version, struct reiserfs_key *key) +{ + return le_key_k_type(version, key) == TYPE_INDIRECT; +} + +static inline int is_statdata_le_key(int version, struct reiserfs_key *key) +{ + return le_key_k_type(version, key) == TYPE_STAT_DATA; +} + +/* item header has version. */ +static inline int is_direntry_le_ih(struct item_head *ih) +{ + return is_direntry_le_key(ih_version(ih), &ih->ih_key); +} + +static inline int is_direct_le_ih(struct item_head *ih) +{ + return is_direct_le_key(ih_version(ih), &ih->ih_key); +} + +static inline int is_indirect_le_ih(struct item_head *ih) +{ + return is_indirect_le_key(ih_version(ih), &ih->ih_key); +} + +static inline int is_statdata_le_ih(struct item_head *ih) +{ + return is_statdata_le_key(ih_version(ih), &ih->ih_key); +} + +/* key is pointer to cpu key, result is cpu */ +static inline loff_t cpu_key_k_offset(const struct cpu_key *key) +{ + return key->on_disk_key.k_offset; +} + +static inline loff_t cpu_key_k_type(const struct cpu_key *key) +{ + return key->on_disk_key.k_type; +} + +static inline void set_cpu_key_k_offset(struct cpu_key *key, loff_t offset) +{ + key->on_disk_key.k_offset = offset; +} + +static inline void set_cpu_key_k_type(struct cpu_key *key, int type) +{ + key->on_disk_key.k_type = type; +} + +static inline void cpu_key_k_offset_dec(struct cpu_key *key) +{ + key->on_disk_key.k_offset--; +} + +#define is_direntry_cpu_key(key) (cpu_key_k_type (key) == TYPE_DIRENTRY) +#define is_direct_cpu_key(key) (cpu_key_k_type (key) == TYPE_DIRECT) +#define is_indirect_cpu_key(key) (cpu_key_k_type (key) == TYPE_INDIRECT) +#define is_statdata_cpu_key(key) (cpu_key_k_type (key) == TYPE_STAT_DATA) + +/* are these used ? 
*/ +#define is_direntry_cpu_ih(ih) (is_direntry_cpu_key (&((ih)->ih_key))) +#define is_direct_cpu_ih(ih) (is_direct_cpu_key (&((ih)->ih_key))) +#define is_indirect_cpu_ih(ih) (is_indirect_cpu_key (&((ih)->ih_key))) +#define is_statdata_cpu_ih(ih) (is_statdata_cpu_key (&((ih)->ih_key))) + +#define I_K_KEY_IN_ITEM(ih, key, n_blocksize) \ + (!COMP_SHORT_KEYS(ih, key) && \ + I_OFF_BYTE_IN_ITEM(ih, k_offset(key), n_blocksize)) + +/* + * maximal length of item: the block size less one block head + * (BLKH_SIZE) and one item header (IH_SIZE). The argument is + * parenthesized so that compound expressions expand correctly. + */ +#define MAX_ITEM_LEN(block_size) ((block_size) - BLKH_SIZE - IH_SIZE) +#define MIN_ITEM_LEN 1 + +/* object identifier for root dir */ +#define REISERFS_ROOT_OBJECTID 2 +#define REISERFS_ROOT_PARENT_OBJECTID 1 + +extern struct reiserfs_key root_key; + +/* + * Picture represents a leaf of the S+tree + * ______________________________________________________ + * | | Array of | | | + * |Block | Object-Item | F r e e | Objects- | + * | head | Headers | S p a c e | Items | + * |______|_______________|___________________|___________| + */ + +/* + * Header of a disk block. More precisely, header of a formatted leaf + * or internal node, and not the header of an unformatted node. + */ +struct block_head { + __le16 blk_level; /* Level of a block in the tree. */ + __le16 blk_nr_item; /* Number of keys/items in a block. */ + __le16 blk_free_space; /* Block free space in bytes.
*/ + __le16 blk_reserved; + /* dump this in v4/planA */ + + /* kept only for compatibility */ + struct reiserfs_key blk_right_delim_key; +}; + +#define BLKH_SIZE (sizeof(struct block_head)) +#define blkh_level(p_blkh) (le16_to_cpu((p_blkh)->blk_level)) +#define blkh_nr_item(p_blkh) (le16_to_cpu((p_blkh)->blk_nr_item)) +#define blkh_free_space(p_blkh) (le16_to_cpu((p_blkh)->blk_free_space)) +#define blkh_reserved(p_blkh) (le16_to_cpu((p_blkh)->blk_reserved)) +#define set_blkh_level(p_blkh,val) ((p_blkh)->blk_level = cpu_to_le16(val)) +#define set_blkh_nr_item(p_blkh,val) ((p_blkh)->blk_nr_item = cpu_to_le16(val)) +#define set_blkh_free_space(p_blkh,val) ((p_blkh)->blk_free_space = cpu_to_le16(val)) +#define set_blkh_reserved(p_blkh,val) ((p_blkh)->blk_reserved = cpu_to_le16(val)) +#define blkh_right_delim_key(p_blkh) ((p_blkh)->blk_right_delim_key) +#define set_blkh_right_delim_key(p_blkh,val) ((p_blkh)->blk_right_delim_key = val) + +/* values for blk_level field of the struct block_head */ + +/* + * When node gets removed from the tree its blk_level is set to FREE_LEVEL. + * It is then used to see whether the node is still in the tree + */ +#define FREE_LEVEL 0 + +#define DISK_LEAF_NODE_LEVEL 1 /* Leaf node level. */ + +/* + * Given the buffer head of a formatted node, resolve to the + * block head of that node. + */ +#define B_BLK_HEAD(bh) ((struct block_head *)((bh)->b_data)) +/* Number of items that are in buffer. */ +#define B_NR_ITEMS(bh) (blkh_nr_item(B_BLK_HEAD(bh))) +#define B_LEVEL(bh) (blkh_level(B_BLK_HEAD(bh))) +#define B_FREE_SPACE(bh) (blkh_free_space(B_BLK_HEAD(bh))) + +#define PUT_B_NR_ITEMS(bh, val) do { set_blkh_nr_item(B_BLK_HEAD(bh), val); } while (0) +#define PUT_B_LEVEL(bh, val) do { set_blkh_level(B_BLK_HEAD(bh), val); } while (0) +#define PUT_B_FREE_SPACE(bh, val) do { set_blkh_free_space(B_BLK_HEAD(bh), val); } while (0) + +/* Get right delimiting key. 
-- little endian */ +#define B_PRIGHT_DELIM_KEY(bh) (&(blkh_right_delim_key(B_BLK_HEAD(bh)))) + +/* Does the buffer contain a disk leaf. */ +#define B_IS_ITEMS_LEVEL(bh) (B_LEVEL(bh) == DISK_LEAF_NODE_LEVEL) + +/* + * Does the buffer contain a disk internal node, i.e. is its level above + * the leaf level and no higher than MAX_HEIGHT + */ +#define B_IS_KEYS_LEVEL(bh) (B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL \ + && B_LEVEL(bh) <= MAX_HEIGHT) + +/*************************************************************************** + * STAT DATA * + ***************************************************************************/ + +/* + * old stat data is 32 bytes long. We are going to distinguish new one by + * different size +*/ +struct stat_data_v1 { + __le16 sd_mode; /* file type, permissions */ + __le16 sd_nlink; /* number of hard links */ + __le16 sd_uid; /* owner */ + __le16 sd_gid; /* group */ + __le32 sd_size; /* file size */ + __le32 sd_atime; /* time of last access */ + __le32 sd_mtime; /* time file was last modified */ + + /* + * time inode (stat data) was last changed + * (except changes to sd_atime and sd_mtime) + */ + __le32 sd_ctime; + union { + __le32 sd_rdev; + __le32 sd_blocks; /* number of blocks file uses */ + } __attribute__ ((__packed__)) u; + + /* + * first byte of file which is stored in a direct item: except that if + * it equals 1 it is a symlink and if it equals ~(__u32)0 there is no + * direct item. The existence of this field really grates on me. + * Let's replace it with a macro based on sd_size and our tail + * suppression policy. Someday.
-Hans + */ + __le32 sd_first_direct_byte; +} __attribute__ ((__packed__)); + +#define SD_V1_SIZE (sizeof(struct stat_data_v1)) +#define stat_data_v1(ih) (ih_version (ih) == KEY_FORMAT_3_5) +#define sd_v1_mode(sdp) (le16_to_cpu((sdp)->sd_mode)) +#define set_sd_v1_mode(sdp,v) ((sdp)->sd_mode = cpu_to_le16(v)) +#define sd_v1_nlink(sdp) (le16_to_cpu((sdp)->sd_nlink)) +#define set_sd_v1_nlink(sdp,v) ((sdp)->sd_nlink = cpu_to_le16(v)) +#define sd_v1_uid(sdp) (le16_to_cpu((sdp)->sd_uid)) +#define set_sd_v1_uid(sdp,v) ((sdp)->sd_uid = cpu_to_le16(v)) +#define sd_v1_gid(sdp) (le16_to_cpu((sdp)->sd_gid)) +#define set_sd_v1_gid(sdp,v) ((sdp)->sd_gid = cpu_to_le16(v)) +#define sd_v1_size(sdp) (le32_to_cpu((sdp)->sd_size)) +#define set_sd_v1_size(sdp,v) ((sdp)->sd_size = cpu_to_le32(v)) +#define sd_v1_atime(sdp) (le32_to_cpu((sdp)->sd_atime)) +#define set_sd_v1_atime(sdp,v) ((sdp)->sd_atime = cpu_to_le32(v)) +#define sd_v1_mtime(sdp) (le32_to_cpu((sdp)->sd_mtime)) +#define set_sd_v1_mtime(sdp,v) ((sdp)->sd_mtime = cpu_to_le32(v)) +#define sd_v1_ctime(sdp) (le32_to_cpu((sdp)->sd_ctime)) +#define set_sd_v1_ctime(sdp,v) ((sdp)->sd_ctime = cpu_to_le32(v)) +#define sd_v1_rdev(sdp) (le32_to_cpu((sdp)->u.sd_rdev)) +#define set_sd_v1_rdev(sdp,v) ((sdp)->u.sd_rdev = cpu_to_le32(v)) +#define sd_v1_blocks(sdp) (le32_to_cpu((sdp)->u.sd_blocks)) +#define set_sd_v1_blocks(sdp,v) ((sdp)->u.sd_blocks = cpu_to_le32(v)) +#define sd_v1_first_direct_byte(sdp) \ + (le32_to_cpu((sdp)->sd_first_direct_byte)) +#define set_sd_v1_first_direct_byte(sdp,v) \ + ((sdp)->sd_first_direct_byte = cpu_to_le32(v)) + +/* inode flags stored in sd_attrs (nee sd_reserved) */ + +/* + * we want common flags to have the same values as in ext2, + * so chattr(1) will work without problems + */ +#define REISERFS_IMMUTABLE_FL FS_IMMUTABLE_FL +#define REISERFS_APPEND_FL FS_APPEND_FL +#define REISERFS_SYNC_FL FS_SYNC_FL +#define REISERFS_NOATIME_FL FS_NOATIME_FL +#define REISERFS_NODUMP_FL FS_NODUMP_FL +#define 
REISERFS_SECRM_FL FS_SECRM_FL +#define REISERFS_UNRM_FL FS_UNRM_FL +#define REISERFS_COMPR_FL FS_COMPR_FL +#define REISERFS_NOTAIL_FL FS_NOTAIL_FL + +/* persistent flags that file inherits from the parent directory */ +#define REISERFS_INHERIT_MASK ( REISERFS_IMMUTABLE_FL | \ + REISERFS_SYNC_FL | \ + REISERFS_NOATIME_FL | \ + REISERFS_NODUMP_FL | \ + REISERFS_SECRM_FL | \ + REISERFS_COMPR_FL | \ + REISERFS_NOTAIL_FL ) + +/* + * Stat Data on disk (reiserfs version of UFS disk inode minus the + * address blocks) + */ +struct stat_data { + __le16 sd_mode; /* file type, permissions */ + __le16 sd_attrs; /* persistent inode flags */ + __le32 sd_nlink; /* number of hard links */ + __le64 sd_size; /* file size */ + __le32 sd_uid; /* owner */ + __le32 sd_gid; /* group */ + __le32 sd_atime; /* time of last access */ + __le32 sd_mtime; /* time file was last modified */ + + /* + * time inode (stat data) was last changed + * (except changes to sd_atime and sd_mtime) + */ + __le32 sd_ctime; + __le32 sd_blocks; + union { + __le32 sd_rdev; + __le32 sd_generation; + } __attribute__ ((__packed__)) u; +} __attribute__ ((__packed__)); + +/* this is 44 bytes long */ +#define SD_SIZE (sizeof(struct stat_data)) +#define SD_V2_SIZE SD_SIZE +#define stat_data_v2(ih) (ih_version (ih) == KEY_FORMAT_3_6) +#define sd_v2_mode(sdp) (le16_to_cpu((sdp)->sd_mode)) +#define set_sd_v2_mode(sdp,v) ((sdp)->sd_mode = cpu_to_le16(v)) +/* sd_reserved */ +/* set_sd_reserved */ +#define sd_v2_nlink(sdp) (le32_to_cpu((sdp)->sd_nlink)) +#define set_sd_v2_nlink(sdp,v) ((sdp)->sd_nlink = cpu_to_le32(v)) +#define sd_v2_size(sdp) (le64_to_cpu((sdp)->sd_size)) +#define set_sd_v2_size(sdp,v) ((sdp)->sd_size = cpu_to_le64(v)) +#define sd_v2_uid(sdp) (le32_to_cpu((sdp)->sd_uid)) +#define set_sd_v2_uid(sdp,v) ((sdp)->sd_uid = cpu_to_le32(v)) +#define sd_v2_gid(sdp) (le32_to_cpu((sdp)->sd_gid)) +#define set_sd_v2_gid(sdp,v) ((sdp)->sd_gid = cpu_to_le32(v)) +#define sd_v2_atime(sdp) (le32_to_cpu((sdp)->sd_atime)) 
+#define set_sd_v2_atime(sdp,v) ((sdp)->sd_atime = cpu_to_le32(v)) +#define sd_v2_mtime(sdp) (le32_to_cpu((sdp)->sd_mtime)) +#define set_sd_v2_mtime(sdp,v) ((sdp)->sd_mtime = cpu_to_le32(v)) +#define sd_v2_ctime(sdp) (le32_to_cpu((sdp)->sd_ctime)) +#define set_sd_v2_ctime(sdp,v) ((sdp)->sd_ctime = cpu_to_le32(v)) +#define sd_v2_blocks(sdp) (le32_to_cpu((sdp)->sd_blocks)) +#define set_sd_v2_blocks(sdp,v) ((sdp)->sd_blocks = cpu_to_le32(v)) +#define sd_v2_rdev(sdp) (le32_to_cpu((sdp)->u.sd_rdev)) +#define set_sd_v2_rdev(sdp,v) ((sdp)->u.sd_rdev = cpu_to_le32(v)) +#define sd_v2_generation(sdp) (le32_to_cpu((sdp)->u.sd_generation)) +#define set_sd_v2_generation(sdp,v) ((sdp)->u.sd_generation = cpu_to_le32(v)) +#define sd_v2_attrs(sdp) (le16_to_cpu((sdp)->sd_attrs)) +#define set_sd_v2_attrs(sdp,v) ((sdp)->sd_attrs = cpu_to_le16(v)) + +/*************************************************************************** + * DIRECTORY STRUCTURE * + ***************************************************************************/ +/* + * Picture represents the structure of directory items + * ________________________________________________ + * | Array of | | | | | | + * | directory |N-1| N-2 | .... | 1st |0th| + * | entry headers | | | | | | + * |_______________|___|_____|________|_______|___| + * <---- directory entries ------> + * + * First directory item has k_offset component 1. We store "." and ".." + * in one item, always, we never split "." and ".." into differing + * items. This makes, among other things, the code for removing + * directories simpler. + */ +#define SD_OFFSET 0 +#define SD_UNIQUENESS 0 +#define DOT_OFFSET 1 +#define DOT_DOT_OFFSET 2 +#define DIRENTRY_UNIQUENESS 500 + +#define FIRST_ITEM_OFFSET 1 + +/* + * Q: How to get key of object pointed to by entry from entry? + * + * A: Each directory entry has its header. 
This header has deh_dir_id + * and deh_objectid fields, those are key of object, entry points to + */ + +/* + * NOT IMPLEMENTED: + * Directory will someday contain stat data of object + */ + +struct reiserfs_de_head { + __le32 deh_offset; /* third component of the directory entry key */ + + /* + * objectid of the parent directory of the object, that is referenced + * by directory entry + */ + __le32 deh_dir_id; + + /* objectid of the object, that is referenced by directory entry */ + __le32 deh_objectid; + __le16 deh_location; /* offset of name in the whole item */ + + /* + * whether 1) entry contains stat data (for future), and + * 2) whether entry is hidden (unlinked) + */ + __le16 deh_state; +} __attribute__ ((__packed__)); +#define DEH_SIZE sizeof(struct reiserfs_de_head) +#define deh_offset(p_deh) (le32_to_cpu((p_deh)->deh_offset)) +#define deh_dir_id(p_deh) (le32_to_cpu((p_deh)->deh_dir_id)) +#define deh_objectid(p_deh) (le32_to_cpu((p_deh)->deh_objectid)) +#define deh_location(p_deh) (le16_to_cpu((p_deh)->deh_location)) +#define deh_state(p_deh) (le16_to_cpu((p_deh)->deh_state)) + +#define put_deh_offset(p_deh,v) ((p_deh)->deh_offset = cpu_to_le32((v))) +#define put_deh_dir_id(p_deh,v) ((p_deh)->deh_dir_id = cpu_to_le32((v))) +#define put_deh_objectid(p_deh,v) ((p_deh)->deh_objectid = cpu_to_le32((v))) +#define put_deh_location(p_deh,v) ((p_deh)->deh_location = cpu_to_le16((v))) +#define put_deh_state(p_deh,v) ((p_deh)->deh_state = cpu_to_le16((v))) + +/* empty directory contains two entries "." and ".." 
and their headers */ +#define EMPTY_DIR_SIZE \ +(DEH_SIZE * 2 + ROUND_UP (strlen (".")) + ROUND_UP (strlen (".."))) + +/* old format directories have this size when empty */ +#define EMPTY_DIR_SIZE_V1 (DEH_SIZE * 2 + 3) + +#define DEH_Statdata 0 /* not used now */ +#define DEH_Visible 2 + +/* 64 bit systems (and the S/390) need to be aligned explicitly -jdm */ +#if BITS_PER_LONG == 64 || defined(__s390__) || defined(__hppa__) +# define ADDR_UNALIGNED_BITS (3) +#endif + +/* + * These are only used to manipulate deh_state. + * Because of this, we'll use the ext2_ bit routines, + * since they are little endian + */ +#ifdef ADDR_UNALIGNED_BITS + +# define aligned_address(addr) ((void *)((long)(addr) & ~((1UL << ADDR_UNALIGNED_BITS) - 1))) +# define unaligned_offset(addr) (((int)((long)(addr) & ((1 << ADDR_UNALIGNED_BITS) - 1))) << 3) + +# define set_bit_unaligned(nr, addr) \ + __test_and_set_bit_le((nr) + unaligned_offset(addr), aligned_address(addr)) +# define clear_bit_unaligned(nr, addr) \ + __test_and_clear_bit_le((nr) + unaligned_offset(addr), aligned_address(addr)) +# define test_bit_unaligned(nr, addr) \ + test_bit_le((nr) + unaligned_offset(addr), aligned_address(addr)) + +#else + +# define set_bit_unaligned(nr, addr) __test_and_set_bit_le(nr, addr) +# define clear_bit_unaligned(nr, addr) __test_and_clear_bit_le(nr, addr) +# define test_bit_unaligned(nr, addr) test_bit_le(nr, addr) + +#endif + +#define mark_de_with_sd(deh) set_bit_unaligned (DEH_Statdata, &((deh)->deh_state)) +#define mark_de_without_sd(deh) clear_bit_unaligned (DEH_Statdata, &((deh)->deh_state)) +#define mark_de_visible(deh) set_bit_unaligned (DEH_Visible, &((deh)->deh_state)) +#define mark_de_hidden(deh) clear_bit_unaligned (DEH_Visible, &((deh)->deh_state)) + +#define de_with_sd(deh) test_bit_unaligned (DEH_Statdata, &((deh)->deh_state)) +#define de_visible(deh) test_bit_unaligned (DEH_Visible, &((deh)->deh_state)) +#define de_hidden(deh) !test_bit_unaligned (DEH_Visible, 
&((deh)->deh_state)) + +extern void make_empty_dir_item_v1(char *body, __le32 dirid, __le32 objid, + __le32 par_dirid, __le32 par_objid); +extern void make_empty_dir_item(char *body, __le32 dirid, __le32 objid, + __le32 par_dirid, __le32 par_objid); + +/* two entries per block (at least) */ +#define REISERFS_MAX_NAME(block_size) 255 + +/* + * this structure is used for operations on directory entries. It is + * not a disk structure. + * + * When reiserfs_find_entry or search_by_entry_key find directory + * entry, they return filled reiserfs_dir_entry structure + */ +struct reiserfs_dir_entry { + struct buffer_head *de_bh; + int de_item_num; + struct item_head *de_ih; + int de_entry_num; + struct reiserfs_de_head *de_deh; + int de_entrylen; + int de_namelen; + char *de_name; + unsigned long *de_gen_number_bit_string; + + __u32 de_dir_id; + __u32 de_objectid; + + struct cpu_key de_entry_key; +}; + +/* + * these defines are useful when a particular member of + * a reiserfs_dir_entry is needed + */ + +/* pointer to file name, stored in entry */ +#define B_I_DEH_ENTRY_FILE_NAME(bh, ih, deh) \ + (ih_item_body(bh, ih) + deh_location(deh)) + +/* length of name */ +#define I_DEH_N_ENTRY_FILE_NAME_LENGTH(ih,deh,entry_num) \ +(I_DEH_N_ENTRY_LENGTH (ih, deh, entry_num) - (de_with_sd (deh) ? 
SD_SIZE : 0)) + +/* hash value occupies bits from 7 up to 30 */ +#define GET_HASH_VALUE(offset) ((offset) & 0x7fffff80LL) +/* generation number occupies 7 bits starting from 0 up to 6 */ +#define GET_GENERATION_NUMBER(offset) ((offset) & 0x7fLL) +#define MAX_GENERATION_NUMBER 127 + +#define SET_GENERATION_NUMBER(offset,gen_number) (GET_HASH_VALUE(offset)|(gen_number)) + +/* + * Picture represents an internal node of the reiserfs tree + * ______________________________________________________ + * | | Array of | Array of | Free | + * |block | keys | pointers | space | + * | head | N | N+1 | | + * |______|_______________|___________________|___________| + */ + +/*************************************************************************** + * DISK CHILD * + ***************************************************************************/ +/* + * Disk child pointer: + * The pointer from an internal node of the tree to a node that is on disk. + */ +struct disk_child { + __le32 dc_block_number; /* Disk child's block number. */ + __le16 dc_size; /* Disk child's used space. */ + __le16 dc_reserved; +}; + +#define DC_SIZE (sizeof(struct disk_child)) +#define dc_block_number(dc_p) (le32_to_cpu((dc_p)->dc_block_number)) +#define dc_size(dc_p) (le16_to_cpu((dc_p)->dc_size)) +#define put_dc_block_number(dc_p, val) do { (dc_p)->dc_block_number = cpu_to_le32(val); } while(0) +#define put_dc_size(dc_p, val) do { (dc_p)->dc_size = cpu_to_le16(val); } while(0) + +/* Get disk child by buffer header and position in the tree node. */ +#define B_N_CHILD(bh, n_pos) ((struct disk_child *)\ +((bh)->b_data + BLKH_SIZE + B_NR_ITEMS(bh) * KEY_SIZE + DC_SIZE * (n_pos))) + +/* Get disk child number by buffer header and position in the tree node. 
*/ +#define B_N_CHILD_NUM(bh, n_pos) (dc_block_number(B_N_CHILD(bh, n_pos))) +#define PUT_B_N_CHILD_NUM(bh, n_pos, val) \ + (put_dc_block_number(B_N_CHILD(bh, n_pos), val)) + + /* maximal value of field child_size in structure disk_child */ + /* child size is the combined size of all items and their headers */ +#define MAX_CHILD_SIZE(bh) ((int)( (bh)->b_size - BLKH_SIZE )) + +/* amount of used space in buffer (not including block head) */ +#define B_CHILD_SIZE(cur) (MAX_CHILD_SIZE(cur)-(B_FREE_SPACE(cur))) + +/* max and min number of keys in internal node */ +#define MAX_NR_KEY(bh) ( (MAX_CHILD_SIZE(bh)-DC_SIZE)/(KEY_SIZE+DC_SIZE) ) +#define MIN_NR_KEY(bh) (MAX_NR_KEY(bh)/2) + +/*************************************************************************** + * PATH STRUCTURES AND DEFINES * + ***************************************************************************/ + +/* + * search_by_key fills up the path from the root to the leaf as it descends + * the tree looking for the key. It uses reiserfs_bread to try to find + * buffers in the cache given their block number. If it does not find + * them in the cache it reads them from disk. For each node search_by_key + * finds using reiserfs_bread it then uses bin_search to look through that + * node. bin_search will find the position of the block_number of the next + * node if it is looking through an internal node. If it is looking through + * a leaf node bin_search will find the position of the item which has key + * either equal to given key, or which is the maximal key less than the + * given key. + */ + +struct path_element { + /* Pointer to the buffer at the path in the tree. */ + struct buffer_head *pe_buffer; + /* Position in the tree node which is placed in the buffer above. */ + int pe_position; +}; + +/* + * maximal height of a tree. 
don't change this without + * changing JOURNAL_PER_BALANCE_CNT + */ +#define MAX_HEIGHT 5 + +/* Must be equals MAX_HEIGHT + FIRST_PATH_ELEMENT_OFFSET */ +#define EXTENDED_MAX_HEIGHT 7 + +/* Must be equal to at least 2. */ +#define FIRST_PATH_ELEMENT_OFFSET 2 + +/* Must be equal to FIRST_PATH_ELEMENT_OFFSET - 1 */ +#define ILLEGAL_PATH_ELEMENT_OFFSET 1 + +/* this MUST be MAX_HEIGHT + 1. See about FEB below */ +#define MAX_FEB_SIZE 6 + +/* + * We need to keep track of who the ancestors of nodes are. When we + * perform a search we record which nodes were visited while + * descending the tree looking for the node we searched for. This list + * of nodes is called the path. This information is used while + * performing balancing. Note that this path information may become + * invalid, and this means we must check it when using it to see if it + * is still valid. You'll need to read search_by_key and the comments + * in it, especially about decrement_counters_in_path(), to understand + * this structure. + * + * Paths make the code so much harder to work with and debug.... An + * enormous number of bugs are due to them, and trying to write or modify + * code that uses them just makes my head hurt. They are based on an + * excessive effort to avoid disturbing the precious VFS code.:-( The + * gods only know how we are going to SMP the code that uses them. + * znodes are the way! + */ + +#define PATH_READA 0x1 /* do read ahead */ +#define PATH_READA_BACK 0x2 /* read backwards */ + +struct treepath { + int path_length; /* offset of the last path element in path_elements[] */ + int reada; /* presumably a mask of the PATH_READA* flags -- TODO confirm */ + /* Array of the path elements, indexed by path offset. */ + struct path_element path_elements[EXTENDED_MAX_HEIGHT]; + int pos_in_item; +}; + +#define pos_in_item(path) ((path)->pos_in_item) + +#define INITIALIZE_PATH(var) \ +struct treepath var = {.path_length = ILLEGAL_PATH_ELEMENT_OFFSET, .reada = 0,} + +/* Get path element by path and path position.
*/ +#define PATH_OFFSET_PELEMENT(path, n_offset) ((path)->path_elements + (n_offset)) + +/* Get buffer header at the path by path and path position. */ +#define PATH_OFFSET_PBUFFER(path, n_offset) (PATH_OFFSET_PELEMENT(path, n_offset)->pe_buffer) + +/* Get position in the element at the path by path and path position. */ +#define PATH_OFFSET_POSITION(path, n_offset) (PATH_OFFSET_PELEMENT(path, n_offset)->pe_position) + +#define PATH_PLAST_BUFFER(path) (PATH_OFFSET_PBUFFER((path), (path)->path_length)) + +/* + * you know, to the person who didn't write this the macro name does not + * at first suggest what it does. Maybe POSITION_FROM_PATH_END? Or + * maybe we should just focus on dumping paths... -Hans + */ +#define PATH_LAST_POSITION(path) (PATH_OFFSET_POSITION((path), (path)->path_length)) + +/* + * in do_balance leaf has h == 0 in contrast with path structure, + * where root has level == 0. That is why we need these defines. + * They count h back from path_length, so h == 0 names the last path + * element. Both arguments are parenthesized in the expansions, so + * they may be arbitrary expressions. + */ + +/* tb->S[h] */ +#define PATH_H_PBUFFER(path, h) \ + PATH_OFFSET_PBUFFER(path, (path)->path_length - (h)) + +/* tb->F[h] or tb->S[0]->b_parent */ +#define PATH_H_PPARENT(path, h) PATH_H_PBUFFER(path, (h) + 1) + +/* position stored in the path element h levels back from the last one */ +#define PATH_H_POSITION(path, h) \ + PATH_OFFSET_POSITION(path, (path)->path_length - (h)) + +/* tb->S[h]->b_item_order */ +#define PATH_H_B_ITEM_ORDER(path, h) PATH_H_POSITION(path, (h) + 1) + +#define PATH_H_PATH_OFFSET(path, n_h) ((path)->path_length - (n_h)) + +/* start of the node contents, right after the block head */ +static inline void *reiserfs_node_data(const struct buffer_head *bh) +{ + return bh->b_data + sizeof(struct block_head); +} + +/* get key from internal node */ +static inline struct reiserfs_key *internal_key(struct buffer_head *bh, + int item_num) +{ + struct reiserfs_key *key = reiserfs_node_data(bh); + + return &key[item_num]; +} + +/* get the item header from leaf node */ +static inline struct item_head *item_head(const struct buffer_head *bh, + int item_num) +{ + struct item_head *ih = reiserfs_node_data(bh); + + return &ih[item_num]; +} + +/* get the key from leaf
node */ +static inline struct reiserfs_key *leaf_key(const struct buffer_head *bh, + int item_num) +{ + return &item_head(bh, item_num)->ih_key; +} + +static inline void *ih_item_body(const struct buffer_head *bh, + const struct item_head *ih) +{ + return bh->b_data + ih_location(ih); +} + +/* get item body from leaf node */ +static inline void *item_body(const struct buffer_head *bh, int item_num) +{ + return ih_item_body(bh, item_head(bh, item_num)); +} + +static inline struct item_head *tp_item_head(const struct treepath *path) +{ + return item_head(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION(path)); +} + +static inline void *tp_item_body(const struct treepath *path) +{ + return item_body(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION(path)); +} + +#define get_last_bh(path) PATH_PLAST_BUFFER(path) +#define get_item_pos(path) PATH_LAST_POSITION(path) +#define item_moved(ih,path) comp_items(ih, path) +#define path_changed(ih,path) comp_items (ih, path) + +/* array of the entry headers */ + /* get item body */ +#define B_I_DEH(bh, ih) ((struct reiserfs_de_head *)(ih_item_body(bh, ih))) + +/* + * length of the directory entry in directory item. This define + * calculates length of i-th directory entry using directory entry + * locations from dir entry head. When it calculates length of 0-th + * directory entry, it uses length of whole item in place of entry + * location of the non-existent following entry in the calculation. + * See picture above. + */ +static inline int entry_length(const struct buffer_head *bh, + const struct item_head *ih, int pos_in_item) +{ + struct reiserfs_de_head *deh; + + deh = B_I_DEH(bh, ih) + pos_in_item; + if (pos_in_item) + return deh_location(deh - 1) - deh_location(deh); + + return ih_item_len(ih) - deh_location(deh); +} + +/*************************************************************************** + * MISC * + ***************************************************************************/ + +/* Size of pointer to the unformatted node. 
*/ +#define UNFM_P_SIZE (sizeof(unp_t)) +#define UNFM_P_SHIFT 2 + +/* in the in-core inode the key is stored in le form */ +#define INODE_PKEY(inode) ((struct reiserfs_key *)(REISERFS_I(inode)->i_key)) + +#define MAX_UL_INT 0xffffffff +#define MAX_INT 0x7fffffff +#define MAX_US_INT 0xffff + +/* reiserfs version 2 has max offset 60 bits. Version 1 - 32 bit offset */ +static inline loff_t max_reiserfs_offset(struct inode *inode) +{ + if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5) + return (loff_t) U32_MAX; + + return (loff_t) ((~(__u64) 0) >> 4); +} + +#define MAX_KEY_OBJECTID MAX_UL_INT + +#define MAX_B_NUM MAX_UL_INT +#define MAX_FC_NUM MAX_US_INT + +/* the purpose is to detect overflow of an unsigned short */ +#define REISERFS_LINK_MAX (MAX_US_INT - 1000) + +/* + * The following defines are used in reiserfs_insert_item + * and reiserfs_append_item + */ +#define REISERFS_KERNEL_MEM 0 /* kernel memory mode */ +#define REISERFS_USER_MEM 1 /* user memory mode */ + +#define fs_generation(s) (REISERFS_SB(s)->s_generation_counter) +#define get_generation(s) atomic_read (&fs_generation(s)) +#define FILESYSTEM_CHANGED_TB(tb) (get_generation((tb)->tb_sb) != (tb)->fs_gen) +#define __fs_changed(gen,s) ((gen) != get_generation (s)) +#define fs_changed(gen,s) \ +({ \ + reiserfs_cond_resched(s); \ + __fs_changed(gen, s); \ +}) + +/*************************************************************************** + * FIXATE NODES * + ***************************************************************************/ + +#define VI_TYPE_LEFT_MERGEABLE 1 +#define VI_TYPE_RIGHT_MERGEABLE 2 + +/* + * To make any changes in the tree we always first find node, that + * contains item to be changed/deleted or place to insert a new + * item. We call this node S. To do balancing we need to decide what + * we will shift to left/right neighbor, or to a new node, where new + * item will be etc. To make this analysis simpler we build virtual + * node.
Virtual node is an array of items, that will replace items of + * node S. (For instance if we are going to delete an item, virtual + * node does not contain it). Virtual node keeps information about + * item sizes and types, mergeability of first and last items, sizes + * of all entries in directory item. We use this array of items when + * calculating what we can shift to neighbors and how many nodes we + * have to have if we do not do any shifting, if we shift to left/right + * neighbor or to both. + */ +struct virtual_item { + int vi_index; /* index in the array of item operations */ + unsigned short vi_type; /* left/right mergeability */ + + /* length of item that it will have after balancing */ + unsigned short vi_item_len; + + struct item_head *vi_ih; + const char *vi_item; /* body of item (old or new) */ + const void *vi_new_data; /* always 0 except in paste mode */ + void *vi_uarea; /* item specific area */ +}; + +struct virtual_node { + /* this is a pointer to the free space in the buffer */ + char *vn_free_ptr; + + unsigned short vn_nr_item; /* number of items in virtual node */ + + /* + * size that the node would have if it had + * unlimited size and no balancing were performed + */ + short vn_size; + + /* mode of balancing (paste, insert, delete, cut) */ + short vn_mode; + + short vn_affected_item_num; + short vn_pos_in_item; + + /* item header of inserted item, 0 for other modes */ + struct item_head *vn_ins_ih; + const void *vn_data; + + /* array of items (including a new one, excluding item to be deleted) */ + struct virtual_item *vn_vi; +}; + +/* used by directory items when creating virtual nodes */ +struct direntry_uarea { + int flags; + __u16 entry_count; + __u16 entry_sizes[1]; +} __attribute__ ((__packed__)); + +/*************************************************************************** + * TREE BALANCE * + ***************************************************************************/ + +/* + * This temporary structure is used in tree balance
algorithms, and + * constructed as we go to the extent that its various parts are + * needed. It contains arrays of nodes that can potentially be + * involved in the balancing of node S, and parameters that define how + * each of the nodes must be balanced. Note that in these algorithms + * for balancing the worst case is to need to balance the current node + * S and the left and right neighbors and all of their parents plus + * create a new node. We implement S1 balancing for the leaf nodes + * and S0 balancing for the internal nodes (S1 and S0 are defined in + * our papers.) + */ + +/* size of the array of buffers to free at end of do_balance */ +#define MAX_FREE_BLOCK 7 + +/* maximum number of FEB blocknrs on a single level */ +#define MAX_AMOUNT_NEEDED 2 + +/* someday somebody will prefix every field in this struct with tb_ */ +struct tree_balance { + int tb_mode; + int need_balance_dirty; + struct super_block *tb_sb; + struct reiserfs_transaction_handle *transaction_handle; + struct treepath *tb_path; + + /* array of left neighbors of nodes in the path */ + struct buffer_head *L[MAX_HEIGHT]; + + /* array of right neighbors of nodes in the path */ + struct buffer_head *R[MAX_HEIGHT]; + + /* array of fathers of the left neighbors */ + struct buffer_head *FL[MAX_HEIGHT]; + + /* array of fathers of the right neighbors */ + struct buffer_head *FR[MAX_HEIGHT]; + /* array of common parents of center node and its left neighbor */ + struct buffer_head *CFL[MAX_HEIGHT]; + + /* array of common parents of center node and its right neighbor */ + struct buffer_head *CFR[MAX_HEIGHT]; + + /* + * array of empty buffers. Number of buffers in array equals + * cur_blknum. 
+ */ + struct buffer_head *FEB[MAX_FEB_SIZE]; + struct buffer_head *used[MAX_FEB_SIZE]; + struct buffer_head *thrown[MAX_FEB_SIZE]; + + /* + * array of number of items which must be shifted to the left in + * order to balance the current node; for leaves includes item that + * will be partially shifted; for internal nodes, it is the number + * of child pointers rather than items. It includes the new item + * being created. The code sometimes subtracts one to get the + * number of wholly shifted items for other purposes. + */ + int lnum[MAX_HEIGHT]; + + /* substitute right for left in comment above */ + int rnum[MAX_HEIGHT]; + + /* + * array indexed by height h mapping the key delimiting L[h] and + * S[h] to its item number within the node CFL[h] + */ + int lkey[MAX_HEIGHT]; + + /* substitute r for l in comment above */ + int rkey[MAX_HEIGHT]; + + /* + * the number of bytes we are trying to add to or remove from + * S[h]. A negative value means removing. + */ + int insert_size[MAX_HEIGHT]; + + /* + * number of nodes that will replace node S[h] after balancing + * on the level h of the tree.
If 0 then S is being deleted, + * if 1 then S is remaining and no new nodes are being created, + * if 2 or 3 then 1 or 2 new nodes is being created + */ + int blknum[MAX_HEIGHT]; + + /* fields that are used only for balancing leaves of the tree */ + + /* number of empty blocks having been already allocated */ + int cur_blknum; + + /* number of items that fall into left most node when S[0] splits */ + int s0num; + + /* + * number of bytes which can flow to the left neighbor from the left + * most liquid item that cannot be shifted from S[0] entirely + * if -1 then nothing will be partially shifted + */ + int lbytes; + + /* + * number of bytes which will flow to the right neighbor from the right + * most liquid item that cannot be shifted from S[0] entirely + * if -1 then nothing will be partially shifted + */ + int rbytes; + + + /* + * index into the array of item headers in + * S[0] of the affected item + */ + int item_pos; + + /* new nodes allocated to hold what could not fit into S */ + struct buffer_head *S_new[2]; + + /* + * number of items that will be placed into nodes in S_new + * when S[0] splits + */ + int snum[2]; + + /* + * number of bytes which flow to nodes in S_new when S[0] splits + * note: if S[0] splits into 3 nodes, then items do not need to be cut + */ + int sbytes[2]; + + int pos_in_item; + int zeroes_num; + + /* + * buffers which are to be freed after do_balance finishes + * by unfix_nodes + */ + struct buffer_head *buf_to_free[MAX_FREE_BLOCK]; + + /* + * kmalloced memory. 
Used to create virtual node and keep + * map of dirtied bitmap blocks + */ + char *vn_buf; + + int vn_buf_size; /* size of the vn_buf */ + + /* VN starts after bitmap of bitmap blocks */ + struct virtual_node *tb_vn; + + /* + * saved value of the fs generation counter; compared against + * get_generation() by the FILESYSTEM_CHANGED_TB() macro in this header + */ + int fs_gen; + +#ifdef DISPLACE_NEW_PACKING_LOCALITIES + /* + * key pointer, to pass to block allocator or + * another low-level subsystem + */ + struct in_core_key key; +#endif +}; + +/* These are modes of balancing */ + +/* When inserting an item. */ +#define M_INSERT 'i' +/* + * When inserting into (directories only) or appending onto an already + * existent item. + */ +#define M_PASTE 'p' +/* When deleting an item. */ +#define M_DELETE 'd' +/* When truncating an item or removing an entry from a (directory) item. */ +#define M_CUT 'c' + +/* used when balancing on leaf level skipped (in reiserfsck) */ +#define M_INTERNAL 'n' + +/* + * When further balancing is not needed, then do_balance does not need + * to be called. + */ +#define M_SKIP_BALANCING 's' +#define M_CONVERT 'v' + +/* modes of leaf_move_items */ +#define LEAF_FROM_S_TO_L 0 +#define LEAF_FROM_S_TO_R 1 +#define LEAF_FROM_R_TO_L 2 +#define LEAF_FROM_L_TO_R 3 +#define LEAF_FROM_S_TO_SNEW 4 + +#define FIRST_TO_LAST 0 +#define LAST_TO_FIRST 1 + +/* + * used in do_balance for passing parent of node information that has + * been gotten from tb struct + */ +struct buffer_info { + struct tree_balance *tb; + struct buffer_head *bi_bh; + struct buffer_head *bi_parent; + int bi_position; +}; + +static inline struct super_block *sb_from_tb(struct tree_balance *tb) +{ + return tb ? tb->tb_sb : NULL; +} + +static inline struct super_block *sb_from_bi(struct buffer_info *bi) +{ + return bi ? sb_from_tb(bi->tb) : NULL; +} + +/* + * there are 4 types of items: stat data, directory item, indirect, direct.
+ * +-------------------+------------+--------------+------------+ + * | | k_offset | k_uniqueness | mergeable? | + * +-------------------+------------+--------------+------------+ + * | stat data | 0 | 0 | no | + * +-------------------+------------+--------------+------------+ + * | 1st directory item| DOT_OFFSET | DIRENTRY_ .. | no | + * | non 1st directory | hash value | UNIQUENESS | yes | + * | item | | | | + * +-------------------+------------+--------------+------------+ + * | indirect item | offset + 1 |TYPE_INDIRECT | [1] | + * +-------------------+------------+--------------+------------+ + * | direct item | offset + 1 |TYPE_DIRECT | [2] | + * +-------------------+------------+--------------+------------+ + * + * [1] if this is not the first indirect item of the object + * [2] if this is not the first direct item of the object +*/ + +struct item_operations { + int (*bytes_number) (struct item_head * ih, int block_size); + void (*decrement_key) (struct cpu_key *); + int (*is_left_mergeable) (struct reiserfs_key * ih, + unsigned long bsize); + void (*print_item) (struct item_head *, char *item); + void (*check_item) (struct item_head *, char *item); + + int (*create_vi) (struct virtual_node * vn, struct virtual_item * vi, + int is_affected, int insert_size); + int (*check_left) (struct virtual_item * vi, int free, + int start_skip, int end_skip); + int (*check_right) (struct virtual_item * vi, int free); + int (*part_size) (struct virtual_item * vi, int from, int to); + int (*unit_num) (struct virtual_item * vi); + void (*print_vi) (struct virtual_item * vi); +}; + +extern struct item_operations *item_ops[TYPE_ANY + 1]; + +#define op_bytes_number(ih,bsize) item_ops[le_ih_k_type (ih)]->bytes_number (ih, bsize) +#define op_is_left_mergeable(key,bsize) item_ops[le_key_k_type (le_key_version (key), key)]->is_left_mergeable (key, bsize) +#define op_print_item(ih,item) item_ops[le_ih_k_type (ih)]->print_item (ih, item) +#define op_check_item(ih,item) 
item_ops[le_ih_k_type (ih)]->check_item (ih, item) +#define op_create_vi(vn,vi,is_affected,insert_size) item_ops[le_ih_k_type ((vi)->vi_ih)]->create_vi (vn,vi,is_affected,insert_size) +#define op_check_left(vi,free,start_skip,end_skip) item_ops[(vi)->vi_index]->check_left (vi, free, start_skip, end_skip) +#define op_check_right(vi,free) item_ops[(vi)->vi_index]->check_right (vi, free) +#define op_part_size(vi,from,to) item_ops[(vi)->vi_index]->part_size (vi, from, to) +#define op_unit_num(vi) item_ops[(vi)->vi_index]->unit_num (vi) +#define op_print_vi(vi) item_ops[(vi)->vi_index]->print_vi (vi) + +#define COMP_SHORT_KEYS comp_short_keys + +/* number of blocks pointed to by the indirect item */ +#define I_UNFM_NUM(ih) (ih_item_len(ih) / UNFM_P_SIZE) + +/* + * the used space within the unformatted node corresponding + * to pos within the item pointed to by ih + */ +#define I_POS_UNFM_SIZE(ih,pos,size) (((pos) == I_UNFM_NUM(ih) - 1 ) ? (size) - ih_free_space(ih) : (size)) + +/* + * number of bytes contained by the direct item or the + * unformatted nodes the indirect item points to + */ + +/* following defines use reiserfs buffer header and item header */ + +/* get stat-data */ +#define B_I_STAT_DATA(bh, ih) ( (struct stat_data * )((bh)->b_data + ih_location(ih)) ) + +/* this is 3976 for size==4096 */ +#define MAX_DIRECT_ITEM_LEN(size) ((size) - BLKH_SIZE - 2*IH_SIZE - SD_SIZE - UNFM_P_SIZE) + +/* + * indirect items consist of entries which contain blocknrs, pos + * indicates which entry, and B_I_POS_UNFM_POINTER resolves to the + * blocknr contained by the entry pos points to + */ +#define B_I_POS_UNFM_POINTER(bh, ih, pos) \ + le32_to_cpu(*(((unp_t *)ih_item_body(bh, ih)) + (pos))) +#define PUT_B_I_POS_UNFM_POINTER(bh, ih, pos, val) \ + (*(((unp_t *)ih_item_body(bh, ih)) + (pos)) = cpu_to_le32(val)) + +struct reiserfs_iget_args { + __u32 objectid; + __u32 dirid; +}; + +/*************************************************************************** + * FUNCTION 
DECLARATIONS * + ***************************************************************************/ + +#define get_journal_desc_magic(bh) ((bh)->b_data + (bh)->b_size - 12) + +#define journal_trans_half(blocksize) \ + (((blocksize) - sizeof (struct reiserfs_journal_desc) + sizeof (__u32) - 12) / sizeof (__u32)) + +/* journal.c see journal.c for all the comments here */ + +/* first block written in a commit. */ +struct reiserfs_journal_desc { + __le32 j_trans_id; /* id of commit */ + + /* length of commit. len +1 is the commit block */ + __le32 j_len; + + __le32 j_mount_id; /* mount id of this trans */ + __le32 j_realblock[1]; /* real locations for each block */ +}; + +#define get_desc_trans_id(d) le32_to_cpu((d)->j_trans_id) +#define get_desc_trans_len(d) le32_to_cpu((d)->j_len) +#define get_desc_mount_id(d) le32_to_cpu((d)->j_mount_id) + +#define set_desc_trans_id(d,val) do { (d)->j_trans_id = cpu_to_le32 (val); } while (0) +#define set_desc_trans_len(d,val) do { (d)->j_len = cpu_to_le32 (val); } while (0) +#define set_desc_mount_id(d,val) do { (d)->j_mount_id = cpu_to_le32 (val); } while (0) + +/* last block written in a commit */ +struct reiserfs_journal_commit { + __le32 j_trans_id; /* must match j_trans_id from the desc block */ + __le32 j_len; /* ditto */ + __le32 j_realblock[1]; /* real locations for each block */ +}; + +#define get_commit_trans_id(c) le32_to_cpu((c)->j_trans_id) +#define get_commit_trans_len(c) le32_to_cpu((c)->j_len) +#define get_commit_mount_id(c) le32_to_cpu((c)->j_mount_id) /* FIXME: struct reiserfs_journal_commit has no j_mount_id; cannot compile if used */ + +#define set_commit_trans_id(c,val) do { (c)->j_trans_id = cpu_to_le32 (val); } while (0) +#define set_commit_trans_len(c,val) do { (c)->j_len = cpu_to_le32 (val); } while (0) + +/* + * this header block gets written whenever a transaction is considered + * fully flushed, and is more recent than the last fully flushed transaction. + * fully flushed means all the log blocks and all the real blocks are on + * disk, and this transaction does not need to be replayed.
+ */ +struct reiserfs_journal_header { + /* id of last fully flushed transaction */ + __le32 j_last_flush_trans_id; + + /* offset in the log of where to start replay after a crash */ + __le32 j_first_unflushed_offset; + + __le32 j_mount_id; + /* 12 */ struct journal_params jh_journal; +}; + +/* biggest tunable defines are right here */ +#define JOURNAL_BLOCK_COUNT 8192 /* number of blocks in the journal */ + +/* biggest possible single transaction, don't change for now (8/3/99) */ +#define JOURNAL_TRANS_MAX_DEFAULT 1024 +#define JOURNAL_TRANS_MIN_DEFAULT 256 + +/* + * max blocks to batch into one transaction, + * don't make this any bigger than 900 + */ +#define JOURNAL_MAX_BATCH_DEFAULT 900 +#define JOURNAL_MIN_RATIO 2 +#define JOURNAL_MAX_COMMIT_AGE 30 +#define JOURNAL_MAX_TRANS_AGE 30 +#define JOURNAL_PER_BALANCE_CNT (3 * (MAX_HEIGHT-2) + 9) +#define JOURNAL_BLOCKS_PER_OBJECT(sb) (JOURNAL_PER_BALANCE_CNT * 3 + \ + 2 * (REISERFS_QUOTA_INIT_BLOCKS(sb) + \ + REISERFS_QUOTA_TRANS_BLOCKS(sb))) + +#ifdef CONFIG_QUOTA +#define REISERFS_QUOTA_OPTS ((1 << REISERFS_USRQUOTA) | (1 << REISERFS_GRPQUOTA)) +/* We need to update data and inode (atime) */ +#define REISERFS_QUOTA_TRANS_BLOCKS(s) (REISERFS_SB(s)->s_mount_opt & REISERFS_QUOTA_OPTS ? 2 : 0) +/* 1 balancing, 1 bitmap, 1 data per write + stat data update */ +#define REISERFS_QUOTA_INIT_BLOCKS(s) (REISERFS_SB(s)->s_mount_opt & REISERFS_QUOTA_OPTS ? \ +(DQUOT_INIT_ALLOC*(JOURNAL_PER_BALANCE_CNT+2)+DQUOT_INIT_REWRITE+1) : 0) +/* same as with INIT */ +#define REISERFS_QUOTA_DEL_BLOCKS(s) (REISERFS_SB(s)->s_mount_opt & REISERFS_QUOTA_OPTS ? \ +(DQUOT_DEL_ALLOC*(JOURNAL_PER_BALANCE_CNT+2)+DQUOT_DEL_REWRITE+1) : 0) +#else +#define REISERFS_QUOTA_TRANS_BLOCKS(s) 0 +#define REISERFS_QUOTA_INIT_BLOCKS(s) 0 +#define REISERFS_QUOTA_DEL_BLOCKS(s) 0 +#endif + +/* + * both of these can be as low as 1, or as high as you want. The min is the + * number of 4k bitmap nodes preallocated on mount. 
New nodes are allocated + * as needed, and released when transactions are committed. On release, if + * the current number of nodes is > max, the node is freed, otherwise, + * it is put on a free list for faster use later. +*/ +#define REISERFS_MIN_BITMAP_NODES 10 +#define REISERFS_MAX_BITMAP_NODES 100 + +/* these are based on journal hash size of 8192 */ +#define JBH_HASH_SHIFT 13 +#define JBH_HASH_MASK 8191 + +#define _jhashfn(sb,block) \ + (((unsigned long)(sb)>>L1_CACHE_SHIFT) ^ \ + (((block)<<(JBH_HASH_SHIFT - 6)) ^ ((block) >> 13) ^ ((block) << (JBH_HASH_SHIFT - 12)))) +#define journal_hash(t,sb,block) ((t)[_jhashfn((sb),(block)) & JBH_HASH_MASK]) + +/* We need these to make journal.c code more readable */ +#define journal_find_get_block(s, block) __find_get_block(SB_JOURNAL(s)->j_dev_bd, block, (s)->s_blocksize) +#define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_dev_bd, block, (s)->s_blocksize) +#define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_dev_bd, block, (s)->s_blocksize) + +enum reiserfs_bh_state_bits { + BH_JDirty = BH_PrivateStart, /* buffer is in current transaction */ + BH_JDirty_wait, + /* + * disk block was taken off free list before being in a + * finished transaction, or written to disk. Can be reused immed.
+ */ + BH_JNew, + BH_JPrepared, + BH_JRestore_dirty, + BH_JTest, /* debugging only, will go away */ +}; + +BUFFER_FNS(JDirty, journaled); +TAS_BUFFER_FNS(JDirty, journaled); +BUFFER_FNS(JDirty_wait, journal_dirty); +TAS_BUFFER_FNS(JDirty_wait, journal_dirty); +BUFFER_FNS(JNew, journal_new); +TAS_BUFFER_FNS(JNew, journal_new); +BUFFER_FNS(JPrepared, journal_prepared); +TAS_BUFFER_FNS(JPrepared, journal_prepared); +BUFFER_FNS(JRestore_dirty, journal_restore_dirty); +TAS_BUFFER_FNS(JRestore_dirty, journal_restore_dirty); +BUFFER_FNS(JTest, journal_test); +TAS_BUFFER_FNS(JTest, journal_test); + +/* transaction handle which is passed around for all journal calls */ +struct reiserfs_transaction_handle { + /* + * super for this FS when journal_begin was called. Saves calls to + * reiserfs_get_super; also used by nested transactions to make + * sure they are nesting on the right FS. _must_ be first + * in the handle + */ + struct super_block *t_super; + + int t_refcount; + int t_blocks_logged; /* number of blocks this writer has logged */ + int t_blocks_allocated; /* number of blocks this writer allocated */ + + /* sanity check, equals the current trans id */ + unsigned int t_trans_id; + + void *t_handle_save; /* save existing current->journal_info */ + + /* + * if new block allocation occurs, that block + * should be displaced from others + */ + unsigned displace_new_blocks:1; + + struct list_head t_list; +}; + +/* + * used to keep track of ordered and tail writes, attached to the buffer + * head through b_journal_head.
+ */ +struct reiserfs_jh { + struct reiserfs_journal_list *jl; + struct buffer_head *bh; + struct list_head list; +}; + +void reiserfs_free_jh(struct buffer_head *bh); +int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh); +int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh); +int journal_mark_dirty(struct reiserfs_transaction_handle *, + struct buffer_head *bh); + +static inline int reiserfs_file_data_log(struct inode *inode) +{ + if (reiserfs_data_log(inode->i_sb) || + (REISERFS_I(inode)->i_flags & i_data_log)) + return 1; + return 0; +} + +static inline int reiserfs_transaction_running(struct super_block *s) +{ + struct reiserfs_transaction_handle *th = current->journal_info; + if (th && th->t_super == s) + return 1; + if (th && th->t_super == NULL) + BUG(); + return 0; +} + +static inline int reiserfs_transaction_free_space(struct reiserfs_transaction_handle *th) +{ + return th->t_blocks_allocated - th->t_blocks_logged; +} + +struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct + super_block + *, + int count); +int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *); +void reiserfs_vfs_truncate_file(struct inode *inode); +int reiserfs_commit_page(struct inode *inode, struct page *page, + unsigned from, unsigned to); +void reiserfs_flush_old_commits(struct super_block *); +int reiserfs_commit_for_inode(struct inode *); +int reiserfs_inode_needs_commit(struct inode *); +void reiserfs_update_inode_transaction(struct inode *); +void reiserfs_wait_on_write_block(struct super_block *s); +void reiserfs_block_writes(struct reiserfs_transaction_handle *th); +void reiserfs_allow_writes(struct super_block *s); +void reiserfs_check_lock_depth(struct super_block *s, char *caller); +int reiserfs_prepare_for_journal(struct super_block *, struct buffer_head *bh, + int wait); +void reiserfs_restore_prepared_buffer(struct super_block *, + struct buffer_head *bh); +int journal_init(struct 
super_block *, const char *j_dev_name, int old_format, + unsigned int); +int journal_release(struct reiserfs_transaction_handle *, struct super_block *); +int journal_release_error(struct reiserfs_transaction_handle *, + struct super_block *); +int journal_end(struct reiserfs_transaction_handle *); +int journal_end_sync(struct reiserfs_transaction_handle *); +int journal_mark_freed(struct reiserfs_transaction_handle *, + struct super_block *, b_blocknr_t blocknr); +int journal_transaction_should_end(struct reiserfs_transaction_handle *, int); +int reiserfs_in_journal(struct super_block *sb, unsigned int bmap_nr, + int bit_nr, int searchall, b_blocknr_t *next); +int journal_begin(struct reiserfs_transaction_handle *, + struct super_block *sb, unsigned long); +int journal_join_abort(struct reiserfs_transaction_handle *, + struct super_block *sb); +void reiserfs_abort_journal(struct super_block *sb, int errno); +void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...); +int reiserfs_allocate_list_bitmaps(struct super_block *s, + struct reiserfs_list_bitmap *, unsigned int); + +void reiserfs_schedule_old_flush(struct super_block *s); +void add_save_link(struct reiserfs_transaction_handle *th, + struct inode *inode, int truncate); +int remove_save_link(struct inode *inode, int truncate); + +/* objectid.c */ +__u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th); +void reiserfs_release_objectid(struct reiserfs_transaction_handle *th, + __u32 objectid_to_release); +int reiserfs_convert_objectid_map_v1(struct super_block *); + +/* stree.c */ +int B_IS_IN_TREE(const struct buffer_head *); +extern void copy_item_head(struct item_head *to, + const struct item_head *from); + +/* first key is in le form, second - cpu */ +extern int comp_short_keys(const struct reiserfs_key *le_key, + const struct cpu_key *cpu_key); +extern void le_key2cpu_key(struct cpu_key *to, const struct reiserfs_key *from); + +/* both are in le form */ +extern int
comp_le_keys(const struct reiserfs_key *, + const struct reiserfs_key *); +extern int comp_short_le_keys(const struct reiserfs_key *, + const struct reiserfs_key *); + +/* * get key version from on disk key - kludge */ +static inline int le_key_version(const struct reiserfs_key *key) +{ + int type; + + type = offset_v2_k_type(&(key->u.k_offset_v2)); + if (type != TYPE_DIRECT && type != TYPE_INDIRECT + && type != TYPE_DIRENTRY) + return KEY_FORMAT_3_5; + + return KEY_FORMAT_3_6; + +} + +static inline void copy_key(struct reiserfs_key *to, + const struct reiserfs_key *from) +{ + memcpy(to, from, KEY_SIZE); +} + +int comp_items(const struct item_head *stored_ih, const struct treepath *path); +const struct reiserfs_key *get_rkey(const struct treepath *chk_path, + const struct super_block *sb); +int search_by_key(struct super_block *, const struct cpu_key *, + struct treepath *, int); +#define search_item(s,key,path) search_by_key (s, key, path, DISK_LEAF_NODE_LEVEL) +int search_for_position_by_key(struct super_block *sb, + const struct cpu_key *cpu_key, + struct treepath *search_path); +extern void decrement_bcount(struct buffer_head *bh); +void decrement_counters_in_path(struct treepath *search_path); +void pathrelse(struct treepath *search_path); +int reiserfs_check_path(struct treepath *p); +void pathrelse_and_restore(struct super_block *s, struct treepath *search_path); + +int reiserfs_insert_item(struct reiserfs_transaction_handle *th, + struct treepath *path, + const struct cpu_key *key, + struct item_head *ih, + struct inode *inode, const char *body); + +int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, + struct treepath *path, + const struct cpu_key *key, + struct inode *inode, + const char *body, int paste_size); + +int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, + struct treepath *path, + struct cpu_key *key, + struct inode *inode, + struct page *page, loff_t new_file_size); + +int reiserfs_delete_item(struct 
reiserfs_transaction_handle *th, + struct treepath *path, + const struct cpu_key *key, + struct inode *inode, struct buffer_head *un_bh); + +void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, + struct inode *inode, struct reiserfs_key *key); +int reiserfs_delete_object(struct reiserfs_transaction_handle *th, + struct inode *inode); +int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, + struct inode *inode, struct page *, + int update_timestamps); + +#define i_block_size(inode) ((inode)->i_sb->s_blocksize) +#define file_size(inode) ((inode)->i_size) +#define tail_size(inode) (file_size (inode) & (i_block_size (inode) - 1)) + +#define tail_has_to_be_packed(inode) (have_large_tails ((inode)->i_sb)?\ +!STORE_TAIL_IN_UNFM_S1(file_size (inode), tail_size(inode), i_block_size (inode)):have_small_tails ((inode)->i_sb)?!STORE_TAIL_IN_UNFM_S2(file_size (inode), tail_size(inode), i_block_size (inode)):0 ) + +void padd_item(char *item, int total_length, int length); + +/* inode.c */ +/* args for the create parameter of reiserfs_get_block */ +#define GET_BLOCK_NO_CREATE 0 /* don't create new blocks or convert tails */ +#define GET_BLOCK_CREATE 1 /* add anything you need to find block */ +#define GET_BLOCK_NO_HOLE 2 /* return -ENOENT for file holes */ +#define GET_BLOCK_READ_DIRECT 4 /* read the tail if indirect item not found */ +#define GET_BLOCK_NO_IMUX 8 /* i_mutex is not held, don't preallocate */ +#define GET_BLOCK_NO_DANGLE 16 /* don't leave any transactions running */ + +void reiserfs_read_locked_inode(struct inode *inode, + struct reiserfs_iget_args *args); +int reiserfs_find_actor(struct inode *inode, void *p); +int reiserfs_init_locked_inode(struct inode *inode, void *p); +void reiserfs_evict_inode(struct inode *inode); +int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc); +int reiserfs_get_block(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create); +struct dentry
*reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type); +struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type); +int reiserfs_encode_fh(struct inode *inode, __u32 * data, int *lenp, + struct inode *parent); + +int reiserfs_truncate_file(struct inode *, int update_timestamps); +void make_cpu_key(struct cpu_key *cpu_key, struct inode *inode, loff_t offset, + int type, int key_length); +void make_le_item_head(struct item_head *ih, const struct cpu_key *key, + int version, + loff_t offset, int type, int length, int entry_count); +struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key); + +struct reiserfs_security_handle; +int reiserfs_new_inode(struct reiserfs_transaction_handle *th, + struct inode *dir, umode_t mode, + const char *symname, loff_t i_size, + struct dentry *dentry, struct inode *inode, + struct reiserfs_security_handle *security); + +void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th, + struct inode *inode, loff_t size); + +static inline void reiserfs_update_sd(struct reiserfs_transaction_handle *th, + struct inode *inode) +{ + reiserfs_update_sd_size(th, inode, inode->i_size); +} + +void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode); +void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs); +int reiserfs_setattr(struct dentry *dentry, struct iattr *attr); + +int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len); + +/* namei.c */ +void set_de_name_and_namelen(struct reiserfs_dir_entry *de); +int search_by_entry_key(struct super_block *sb, const struct cpu_key *key, + struct treepath *path, struct reiserfs_dir_entry *de); +struct dentry *reiserfs_get_parent(struct dentry *); + +#ifdef CONFIG_REISERFS_PROC_INFO +int reiserfs_proc_info_init(struct super_block *sb); +int reiserfs_proc_info_done(struct super_block *sb); +int reiserfs_proc_info_global_init(void); +int 
reiserfs_proc_info_global_done(void); + +#define PROC_EXP( e ) e + +#define __PINFO( sb ) REISERFS_SB(sb) -> s_proc_info_data +#define PROC_INFO_MAX( sb, field, value ) \ + __PINFO( sb ).field = \ + max( REISERFS_SB( sb ) -> s_proc_info_data.field, value ) +#define PROC_INFO_INC( sb, field ) ( ++ ( __PINFO( sb ).field ) ) +#define PROC_INFO_ADD( sb, field, val ) ( __PINFO( sb ).field += ( val ) ) +/* + * NOTE(review): expands to three ';'-separated statements without a + * do { } while (0) wrapper, so it is only safe where a statement sequence + * is allowed (not as the body of an unbraced if/else). + */ +#define PROC_INFO_BH_STAT( sb, bh, level ) \ + PROC_INFO_INC( sb, sbk_read_at[ ( level ) ] ); \ + PROC_INFO_ADD( sb, free_at[ ( level ) ], B_FREE_SPACE( bh ) ); \ + PROC_INFO_ADD( sb, items_at[ ( level ) ], B_NR_ITEMS( bh ) ) +#else +/* + * CONFIG_REISERFS_PROC_INFO is not defined: the init/done hooks below are + * stubs that return 0, PROC_EXP() expands to nothing and the PROC_INFO_* + * macros expand to the no-op VOID_V. + */ +static inline int reiserfs_proc_info_init(struct super_block *sb) +{ + return 0; +} + +static inline int reiserfs_proc_info_done(struct super_block *sb) +{ + return 0; +} + +static inline int reiserfs_proc_info_global_init(void) +{ + return 0; +} + +static inline int reiserfs_proc_info_global_done(void) +{ + return 0; +} + +#define PROC_EXP( e ) +#define VOID_V ( ( void ) 0 ) +#define PROC_INFO_MAX( sb, field, value ) VOID_V +#define PROC_INFO_INC( sb, field ) VOID_V +#define PROC_INFO_ADD( sb, field, val ) VOID_V +#define PROC_INFO_BH_STAT(sb, bh, n_node_level) VOID_V +#endif + +/* dir.c */ +extern const struct inode_operations reiserfs_dir_inode_operations; +extern const struct inode_operations reiserfs_symlink_inode_operations; +extern const struct inode_operations reiserfs_special_inode_operations; +extern const struct file_operations reiserfs_dir_operations; +int reiserfs_readdir_inode(struct inode *, struct dir_context *); + +/* tail_conversion.c */ +int direct2indirect(struct reiserfs_transaction_handle *, struct inode *, + struct treepath *, struct buffer_head *, loff_t); +int indirect2direct(struct reiserfs_transaction_handle *, struct inode *, + struct page *, struct treepath *, const struct cpu_key *, + loff_t, char *); +void reiserfs_unmap_buffer(struct buffer_head *); + +/* file.c */ +extern const struct inode_operations 
reiserfs_file_inode_operations; +extern const struct file_operations reiserfs_file_operations; +extern const struct address_space_operations reiserfs_address_space_operations; + +/* fix_nodes.c */ + +int fix_nodes(int n_op_mode, struct tree_balance *tb, + struct item_head *ins_ih, const void *); +void unfix_nodes(struct tree_balance *); + +/* prints.c */ +void __reiserfs_panic(struct super_block *s, const char *id, + const char *function, const char *fmt, ...) + __attribute__ ((noreturn)); +#define reiserfs_panic(s, id, fmt, args...) \ + __reiserfs_panic(s, id, __func__, fmt, ##args) +void __reiserfs_error(struct super_block *s, const char *id, + const char *function, const char *fmt, ...); +#define reiserfs_error(s, id, fmt, args...) \ + __reiserfs_error(s, id, __func__, fmt, ##args) +void reiserfs_info(struct super_block *s, const char *fmt, ...); +void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...); +void print_indirect_item(struct buffer_head *bh, int item_num); +void store_print_tb(struct tree_balance *tb); +void print_cur_tb(char *mes); +void print_de(struct reiserfs_dir_entry *de); +void print_bi(struct buffer_info *bi, char *mes); +#define PRINT_LEAF_ITEMS 1 /* print all items */ +#define PRINT_DIRECTORY_ITEMS 2 /* print directory items */ +#define PRINT_DIRECT_ITEMS 4 /* print contents of direct items */ +void print_block(struct buffer_head *bh, ...); +void print_bmap(struct super_block *s, int silent); +void print_bmap_block(int i, char *data, int size, int silent); +/*void print_super_block (struct super_block * s, char * mes);*/ +void print_objectid_map(struct super_block *s); +void print_block_head(struct buffer_head *bh, char *mes); +void check_leaf(struct buffer_head *bh); +void check_internal(struct buffer_head *bh); +void print_statistics(struct super_block *s); +char *reiserfs_hashname(int code); + +/* lbalance.c */ +int leaf_move_items(int shift_mode, struct tree_balance *tb, int mov_num, + int mov_bytes, struct 
buffer_head *Snew); +int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes); +int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes); +void leaf_delete_items(struct buffer_info *cur_bi, int last_first, int first, + int del_num, int del_bytes); +void leaf_insert_into_buf(struct buffer_info *bi, int before, + struct item_head * const inserted_item_ih, + const char * const inserted_item_body, + int zeros_number); +void leaf_paste_in_buffer(struct buffer_info *bi, int pasted_item_num, + int pos_in_item, int paste_size, + const char * const body, int zeros_number); +void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num, + int pos_in_item, int cut_size); +void leaf_paste_entries(struct buffer_info *bi, int item_num, int before, + int new_entry_count, struct reiserfs_de_head *new_dehs, + const char *records, int paste_size); +/* ibalance.c */ +int balance_internal(struct tree_balance *, int, int, struct item_head *, + struct buffer_head **); + +/* do_balance.c */ +void do_balance_mark_leaf_dirty(struct tree_balance *tb, + struct buffer_head *bh, int flag); +#define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty +#define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty + +void do_balance(struct tree_balance *tb, struct item_head *ih, + const char *body, int flag); +void reiserfs_invalidate_buffer(struct tree_balance *tb, + struct buffer_head *bh); + +int get_left_neighbor_position(struct tree_balance *tb, int h); +int get_right_neighbor_position(struct tree_balance *tb, int h); +void replace_key(struct tree_balance *tb, struct buffer_head *, int, + struct buffer_head *, int); +void make_empty_node(struct buffer_info *); +struct buffer_head *get_FEB(struct tree_balance *); + +/* bitmap.c */ + +/* + * structure contains hints for block allocator, and it is a container for + * arguments, such as node, search path, transaction_handle, etc. 
+ */ +struct __reiserfs_blocknr_hint { + /* inode passed to allocator, if we allocate unformatted nodes */ + struct inode *inode; + + sector_t block; /* file offset, in blocks */ + struct in_core_key key; + + /* + * search path, used by allocator to determine search_start in + * various ways + */ + struct treepath *path; + + /* + * transaction handle is needed to log super blocks + * and bitmap blocks changes + */ + struct reiserfs_transaction_handle *th; + + b_blocknr_t beg, end; + + /* + * a field used to transfer search start value (block number) + * between different block allocator procedures + * (determine_search_start() and others) + */ + b_blocknr_t search_start; + + /* + * is set in determine_prealloc_size() function, + * used by the underlying functions that do the actual allocation + */ + int prealloc_size; + + /* + * the allocator uses different policies for getting disk + * space for formatted/unformatted blocks with/without preallocation + */ + unsigned formatted_node:1; + unsigned preallocate:1; +}; + +typedef struct __reiserfs_blocknr_hint reiserfs_blocknr_hint_t; + +int reiserfs_parse_alloc_options(struct super_block *, char *); +void reiserfs_init_alloc_options(struct super_block *s); + +/* + * given a directory, this will tell you what packing locality + * to use for a new object underneath it. The locality is returned + * in disk byte order (le). 
+ */ +__le32 reiserfs_choose_packing(struct inode *dir); + +void show_alloc_options(struct seq_file *seq, struct super_block *s); +int reiserfs_init_bitmap_cache(struct super_block *sb); +void reiserfs_free_bitmap_cache(struct super_block *sb); +void reiserfs_cache_bitmap_metadata(struct super_block *sb, struct buffer_head *bh, struct reiserfs_bitmap_info *info); +struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb, unsigned int bitmap); +int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value); +void reiserfs_free_block(struct reiserfs_transaction_handle *th, struct inode *, + b_blocknr_t, int for_unformatted); +int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *, b_blocknr_t *, int, + int); +/* + * Ask reiserfs_allocate_blocknrs() for amount_needed block numbers for + * formatted nodes. The hint is built from the tree_balance (transaction + * handle, path and key), carries no inode and has formatted_node set. + * Returns whatever reiserfs_allocate_blocknrs() returns. + */ +static inline int reiserfs_new_form_blocknrs(struct tree_balance *tb, + b_blocknr_t * new_blocknrs, + int amount_needed) +{ + reiserfs_blocknr_hint_t hint = { + .th = tb->transaction_handle, + .path = tb->tb_path, + .inode = NULL, + .key = tb->key, + .block = 0, + .formatted_node = 1 + }; + return reiserfs_allocate_blocknrs(&hint, new_blocknrs, amount_needed, + 0); +} + +/* + * Ask reiserfs_allocate_blocknrs() for one block number for an unformatted + * node of the given inode; the hint has formatted_node and preallocate + * cleared. Returns whatever reiserfs_allocate_blocknrs() returns. + */ +static inline int reiserfs_new_unf_blocknrs(struct reiserfs_transaction_handle + *th, struct inode *inode, + b_blocknr_t * new_blocknrs, + struct treepath *path, + sector_t block) +{ + reiserfs_blocknr_hint_t hint = { + .th = th, + .path = path, + .inode = inode, + .block = block, + .formatted_node = 0, + .preallocate = 0 + }; + return reiserfs_allocate_blocknrs(&hint, new_blocknrs, 1, 0); +} + +#ifdef REISERFS_PREALLOCATE +/* Same request as reiserfs_new_unf_blocknrs(), but with hint.preallocate set. */ +static inline int reiserfs_new_unf_blocknrs2(struct reiserfs_transaction_handle + *th, struct inode *inode, + b_blocknr_t * new_blocknrs, + struct treepath *path, + sector_t block) +{ + reiserfs_blocknr_hint_t hint = { + .th = th, + .path = path, + .inode = inode, + .block = block, + .formatted_node = 0, + .preallocate = 1 + }; + return reiserfs_allocate_blocknrs(&hint, new_blocknrs, 1, 0); +} + +void reiserfs_discard_prealloc(struct 
reiserfs_transaction_handle *th, + struct inode *inode); +void reiserfs_discard_all_prealloc(struct reiserfs_transaction_handle *th); +#endif + +/* hashes.c */ +__u32 keyed_hash(const signed char *msg, int len); +__u32 yura_hash(const signed char *msg, int len); +__u32 r5_hash(const signed char *msg, int len); + +#define reiserfs_set_le_bit __set_bit_le +#define reiserfs_test_and_set_le_bit __test_and_set_bit_le +#define reiserfs_clear_le_bit __clear_bit_le +#define reiserfs_test_and_clear_le_bit __test_and_clear_bit_le +#define reiserfs_test_le_bit test_bit_le +#define reiserfs_find_next_zero_le_bit find_next_zero_bit_le + +/* + * sometimes reiserfs_truncate may require to allocate few new blocks + * to perform indirect2direct conversion. People probably used to + * think, that truncate should work without problems on a filesystem + * without free disk space. They may complain that they can not + * truncate due to lack of free disk space. This spare space allows us + * to not worry about it. 500 is probably too much, but it should be + * absolutely safe + */ +#define SPARE_SPACE 500 + +/* prototypes from ioctl.c */ +long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); +long reiserfs_compat_ioctl(struct file *filp, + unsigned int cmd, unsigned long arg); +int reiserfs_unpack(struct inode *inode, struct file *filp); diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c new file mode 100644 index 000000000..6052d323b --- /dev/null +++ b/fs/reiserfs/resize.c @@ -0,0 +1,229 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +/* + * Written by Alexander Zarochentcev. + * + * The kernel part of the (on-line) reiserfs resizer. 
+ */ + +#include +#include +#include +#include +#include +#include "reiserfs.h" +#include + +/* + * reiserfs_resize - grow an on-line filesystem to block_count_new blocks + * @s: superblock of the filesystem to resize + * @block_count_new: requested total block count; must be larger than the + * current SB_BLOCK_COUNT(s) + * + * When more bitmap blocks are needed, the journal list bitmaps and the + * in-memory bitmap info array are reallocated and the new on-disk bitmap + * blocks are initialized. The old and the new last bitmap block and the + * super block are then updated inside one journal transaction. + * + * Returns 0 on success or a negative error: -EINVAL if the new size is not + * larger than the current one or the new last block cannot be read, + * -ENOTSUPP for the old disk layout, -ENOMEM, -EIO, or the error returned + * by journal_begin()/journal_end(). + */ +int reiserfs_resize(struct super_block *s, unsigned long block_count_new) +{ + int err = 0; + struct reiserfs_super_block *sb; + struct reiserfs_bitmap_info *bitmap; + struct reiserfs_bitmap_info *info; + struct reiserfs_bitmap_info *old_bitmap = SB_AP_BITMAP(s); + struct buffer_head *bh; + struct reiserfs_transaction_handle th; + unsigned int bmap_nr_new, bmap_nr; + unsigned int block_r_new, block_r; + + struct reiserfs_list_bitmap *jb; + struct reiserfs_list_bitmap jbitmap[JOURNAL_NUM_BITMAPS]; + + unsigned long int block_count, free_blocks; + int i; + int copy_size; + int depth; + + sb = SB_DISK_SUPER_BLOCK(s); + + if (SB_BLOCK_COUNT(s) >= block_count_new) { + printk("can\'t shrink filesystem on-line\n"); + return -EINVAL; + } + + /* check the device size by reading the would-be last block */ + depth = reiserfs_write_unlock_nested(s); + bh = sb_bread(s, block_count_new - 1); + reiserfs_write_lock_nested(s, depth); + if (!bh) { + printk("reiserfs_resize: can\'t read last block\n"); + return -EINVAL; + } + bforget(bh); + + /* + * old disk layout detection; those partitions can be mounted, but + * cannot be resized + */ + if (SB_BUFFER_WITH_SB(s)->b_blocknr * SB_BUFFER_WITH_SB(s)->b_size + != REISERFS_DISK_OFFSET_IN_BYTES) { + printk + ("reiserfs_resize: unable to resize a reiserfs without distributed bitmap (fs version < 3.5.12)\n"); + /* + * NOTE(review): ENOTSUPP is not a userspace errno; if this + * value can reach userspace EOPNOTSUPP may be intended - confirm. + */ + return -ENOTSUPP; + } + + /* count used bits in last bitmap block */ + block_r = SB_BLOCK_COUNT(s) - + (reiserfs_bmap_count(s) - 1) * s->s_blocksize * 8; + + /* + * count bitmap blocks in new fs; block_r_new is the number of bits + * used in the new last bitmap block + */ + bmap_nr_new = block_count_new / (s->s_blocksize * 8); + block_r_new = block_count_new - bmap_nr_new * s->s_blocksize * 8; + if (block_r_new) + bmap_nr_new++; + else + block_r_new = s->s_blocksize * 8; + + /* save old values */ + block_count = SB_BLOCK_COUNT(s); + bmap_nr = reiserfs_bmap_count(s); + + /* resizing of reiserfs bitmaps (journal and real), if needed */ + if (bmap_nr_new > 
bmap_nr) { + /* reallocate journal bitmaps */ + if (reiserfs_allocate_list_bitmaps(s, jbitmap, bmap_nr_new) < 0) { + printk + ("reiserfs_resize: unable to allocate memory for journal bitmaps\n"); + return -ENOMEM; + } + /* + * the new journal bitmaps are zero filled, now we copy in + * the bitmap node pointers from the old journal bitmap + * structs, and then transfer the new data structures + * into the journal struct. + * + * using the copy_size var below allows this code to work for + * both shrinking and expanding the FS (shrinking is rejected + * at the top of this function, so only the expanding case + * reaches this point). + * + * NOTE(review): sizeof() names struct reiserfs_list_bitmap_node + * while node_tmp below is struct reiserfs_bitmap_node **; only + * the pointer size is used here. + */ + copy_size = bmap_nr_new < bmap_nr ? bmap_nr_new : bmap_nr; + copy_size = + copy_size * sizeof(struct reiserfs_list_bitmap_node *); + for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { + struct reiserfs_bitmap_node **node_tmp; + jb = SB_JOURNAL(s)->j_list_bitmap + i; + memcpy(jbitmap[i].bitmaps, jb->bitmaps, copy_size); + + /* + * just in case vfree schedules on us, copy the new + * pointer into the journal struct before freeing the + * old one + */ + node_tmp = jb->bitmaps; + jb->bitmaps = jbitmap[i].bitmaps; + vfree(node_tmp); + } + + /* + * allocate additional bitmap blocks, reallocate + * array of bitmap block pointers + */ + bitmap = + vzalloc(sizeof(struct reiserfs_bitmap_info) * bmap_nr_new); + if (!bitmap) { + /* + * Journal bitmaps are still supersized, but the + * memory isn't leaked, so I guess it's ok + */ + printk("reiserfs_resize: unable to allocate memory.\n"); + return -ENOMEM; + } + for (i = 0; i < bmap_nr; i++) + bitmap[i] = old_bitmap[i]; + + /* + * This doesn't go through the journal, but it doesn't have to. + * The changes are still atomic: We're synced up when the + * journal transaction begins, and the new bitmaps don't + * matter if the transaction fails. 
+ */ + for (i = bmap_nr; i < bmap_nr_new; i++) { + int depth; /* shadows the function-level depth */ + /* + * don't use read_bitmap_block since it will cache + * the uninitialized bitmap + */ + depth = reiserfs_write_unlock_nested(s); + bh = sb_bread(s, i * s->s_blocksize * 8); + reiserfs_write_lock_nested(s, depth); + if (!bh) { + /* + * NOTE(review): the journal bitmaps were already + * swapped above and stay enlarged on this path, + * as in the vzalloc() failure case. + */ + vfree(bitmap); + return -EIO; + } + /* new bitmap block: everything free except bit 0 */ + memset(bh->b_data, 0, sb_blocksize(sb)); + reiserfs_set_le_bit(0, bh->b_data); + reiserfs_cache_bitmap_metadata(s, bh, bitmap + i); + + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + depth = reiserfs_write_unlock_nested(s); + sync_dirty_buffer(bh); + reiserfs_write_lock_nested(s, depth); + /* update bitmap_info stuff */ + bitmap[i].free_count = sb_blocksize(sb) * 8 - 1; + brelse(bh); + } + /* install the new bitmap info array, then free the old one */ + SB_AP_BITMAP(s) = bitmap; + vfree(old_bitmap); + } + + /* + * begin transaction, if there was an error, it's fine. Yes, we have + * incorrect bitmaps now, but none of it is ever going to touch the + * disk anyway. + */ + err = journal_begin(&th, s, 10); + if (err) + return err; + + /* Extend old last bitmap block - new blocks have been made available */ + info = SB_AP_BITMAP(s) + bmap_nr - 1; + bh = reiserfs_read_bitmap_block(s, bmap_nr - 1); + if (!bh) { + int jerr = journal_end(&th); + if (jerr) + return jerr; + return -EIO; + } + + reiserfs_prepare_for_journal(s, bh, 1); + for (i = block_r; i < s->s_blocksize * 8; i++) + reiserfs_clear_le_bit(i, bh->b_data); + info->free_count += s->s_blocksize * 8 - block_r; + + journal_mark_dirty(&th, bh); + brelse(bh); + + /* Correct new last bitmap block - It may not be full */ + info = SB_AP_BITMAP(s) + bmap_nr_new - 1; + bh = reiserfs_read_bitmap_block(s, bmap_nr_new - 1); + if (!bh) { + int jerr = journal_end(&th); + if (jerr) + return jerr; + return -EIO; + } + + reiserfs_prepare_for_journal(s, bh, 1); + for (i = block_r_new; i < s->s_blocksize * 8; i++) + reiserfs_set_le_bit(i, bh->b_data); + journal_mark_dirty(&th, bh); + brelse(bh); + + info->free_count -= 
s->s_blocksize * 8 - block_r_new; + /* update super */ + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); + free_blocks = SB_FREE_BLOCKS(s); + /* the added bitmap blocks themselves are not counted as free */ + PUT_SB_FREE_BLOCKS(s, + free_blocks + (block_count_new - block_count - + (bmap_nr_new - bmap_nr))); + PUT_SB_BLOCK_COUNT(s, block_count_new); + /* + * NOTE(review): GNU "a ? : b" yields a itself when a is non-zero, so + * if bmap_would_wrap() is true its result (not bmap_nr_new) is what + * gets stored - confirm this is the intended on-disk value. + */ + PUT_SB_BMAP_NR(s, bmap_would_wrap(bmap_nr_new) ? : bmap_nr_new); + + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s)); + + SB_JOURNAL(s)->j_must_wait = 1; + return journal_end(&th); +} diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c new file mode 100644 index 000000000..24cbe0132 --- /dev/null +++ b/fs/reiserfs/stree.c @@ -0,0 +1,2262 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +/* + * Written by Anatoly P. Pinchuk pap@namesys.botik.ru + * Programm System Institute + * Pereslavl-Zalessky Russia + */ + +#include +#include +#include +#include "reiserfs.h" +#include +#include + +/* + * Does the buffer contain a disk block which is in the tree. + * Returns non-zero unless the block's level is FREE_LEVEL. + */ +inline int B_IS_IN_TREE(const struct buffer_head *bh) +{ + + RFALSE(B_LEVEL(bh) > MAX_HEIGHT, + "PAP-1010: block (%b) has too big level (%z)", bh, bh); + + return (B_LEVEL(bh) != FREE_LEVEL); +} + +/* to get item head in le form: plain copy of IH_SIZE bytes, no conversion */ +inline void copy_item_head(struct item_head *to, + const struct item_head *from) +{ + memcpy(to, from, IH_SIZE); +} + +/* + * k1 is pointer to on-disk structure which is stored in little-endian + * form. k2 is pointer to cpu variable. For key of items of the same + * object this returns 0. 
+ * Returns: -1 if k1 < k2 + * 0 if k1 == k2 + * 1 if k1 > k2 + * (k1 is the le_key argument, k2 is the cpu_key argument; only k_dir_id + * and k_objectid are compared) + */ +inline int comp_short_keys(const struct reiserfs_key *le_key, + const struct cpu_key *cpu_key) +{ + __u32 n; + n = le32_to_cpu(le_key->k_dir_id); + if (n < cpu_key->on_disk_key.k_dir_id) + return -1; + if (n > cpu_key->on_disk_key.k_dir_id) + return 1; + n = le32_to_cpu(le_key->k_objectid); + if (n < cpu_key->on_disk_key.k_objectid) + return -1; + if (n > cpu_key->on_disk_key.k_objectid) + return 1; + return 0; +} + +/* + * le_key is pointer to on-disk structure which is stored in little-endian + * form. cpu_key is pointer to cpu variable. + * Compares the short key first, then the offset and, unless + * cpu_key->key_length is 3, the type as well. + * Returns: -1 if le_key < cpu_key + * 0 if le_key == cpu_key + * 1 if le_key > cpu_key + */ +static inline int comp_keys(const struct reiserfs_key *le_key, + const struct cpu_key *cpu_key) +{ + int retval; + + retval = comp_short_keys(le_key, cpu_key); + if (retval) + return retval; + if (le_key_k_offset(le_key_version(le_key), le_key) < + cpu_key_k_offset(cpu_key)) + return -1; + if (le_key_k_offset(le_key_version(le_key), le_key) > + cpu_key_k_offset(cpu_key)) + return 1; + + /* a 3-component search key ignores the type */ + if (cpu_key->key_length == 3) + return 0; + + /* this part is needed only when tail conversion is in progress */ + if (le_key_k_type(le_key_version(le_key), le_key) < + cpu_key_k_type(cpu_key)) + return -1; + + if (le_key_k_type(le_key_version(le_key), le_key) > + cpu_key_k_type(cpu_key)) + return 1; + + return 0; +} + +/* + * Compare the first REISERFS_SHORT_KEY_LEN 32-bit words of two on-disk + * (little-endian) keys. Returns -1, 0 or 1. + */ +inline int comp_short_le_keys(const struct reiserfs_key *key1, + const struct reiserfs_key *key2) +{ + __u32 *k1_u32, *k2_u32; + int key_length = REISERFS_SHORT_KEY_LEN; + + k1_u32 = (__u32 *) key1; + k2_u32 = (__u32 *) key2; + for (; key_length--; ++k1_u32, ++k2_u32) { + if (le32_to_cpu(*k1_u32) < le32_to_cpu(*k2_u32)) + return -1; + if (le32_to_cpu(*k1_u32) > le32_to_cpu(*k2_u32)) + return 1; + } + return 0; +} + +/* Fill a cpu_key (ids, version, offset, type) from an on-disk key. */ +inline void le_key2cpu_key(struct cpu_key *to, const struct reiserfs_key *from) +{ + int version; + to->on_disk_key.k_dir_id = 
le32_to_cpu(from->k_dir_id); + to->on_disk_key.k_objectid = le32_to_cpu(from->k_objectid); + + /* find out version of the key */ + version = le_key_version(from); + to->version = version; + to->on_disk_key.k_offset = le_key_k_offset(version, from); + to->on_disk_key.k_type = le_key_k_type(version, from); +} + +/* + * this does not say which one is bigger; it returns the memcmp() result, + * i.e. non-zero if the keys are not equal, 0 otherwise + */ +inline int comp_le_keys(const struct reiserfs_key *k1, + const struct reiserfs_key *k2) +{ + return memcmp(k1, k2, sizeof(struct reiserfs_key)); +} + +/************************************************************************** + * Binary search toolkit function * + * Search for an item in the array by the item key * + * Returns: 1 if found, 0 if not found; * + * *pos = number of the searched element if found, else the * + * number of the first element that is larger than key. * + **************************************************************************/ +/* + * For those not familiar with binary search: lbound is the leftmost item + * that it could be, rbound the rightmost item that it could be. We examine + * the item halfway between lbound and rbound, and that tells us either + * that we can increase lbound, or decrease rbound, or that we have found it, + * or if lbound > rbound that there are no possible items, and we have not + * found it. With each examination we cut the number of possible items it + * could be by one more than half rounded down, or we find it. + * + * The return value in the code is ITEM_FOUND or ITEM_NOT_FOUND. Elements + * are compared with comp_keys(), so each element must start with an + * on-disk key and key must point to a struct cpu_key. + */ +static inline int bin_search(const void *key, /* Key to search for. */ + const void *base, /* First item in the array. */ + int num, /* Number of items in the array. */ + /* + * Size of one array element. Lest the + * reader be confused, note that this is crafted + * as a general function, and when it is applied + * specifically to the array of item headers in a + * node, width is actually the item header size + * not the item size. 
+ */ + int width, + int *pos /* Number of the searched for element. */ + ) +{ + int rbound, lbound, j; + + for (j = ((rbound = num - 1) + (lbound = 0)) / 2; + lbound <= rbound; j = (rbound + lbound) / 2) + switch (comp_keys + ((struct reiserfs_key *)((char *)base + j * width), + (struct cpu_key *)key)) { + case -1: + lbound = j + 1; + continue; + case 1: + rbound = j - 1; + continue; + case 0: + *pos = j; + return ITEM_FOUND; /* Key found in the array. */ + } + + /* + * bin_search did not find the given key; it returns the position of + * the minimal key that is greater than the given one. + */ + *pos = lbound; + return ITEM_NOT_FOUND; +} + + +/* Minimal possible key. It is never in the tree. */ +const struct reiserfs_key MIN_KEY = { 0, 0, {{0, 0},} }; + +/* Maximal possible key. It is never in the tree. */ +static const struct reiserfs_key MAX_KEY = { + cpu_to_le32(0xffffffff), + cpu_to_le32(0xffffffff), + {{cpu_to_le32(0xffffffff), + cpu_to_le32(0xffffffff)},} +}; + +/* + * Get left delimiting key of the buffer by looking for it in the buffers in + * the path, starting from the bottom of the path, and going upwards. We must + * check the path's validity at each step. If the key is not in the path, + * there is no delimiting key in the tree (buffer is first or last buffer + * in tree), and in this case we return a special key, either MIN_KEY or + * MAX_KEY. + * + * Any inconsistency found while walking the path makes this return + * &MAX_KEY; &MIN_KEY is returned only when the walk reaches the root block. + */ +static inline const struct reiserfs_key *get_lkey(const struct treepath *chk_path, + const struct super_block *sb) +{ + int position, path_offset = chk_path->path_length; + struct buffer_head *parent; + + RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET, + "PAP-5010: invalid offset in the path"); + + /* While not higher in path than first element. */ + while (path_offset-- > FIRST_PATH_ELEMENT_OFFSET) { + + RFALSE(!buffer_uptodate + (PATH_OFFSET_PBUFFER(chk_path, path_offset)), + "PAP-5020: parent is not uptodate"); + + /* Parent at the path is not in the tree now. 
*/ + if (!B_IS_IN_TREE + (parent = + PATH_OFFSET_PBUFFER(chk_path, path_offset))) + return &MAX_KEY; + /* Check whether position in the parent is correct. */ + if ((position = + PATH_OFFSET_POSITION(chk_path, + path_offset)) > + B_NR_ITEMS(parent)) + return &MAX_KEY; + /* Check whether parent at the path really points to the child. */ + if (B_N_CHILD_NUM(parent, position) != + PATH_OFFSET_PBUFFER(chk_path, + path_offset + 1)->b_blocknr) + return &MAX_KEY; + /* + * Return delimiting key if position in the parent + * is not equal to zero. + */ + if (position) + return internal_key(parent, position - 1); + } + /* Return MIN_KEY if we are in the root of the buffer tree. */ + if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)-> + b_blocknr == SB_ROOT_BLOCK(sb)) + return &MIN_KEY; + return &MAX_KEY; +} + +/* + * Get delimiting key of the buffer at the path and its right neighbor. + * Any inconsistency found while walking the path makes this return + * &MIN_KEY; &MAX_KEY is returned only when the walk reaches the root block. + */ +inline const struct reiserfs_key *get_rkey(const struct treepath *chk_path, + const struct super_block *sb) +{ + int position, path_offset = chk_path->path_length; + struct buffer_head *parent; + + RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET, + "PAP-5030: invalid offset in the path"); + + while (path_offset-- > FIRST_PATH_ELEMENT_OFFSET) { + + RFALSE(!buffer_uptodate + (PATH_OFFSET_PBUFFER(chk_path, path_offset)), + "PAP-5040: parent is not uptodate"); + + /* Parent at the path is not in the tree now. */ + if (!B_IS_IN_TREE + (parent = + PATH_OFFSET_PBUFFER(chk_path, path_offset))) + return &MIN_KEY; + /* Check whether position in the parent is correct. */ + if ((position = + PATH_OFFSET_POSITION(chk_path, + path_offset)) > + B_NR_ITEMS(parent)) + return &MIN_KEY; + /* + * Check whether parent at the path really points + * to the child. + */ + if (B_N_CHILD_NUM(parent, position) != + PATH_OFFSET_PBUFFER(chk_path, + path_offset + 1)->b_blocknr) + return &MIN_KEY; + + /* + * Return delimiting key if position in the parent + * is not the last one. 
+ */ + if (position != B_NR_ITEMS(parent)) + return internal_key(parent, position); + } + + /* Return MAX_KEY if we are in the root of the buffer tree. */ + if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)-> + b_blocknr == SB_ROOT_BLOCK(sb)) + return &MAX_KEY; + return &MIN_KEY; +} + +/* + * Check whether a key is contained in the tree rooted from a buffer at a path. + * This works by looking at the left and right delimiting keys for the buffer + * in the last path_element in the path. These delimiting keys are stored + * at least one level above that buffer in the tree. If the buffer is the + * first or last node in the tree order then one of the delimiting keys may + * be absent, and in this case get_lkey and get_rkey return a special key + * which is MIN_KEY or MAX_KEY. + * + * Returns 1 if left delimiting key <= key < right delimiting key, + * 0 otherwise. + */ +static inline int key_in_buffer( + /* Path which should be checked. */ + struct treepath *chk_path, + /* Key which should be checked. */ + const struct cpu_key *key, + struct super_block *sb + ) +{ + + RFALSE(!key || chk_path->path_length < FIRST_PATH_ELEMENT_OFFSET + || chk_path->path_length > MAX_HEIGHT, + "PAP-5050: pointer to the key(%p) is NULL or invalid path length(%d)", + key, chk_path->path_length); + RFALSE(!PATH_PLAST_BUFFER(chk_path)->b_bdev, + "PAP-5060: device must not be NODEV"); + + if (comp_keys(get_lkey(chk_path, sb), key) == 1) + /* left delimiting key is bigger than the key we look for */ + return 0; + /* if ( comp_keys(key, get_rkey(chk_path, sb)) != -1 ) */ + if (comp_keys(get_rkey(chk_path, sb), key) != 1) + /* key must be less than right delimiting key */ + return 0; + return 1; +} + +/* + * Sanity check that the path has been released, i.e. that path_length is + * ILLEGAL_PATH_ELEMENT_OFFSET. Always returns 0. + */ +int reiserfs_check_path(struct treepath *p) +{ + RFALSE(p->path_length != ILLEGAL_PATH_ELEMENT_OFFSET, + "path not properly relsed"); + return 0; +} + +/* + * Drop the reference to each buffer in a path and restore + * dirty bits clean when preparing the buffer for the log. 
+ * This version should only be called from fix_nodes() + */ +void pathrelse_and_restore(struct super_block *sb, + struct treepath *search_path) +{ + int path_offset = search_path->path_length; + + RFALSE(path_offset < ILLEGAL_PATH_ELEMENT_OFFSET, + "clm-4000: invalid path offset"); + + /* walk from the last path element back towards the first */ + while (path_offset > ILLEGAL_PATH_ELEMENT_OFFSET) { + struct buffer_head *bh; + bh = PATH_OFFSET_PBUFFER(search_path, path_offset--); + reiserfs_restore_prepared_buffer(sb, bh); + brelse(bh); + } + search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET; +} + +/* + * Drop the reference to each buffer in a path and mark the path empty + * (path_length = ILLEGAL_PATH_ELEMENT_OFFSET) + */ +void pathrelse(struct treepath *search_path) +{ + int path_offset = search_path->path_length; + + RFALSE(path_offset < ILLEGAL_PATH_ELEMENT_OFFSET, + "PAP-5090: invalid path offset"); + + while (path_offset > ILLEGAL_PATH_ELEMENT_OFFSET) + brelse(PATH_OFFSET_PBUFFER(search_path, path_offset--)); + + search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET; +} + +/* + * returns 1 if buf looks like a leaf node, 0 otherwise (a warning is + * logged for every failed check; bh is only used in those warnings) + */ +static int is_leaf(char *buf, int blocksize, struct buffer_head *bh) +{ + struct block_head *blkh; + struct item_head *ih; + int used_space; + int prev_location; + int i; + int nr; + + blkh = (struct block_head *)buf; + if (blkh_level(blkh) != DISK_LEAF_NODE_LEVEL) { + reiserfs_warning(NULL, "reiserfs-5080", + "this should be caught earlier"); + return 0; + } + + nr = blkh_nr_item(blkh); + if (nr < 1 || nr > ((blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN))) { + /* item number is too big or too small */ + reiserfs_warning(NULL, "reiserfs-5081", + "nr_item seems wrong: %z", bh); + return 0; + } + /* the last item head gives the lowest item location in the block */ + ih = (struct item_head *)(buf + BLKH_SIZE) + nr - 1; + used_space = BLKH_SIZE + IH_SIZE * nr + (blocksize - ih_location(ih)); + + /* free space does not match the calculated amount of used space */ + if (used_space != blocksize - blkh_free_space(blkh)) { + reiserfs_warning(NULL, "reiserfs-5082", + "free space seems wrong: %z", bh); + return 0; + } + /* + * FIXME: if is_leaf hurts performance too much, we may have to + * return 1 
here + */ + + /* + * check tables of item heads: item bodies must be contiguous, each + * one ending exactly where the previous one starts, with the first + * item ending at the end of the block + */ + ih = (struct item_head *)(buf + BLKH_SIZE); + prev_location = blocksize; + for (i = 0; i < nr; i++, ih++) { + if (le_ih_k_type(ih) == TYPE_ANY) { + reiserfs_warning(NULL, "reiserfs-5083", + "wrong item type for item %h", + ih); + return 0; + } + if (ih_location(ih) >= blocksize + || ih_location(ih) < IH_SIZE * nr) { + reiserfs_warning(NULL, "reiserfs-5084", + "item location seems wrong: %h", + ih); + return 0; + } + if (ih_item_len(ih) < 1 + || ih_item_len(ih) > MAX_ITEM_LEN(blocksize)) { + reiserfs_warning(NULL, "reiserfs-5085", + "item length seems wrong: %h", + ih); + return 0; + } + if (prev_location - ih_location(ih) != ih_item_len(ih)) { + reiserfs_warning(NULL, "reiserfs-5086", + "item location seems wrong " + "(second one): %h", ih); + return 0; + } + prev_location = ih_location(ih); + } + + /* one may imagine many more checks */ + return 1; +} + +/* returns 1 if buf looks like an internal node, 0 otherwise */ +static int is_internal(char *buf, int blocksize, struct buffer_head *bh) +{ + struct block_head *blkh; + int nr; + int used_space; + + blkh = (struct block_head *)buf; + /* nr first holds the node level, then the number of keys */ + nr = blkh_level(blkh); + if (nr <= DISK_LEAF_NODE_LEVEL || nr > MAX_HEIGHT) { + /* this level is not possible for internal nodes */ + reiserfs_warning(NULL, "reiserfs-5087", + "this should be caught earlier"); + return 0; + } + + nr = blkh_nr_item(blkh); + /* for internal which is not root we might check min number of keys */ + if (nr > (blocksize - BLKH_SIZE - DC_SIZE) / (KEY_SIZE + DC_SIZE)) { + reiserfs_warning(NULL, "reiserfs-5088", + "number of key seems wrong: %z", bh); + return 0; + } + + /* nr keys are accompanied by nr + 1 child pointers */ + used_space = BLKH_SIZE + KEY_SIZE * nr + DC_SIZE * (nr + 1); + if (used_space != blocksize - blkh_free_space(blkh)) { + reiserfs_warning(NULL, "reiserfs-5089", + "free space seems wrong: %z", bh); + return 0; + } + + /* one may imagine many more checks */ + return 1; +} + +/* + * make sure that bh contains formatted node of reiserfs tree of + * 
'level'-th level + */ +static int is_tree_node(struct buffer_head *bh, int level) +{ + if (B_LEVEL(bh) != level) { + reiserfs_warning(NULL, "reiserfs-5090", "node level %d does " + "not match to the expected one %d", + B_LEVEL(bh), level); + return 0; + } + /* dispatch to the leaf or internal node sanity check */ + if (level == DISK_LEAF_NODE_LEVEL) + return is_leaf(bh->b_data, bh->b_size, bh); + + return is_internal(bh->b_data, bh->b_size, bh); +} + +/* size of the read-ahead arrays used by search_by_key */ +#define SEARCH_BY_KEY_READA 16 + +/* + * Issue read-ahead for the num blocks listed in b[], storing the buffer + * heads in bh[]. + * + * The function is NOT SCHEDULE-SAFE! + * It might unlock the write lock if we needed to wait for a block + * to be read. Note that in this case it won't recover the lock to avoid + * high contention resulting from too many lock requests, especially + * the caller (search_by_key) will perform other schedule-unsafe + * operations just after calling this function. + * + * @return depth of lock to be restored after read completes, or -1 if + * every buffer was already up to date and the lock was never dropped + */ +static int search_by_key_reada(struct super_block *s, + struct buffer_head **bh, + b_blocknr_t *b, int num) +{ + int i, j; + int depth = -1; + + /* + * NOTE(review): the sb_getblk() result is used below without a NULL + * check, while search_by_key does check it - confirm it cannot + * fail here. + */ + for (i = 0; i < num; i++) { + bh[i] = sb_getblk(s, b[i]); + } + /* + * We are going to read some blocks on which we + * have a reference. It's safe, though we might be + * reading blocks concurrently changed if we release + * the lock. But it's still fine because we check later + * if the tree changed + */ + for (j = 0; j < i; j++) { + /* + * note, this needs attention if we are getting rid of the BKL + * you have to make sure the prepared bit isn't set on this + * buffer + */ + if (!buffer_uptodate(bh[j])) { + /* drop the write lock once, before the first read */ + if (depth == -1) + depth = reiserfs_write_unlock_nested(s); + ll_rw_block(READA, 1, bh + j); + } + brelse(bh[j]); + } + return depth; +} + +/* + * This function fills up the path from the root to the leaf as it + * descends the tree looking for the key. It uses reiserfs_bread to + * try to find buffers in the cache given their block number. If it + * does not find them in the cache it reads them from disk. 
For each + * node search_by_key finds using reiserfs_bread it then uses + * bin_search to look through that node. bin_search will find the + * position of the block_number of the next node if it is looking + * through an internal node. If it is looking through a leaf node + * bin_search will find the position of the item which has key either + * equal to given key, or which is the maximal key less than the given + * key. search_by_key returns a path that must be checked for the + * correctness of the top of the path but need not be checked for the + * correctness of the bottom of the path + */ +/* + * search_by_key - search for key (and item) in stree + * @sb: superblock + * @key: pointer to key to search for + * @search_path: Allocated and initialized struct treepath; Returned filled + * on success. + * @stop_level: How far down the tree to search, Use DISK_LEAF_NODE_LEVEL to + * stop at leaf level. + * + * The function is NOT SCHEDULE-SAFE! + */ +int search_by_key(struct super_block *sb, const struct cpu_key *key, + struct treepath *search_path, int stop_level) +{ + b_blocknr_t block_number; + int expected_level; + struct buffer_head *bh; + struct path_element *last_element; + int node_level, retval; + int right_neighbor_of_leaf_node; + int fs_gen; + struct buffer_head *reada_bh[SEARCH_BY_KEY_READA]; + b_blocknr_t reada_blocks[SEARCH_BY_KEY_READA]; + int reada_count = 0; + +#ifdef CONFIG_REISERFS_CHECK + int repeat_counter = 0; +#endif + + PROC_INFO_INC(sb, search_by_key); + + /* + * As we add each node to a path we increase its count. This means + * that we must be careful to release all nodes in a path before we + * either discard the path struct or re-use the path struct, as we + * do here. + */ + + pathrelse(search_path); + + right_neighbor_of_leaf_node = 0; + + /* + * With each iteration of this loop we search through the items in the + * current node, and calculate the next current node(next path element) + * for the next iteration of this loop.. 
+ */ + block_number = SB_ROOT_BLOCK(sb); + expected_level = -1; + while (1) { + +#ifdef CONFIG_REISERFS_CHECK + if (!(++repeat_counter % 50000)) + reiserfs_warning(sb, "PAP-5100", + "%s: there were %d iterations of " + "while loop looking for key %K", + current->comm, repeat_counter, + key); +#endif + + /* prep path to have another element added to it. */ + last_element = + PATH_OFFSET_PELEMENT(search_path, + ++search_path->path_length); + fs_gen = get_generation(sb); + + /* + * Read the next tree node, and set the last element + * in the path to have a pointer to it. + */ + if ((bh = last_element->pe_buffer = + sb_getblk(sb, block_number))) { + + /* + * We'll need to drop the lock if we encounter any + * buffers that need to be read. If all of them are + * already up to date, we don't need to drop the lock. + */ + int depth = -1; + + if (!buffer_uptodate(bh) && reada_count > 1) + depth = search_by_key_reada(sb, reada_bh, + reada_blocks, reada_count); + + if (!buffer_uptodate(bh) && depth == -1) + depth = reiserfs_write_unlock_nested(sb); + + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + + if (depth != -1) + reiserfs_write_lock_nested(sb, depth); + if (!buffer_uptodate(bh)) + goto io_error; + } else { +io_error: + search_path->path_length--; + pathrelse(search_path); + return IO_ERROR; + } + reada_count = 0; + if (expected_level == -1) + expected_level = SB_TREE_HEIGHT(sb); + expected_level--; + + /* + * It is possible that schedule occurred. We must check + * whether the key to search is still in the tree rooted + * from the current buffer. If not then repeat search + * from the root. 
+ */ + if (fs_changed(fs_gen, sb) && + (!B_IS_IN_TREE(bh) || + B_LEVEL(bh) != expected_level || + !key_in_buffer(search_path, key, sb))) { + PROC_INFO_INC(sb, search_by_key_fs_changed); + PROC_INFO_INC(sb, search_by_key_restarted); + PROC_INFO_INC(sb, + sbk_restarted[expected_level - 1]); + pathrelse(search_path); + + /* + * Get the root block number so that we can + * repeat the search starting from the root. + */ + block_number = SB_ROOT_BLOCK(sb); + expected_level = -1; + right_neighbor_of_leaf_node = 0; + + /* repeat search from the root */ + continue; + } + + /* + * only check that the key is in the buffer if key is not + * equal to the MAX_KEY. Latter case is only possible in + * "finish_unfinished()" processing during mount. + */ + RFALSE(comp_keys(&MAX_KEY, key) && + !key_in_buffer(search_path, key, sb), + "PAP-5130: key is not in the buffer"); +#ifdef CONFIG_REISERFS_CHECK + if (REISERFS_SB(sb)->cur_tb) { + print_cur_tb("5140"); + reiserfs_panic(sb, "PAP-5140", + "schedule occurred in do_balance!"); + } +#endif + + /* + * make sure, that the node contents look like a node of + * certain level + */ + if (!is_tree_node(bh, expected_level)) { + reiserfs_error(sb, "vs-5150", + "invalid format found in block %ld. " + "Fsck?", bh->b_blocknr); + pathrelse(search_path); + return IO_ERROR; + } + + /* ok, we have acquired next formatted node in the tree */ + node_level = B_LEVEL(bh); + + PROC_INFO_BH_STAT(sb, bh, node_level - 1); + + RFALSE(node_level < stop_level, + "vs-5152: tree level (%d) is less than stop level (%d)", + node_level, stop_level); + + retval = bin_search(key, item_head(bh, 0), + B_NR_ITEMS(bh), + (node_level == + DISK_LEAF_NODE_LEVEL) ? 
IH_SIZE : + KEY_SIZE, + &last_element->pe_position); + if (node_level == stop_level) { + return retval; + } + + /* we are not in the stop level */ + /* + * item has been found, so we choose the pointer which + * is to the right of the found one + */ + if (retval == ITEM_FOUND) + last_element->pe_position++; + + /* + * if item was not found we choose the position which is to + * the left of the found item. This requires no code, + * bin_search did it already. + */ + + /* + * So we have chosen a position in the current node which is + * an internal node. Now we calculate child block number by + * position in the node. + */ + block_number = + B_N_CHILD_NUM(bh, last_element->pe_position); + + /* + * if we are going to read leaf nodes, try for read + * ahead as well + */ + if ((search_path->reada & PATH_READA) && + node_level == DISK_LEAF_NODE_LEVEL + 1) { + int pos = last_element->pe_position; + int limit = B_NR_ITEMS(bh); + struct reiserfs_key *le_key; + + if (search_path->reada & PATH_READA_BACK) + limit = 0; + while (reada_count < SEARCH_BY_KEY_READA) { + if (pos == limit) + break; + reada_blocks[reada_count++] = + B_N_CHILD_NUM(bh, pos); + if (search_path->reada & PATH_READA_BACK) + pos--; + else + pos++; + + /* + * check to make sure we're in the same object + */ + le_key = internal_key(bh, pos); + if (le32_to_cpu(le_key->k_objectid) != + key->on_disk_key.k_objectid) { + break; + } + } + } + } +} + +/* + * Form the path to an item and position in this item which contains + * file byte defined by key. If there is no such item + * corresponding to the key, we point the path to the item with + * maximal key less than key, and *pos_in_item is set to one + * past the last entry/byte in the item. If searching for entry in a + * directory item, and it is not found, *pos_in_item is set to one + * entry more than the entry with maximal key which is less than the + * sought key. 
+ * + * Note that if there is no entry in this same node which is one more, + * then we point to an imaginary entry. for direct items, the + * position is in units of bytes, for indirect items the position is + * in units of blocknr entries, for directory items the position is in + * units of directory entries. + */ +/* The function is NOT SCHEDULE-SAFE! */ +int search_for_position_by_key(struct super_block *sb, + /* Key to search (cpu variable) */ + const struct cpu_key *p_cpu_key, + /* Filled up by this function. */ + struct treepath *search_path) +{ + struct item_head *p_le_ih; /* pointer to on-disk structure */ + int blk_size; + loff_t item_offset, offset; + struct reiserfs_dir_entry de; + int retval; + + /* If searching for directory entry. */ + if (is_direntry_cpu_key(p_cpu_key)) + return search_by_entry_key(sb, p_cpu_key, search_path, + &de); + + /* If not searching for directory entry. */ + + /* If item is found. */ + retval = search_item(sb, p_cpu_key, search_path); + if (retval == IO_ERROR) + return retval; + if (retval == ITEM_FOUND) { + + RFALSE(!ih_item_len + (item_head + (PATH_PLAST_BUFFER(search_path), + PATH_LAST_POSITION(search_path))), + "PAP-5165: item length equals zero"); + + pos_in_item(search_path) = 0; + return POSITION_FOUND; + } + + RFALSE(!PATH_LAST_POSITION(search_path), + "PAP-5170: position equals zero"); + + /* Item is not found. Set path to the previous item. */ + p_le_ih = + item_head(PATH_PLAST_BUFFER(search_path), + --PATH_LAST_POSITION(search_path)); + blk_size = sb->s_blocksize; + + if (comp_short_keys(&p_le_ih->ih_key, p_cpu_key)) + return FILE_NOT_FOUND; + + /* FIXME: quite ugly this far */ + + item_offset = le_ih_k_offset(p_le_ih); + offset = cpu_key_k_offset(p_cpu_key); + + /* Needed byte is contained in the item pointed to by the path. 
*/ + if (item_offset <= offset && + item_offset + op_bytes_number(p_le_ih, blk_size) > offset) { + pos_in_item(search_path) = offset - item_offset; + if (is_indirect_le_ih(p_le_ih)) { + pos_in_item(search_path) /= blk_size; + } + return POSITION_FOUND; + } + + /* + * Needed byte is not contained in the item pointed to by the + * path. Set pos_in_item out of the item. + */ + if (is_indirect_le_ih(p_le_ih)) + pos_in_item(search_path) = + ih_item_len(p_le_ih) / UNFM_P_SIZE; + else + pos_in_item(search_path) = ih_item_len(p_le_ih); + + return POSITION_NOT_FOUND; +} + +/* Compare given item and item pointed to by the path. */ +int comp_items(const struct item_head *stored_ih, const struct treepath *path) +{ + struct buffer_head *bh = PATH_PLAST_BUFFER(path); + struct item_head *ih; + + /* Last buffer at the path is not in the tree. */ + if (!B_IS_IN_TREE(bh)) + return 1; + + /* Last path position is invalid. */ + if (PATH_LAST_POSITION(path) >= B_NR_ITEMS(bh)) + return 1; + + /* we need only to know, whether it is the same item */ + ih = tp_item_head(path); + return memcmp(stored_ih, ih, IH_SIZE); +} + +/* unformatted nodes are not logged anymore, ever. This is safe now */ +#define held_by_others(bh) (atomic_read(&(bh)->b_count) > 1) + +/* block can not be forgotten as it is in I/O or held by someone */ +#define block_in_use(bh) (buffer_locked(bh) || (held_by_others(bh))) + +/* prepare for delete or cut of direct item */ +static inline int prepare_for_direct_item(struct treepath *path, + struct item_head *le_ih, + struct inode *inode, + loff_t new_file_length, int *cut_size) +{ + loff_t round_len; + + if (new_file_length == max_reiserfs_offset(inode)) { + /* item has to be deleted */ + *cut_size = -(IH_SIZE + ih_item_len(le_ih)); + return M_DELETE; + } + /* new file gets truncated */ + if (get_inode_item_key_version(inode) == KEY_FORMAT_3_6) { + round_len = ROUND_UP(new_file_length); + /* this was new_file_length < le_ih ... 
*/ + if (round_len < le_ih_k_offset(le_ih)) { + *cut_size = -(IH_SIZE + ih_item_len(le_ih)); + return M_DELETE; /* Delete this item. */ + } + /* Calculate first position and size for cutting from item. */ + pos_in_item(path) = round_len - (le_ih_k_offset(le_ih) - 1); + *cut_size = -(ih_item_len(le_ih) - pos_in_item(path)); + + return M_CUT; /* Cut from this item. */ + } + + /* old file: items may have any length */ + + if (new_file_length < le_ih_k_offset(le_ih)) { + *cut_size = -(IH_SIZE + ih_item_len(le_ih)); + return M_DELETE; /* Delete this item. */ + } + + /* Calculate first position and size for cutting from item. */ + *cut_size = -(ih_item_len(le_ih) - + (pos_in_item(path) = + new_file_length + 1 - le_ih_k_offset(le_ih))); + return M_CUT; /* Cut from this item. */ +} + +static inline int prepare_for_direntry_item(struct treepath *path, + struct item_head *le_ih, + struct inode *inode, + loff_t new_file_length, + int *cut_size) +{ + if (le_ih_k_offset(le_ih) == DOT_OFFSET && + new_file_length == max_reiserfs_offset(inode)) { + RFALSE(ih_entry_count(le_ih) != 2, + "PAP-5220: incorrect empty directory item (%h)", le_ih); + *cut_size = -(IH_SIZE + ih_item_len(le_ih)); + /* Delete the directory item containing "." and ".." entry. */ + return M_DELETE; + } + + if (ih_entry_count(le_ih) == 1) { + /* + * Delete the directory item such as there is one record only + * in this item + */ + *cut_size = -(IH_SIZE + ih_item_len(le_ih)); + return M_DELETE; + } + + /* Cut one record from the directory item. */ + *cut_size = + -(DEH_SIZE + + entry_length(get_last_bh(path), le_ih, pos_in_item(path))); + return M_CUT; +} + +#define JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD (2 * JOURNAL_PER_BALANCE_CNT + 1) + +/* + * If the path points to a directory or direct item, calculate mode + * and the size cut, for balance. + * If the path points to an indirect item, remove some number of its + * unformatted nodes. 
+ * In case of file truncate calculate whether this item must be + * deleted/truncated or last unformatted node of this item will be + * converted to a direct item. + * This function returns a determination of what balance mode the + * calling function should employ. + */ +static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, + struct inode *inode, + struct treepath *path, + const struct cpu_key *item_key, + /* + * Number of unformatted nodes + * which were removed from end + * of the file. + */ + int *removed, + int *cut_size, + /* MAX_KEY_OFFSET in case of delete. */ + unsigned long long new_file_length + ) +{ + struct super_block *sb = inode->i_sb; + struct item_head *p_le_ih = tp_item_head(path); + struct buffer_head *bh = PATH_PLAST_BUFFER(path); + + BUG_ON(!th->t_trans_id); + + /* Stat_data item. */ + if (is_statdata_le_ih(p_le_ih)) { + + RFALSE(new_file_length != max_reiserfs_offset(inode), + "PAP-5210: mode must be M_DELETE"); + + *cut_size = -(IH_SIZE + ih_item_len(p_le_ih)); + return M_DELETE; + } + + /* Directory item. */ + if (is_direntry_le_ih(p_le_ih)) + return prepare_for_direntry_item(path, p_le_ih, inode, + new_file_length, + cut_size); + + /* Direct item. */ + if (is_direct_le_ih(p_le_ih)) + return prepare_for_direct_item(path, p_le_ih, inode, + new_file_length, cut_size); + + /* Case of an indirect item. 
*/ + { + int blk_size = sb->s_blocksize; + struct item_head s_ih; + int need_re_search; + int delete = 0; + int result = M_CUT; + int pos = 0; + + if ( new_file_length == max_reiserfs_offset (inode) ) { + /* + * prepare_for_delete_or_cut() is called by + * reiserfs_delete_item() + */ + new_file_length = 0; + delete = 1; + } + + do { + need_re_search = 0; + *cut_size = 0; + bh = PATH_PLAST_BUFFER(path); + copy_item_head(&s_ih, tp_item_head(path)); + pos = I_UNFM_NUM(&s_ih); + + while (le_ih_k_offset (&s_ih) + (pos - 1) * blk_size > new_file_length) { + __le32 *unfm; + __u32 block; + + /* + * Each unformatted block deletion may involve + * one additional bitmap block into the transaction, + * thereby the initial journal space reservation + * might not be enough. + */ + if (!delete && (*cut_size) != 0 && + reiserfs_transaction_free_space(th) < JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD) + break; + + unfm = (__le32 *)ih_item_body(bh, &s_ih) + pos - 1; + block = get_block_num(unfm, 0); + + if (block != 0) { + reiserfs_prepare_for_journal(sb, bh, 1); + put_block_num(unfm, 0, 0); + journal_mark_dirty(th, bh); + reiserfs_free_block(th, inode, block, 1); + } + + reiserfs_cond_resched(sb); + + if (item_moved (&s_ih, path)) { + need_re_search = 1; + break; + } + + pos --; + (*removed)++; + (*cut_size) -= UNFM_P_SIZE; + + if (pos == 0) { + (*cut_size) -= IH_SIZE; + result = M_DELETE; + break; + } + } + /* + * a trick. If the buffer has been logged, this will + * do nothing. If we've broken the loop without logging + * it, it will restore the buffer + */ + reiserfs_restore_prepared_buffer(sb, bh); + } while (need_re_search && + search_for_position_by_key(sb, item_key, path) == POSITION_FOUND); + pos_in_item(path) = pos * UNFM_P_SIZE; + + if (*cut_size == 0) { + /* + * Nothing was cut. maybe convert last unformatted node to the + * direct item? 
+ */ + result = M_CONVERT; + } + return result; + } +} + +/* Calculate number of bytes which will be deleted or cut during balance */ +static int calc_deleted_bytes_number(struct tree_balance *tb, char mode) +{ + int del_size; + struct item_head *p_le_ih = tp_item_head(tb->tb_path); + + if (is_statdata_le_ih(p_le_ih)) + return 0; + + del_size = + (mode == + M_DELETE) ? ih_item_len(p_le_ih) : -tb->insert_size[0]; + if (is_direntry_le_ih(p_le_ih)) { + /* + * return EMPTY_DIR_SIZE; We delete emty directories only. + * we can't use EMPTY_DIR_SIZE, as old format dirs have a + * different empty size. ick. FIXME, is this right? + */ + return del_size; + } + + if (is_indirect_le_ih(p_le_ih)) + del_size = (del_size / UNFM_P_SIZE) * + (PATH_PLAST_BUFFER(tb->tb_path)->b_size); + return del_size; +} + +static void init_tb_struct(struct reiserfs_transaction_handle *th, + struct tree_balance *tb, + struct super_block *sb, + struct treepath *path, int size) +{ + + BUG_ON(!th->t_trans_id); + + memset(tb, '\0', sizeof(struct tree_balance)); + tb->transaction_handle = th; + tb->tb_sb = sb; + tb->tb_path = path; + PATH_OFFSET_PBUFFER(path, ILLEGAL_PATH_ELEMENT_OFFSET) = NULL; + PATH_OFFSET_POSITION(path, ILLEGAL_PATH_ELEMENT_OFFSET) = 0; + tb->insert_size[0] = size; +} + +void padd_item(char *item, int total_length, int length) +{ + int i; + + for (i = total_length; i > length;) + item[--i] = 0; +} + +#ifdef REISERQUOTA_DEBUG +char key2type(struct reiserfs_key *ih) +{ + if (is_direntry_le_key(2, ih)) + return 'd'; + if (is_direct_le_key(2, ih)) + return 'D'; + if (is_indirect_le_key(2, ih)) + return 'i'; + if (is_statdata_le_key(2, ih)) + return 's'; + return 'u'; +} + +char head2type(struct item_head *ih) +{ + if (is_direntry_le_ih(ih)) + return 'd'; + if (is_direct_le_ih(ih)) + return 'D'; + if (is_indirect_le_ih(ih)) + return 'i'; + if (is_statdata_le_ih(ih)) + return 's'; + return 'u'; +} +#endif + +/* + * Delete object item. 
+ * th - active transaction handle + * path - path to the deleted item + * item_key - key to search for the deleted item + * indode - used for updating i_blocks and quotas + * un_bh - NULL or unformatted node pointer + */ +int reiserfs_delete_item(struct reiserfs_transaction_handle *th, + struct treepath *path, const struct cpu_key *item_key, + struct inode *inode, struct buffer_head *un_bh) +{ + struct super_block *sb = inode->i_sb; + struct tree_balance s_del_balance; + struct item_head s_ih; + struct item_head *q_ih; + int quota_cut_bytes; + int ret_value, del_size, removed; + int depth; + +#ifdef CONFIG_REISERFS_CHECK + char mode; + int iter = 0; +#endif + + BUG_ON(!th->t_trans_id); + + init_tb_struct(th, &s_del_balance, sb, path, + 0 /*size is unknown */ ); + + while (1) { + removed = 0; + +#ifdef CONFIG_REISERFS_CHECK + iter++; + mode = +#endif + prepare_for_delete_or_cut(th, inode, path, + item_key, &removed, + &del_size, + max_reiserfs_offset(inode)); + + RFALSE(mode != M_DELETE, "PAP-5320: mode must be M_DELETE"); + + copy_item_head(&s_ih, tp_item_head(path)); + s_del_balance.insert_size[0] = del_size; + + ret_value = fix_nodes(M_DELETE, &s_del_balance, NULL, NULL); + if (ret_value != REPEAT_SEARCH) + break; + + PROC_INFO_INC(sb, delete_item_restarted); + + /* file system changed, repeat search */ + ret_value = + search_for_position_by_key(sb, item_key, path); + if (ret_value == IO_ERROR) + break; + if (ret_value == FILE_NOT_FOUND) { + reiserfs_warning(sb, "vs-5340", + "no items of the file %K found", + item_key); + break; + } + } /* while (1) */ + + if (ret_value != CARRY_ON) { + unfix_nodes(&s_del_balance); + return 0; + } + + /* reiserfs_delete_item returns item length when success */ + ret_value = calc_deleted_bytes_number(&s_del_balance, M_DELETE); + q_ih = tp_item_head(path); + quota_cut_bytes = ih_item_len(q_ih); + + /* + * hack so the quota code doesn't have to guess if the file has a + * tail. 
On tail insert, we allocate quota for 1 unformatted node. + * We test the offset because the tail might have been + * split into multiple items, and we only want to decrement for + * the unfm node once + */ + if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(q_ih)) { + if ((le_ih_k_offset(q_ih) & (sb->s_blocksize - 1)) == 1) { + quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE; + } else { + quota_cut_bytes = 0; + } + } + + if (un_bh) { + int off; + char *data; + + /* + * We are in direct2indirect conversion, so move tail contents + * to the unformatted node + */ + /* + * note, we do the copy before preparing the buffer because we + * don't care about the contents of the unformatted node yet. + * the only thing we really care about is the direct item's + * data is in the unformatted node. + * + * Otherwise, we would have to call + * reiserfs_prepare_for_journal on the unformatted node, + * which might schedule, meaning we'd have to loop all the + * way back up to the start of the while loop. + * + * The unformatted node must be dirtied later on. We can't be + * sure here if the entire tail has been deleted yet. + * + * un_bh is from the page cache (all unformatted nodes are + * from the page cache) and might be a highmem page. So, we + * can't use un_bh->b_data. + * -clm + */ + + data = kmap_atomic(un_bh->b_page); + off = ((le_ih_k_offset(&s_ih) - 1) & (PAGE_CACHE_SIZE - 1)); + memcpy(data + off, + ih_item_body(PATH_PLAST_BUFFER(path), &s_ih), + ret_value); + kunmap_atomic(data); + } + + /* Perform balancing after all resources have been collected at once. 
*/ + do_balance(&s_del_balance, NULL, NULL, M_DELETE); + +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(sb, REISERFS_DEBUG_CODE, + "reiserquota delete_item(): freeing %u, id=%u type=%c", + quota_cut_bytes, inode->i_uid, head2type(&s_ih)); +#endif + depth = reiserfs_write_unlock_nested(inode->i_sb); + dquot_free_space_nodirty(inode, quota_cut_bytes); + reiserfs_write_lock_nested(inode->i_sb, depth); + + /* Return deleted body length */ + return ret_value; +} + +/* + * Summary Of Mechanisms For Handling Collisions Between Processes: + * + * deletion of the body of the object is performed by iput(), with the + * result that if multiple processes are operating on a file, the + * deletion of the body of the file is deferred until the last process + * that has an open inode performs its iput(). + * + * writes and truncates are protected from collisions by use of + * semaphores. + * + * creates, linking, and mknod are protected from collisions with other + * processes by making the reiserfs_add_entry() the last step in the + * creation, and then rolling back all changes if there was a collision. + * - Hans +*/ + +/* this deletes item which never gets split */ +void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, + struct inode *inode, struct reiserfs_key *key) +{ + struct super_block *sb = th->t_super; + struct tree_balance tb; + INITIALIZE_PATH(path); + int item_len = 0; + int tb_init = 0; + struct cpu_key cpu_key; + int retval; + int quota_cut_bytes = 0; + + BUG_ON(!th->t_trans_id); + + le_key2cpu_key(&cpu_key, key); + + while (1) { + retval = search_item(th->t_super, &cpu_key, &path); + if (retval == IO_ERROR) { + reiserfs_error(th->t_super, "vs-5350", + "i/o failure occurred trying " + "to delete %K", &cpu_key); + break; + } + if (retval != ITEM_FOUND) { + pathrelse(&path); + /* + * No need for a warning, if there is just no free + * space to insert '..' item into the + * newly-created subdir + */ + if (! 
+ ((unsigned long long) + GET_HASH_VALUE(le_key_k_offset + (le_key_version(key), key)) == 0 + && (unsigned long long) + GET_GENERATION_NUMBER(le_key_k_offset + (le_key_version(key), + key)) == 1)) + reiserfs_warning(th->t_super, "vs-5355", + "%k not found", key); + break; + } + if (!tb_init) { + tb_init = 1; + item_len = ih_item_len(tp_item_head(&path)); + init_tb_struct(th, &tb, th->t_super, &path, + -(IH_SIZE + item_len)); + } + quota_cut_bytes = ih_item_len(tp_item_head(&path)); + + retval = fix_nodes(M_DELETE, &tb, NULL, NULL); + if (retval == REPEAT_SEARCH) { + PROC_INFO_INC(th->t_super, delete_solid_item_restarted); + continue; + } + + if (retval == CARRY_ON) { + do_balance(&tb, NULL, NULL, M_DELETE); + /* + * Should we count quota for item? (we don't + * count quotas for save-links) + */ + if (inode) { + int depth; +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE, + "reiserquota delete_solid_item(): freeing %u id=%u type=%c", + quota_cut_bytes, inode->i_uid, + key2type(key)); +#endif + depth = reiserfs_write_unlock_nested(sb); + dquot_free_space_nodirty(inode, + quota_cut_bytes); + reiserfs_write_lock_nested(sb, depth); + } + break; + } + + /* IO_ERROR, NO_DISK_SPACE, etc */ + reiserfs_warning(th->t_super, "vs-5360", + "could not delete %K due to fix_nodes failure", + &cpu_key); + unfix_nodes(&tb); + break; + } + + reiserfs_check_path(&path); +} + +int reiserfs_delete_object(struct reiserfs_transaction_handle *th, + struct inode *inode) +{ + int err; + inode->i_size = 0; + BUG_ON(!th->t_trans_id); + + /* for directory this deletes item containing "." and ".." 
*/ + err = + reiserfs_do_truncate(th, inode, NULL, 0 /*no timestamp updates */ ); + if (err) + return err; + +#if defined( USE_INODE_GENERATION_COUNTER ) + if (!old_format_only(th->t_super)) { + __le32 *inode_generation; + + inode_generation = + &REISERFS_SB(th->t_super)->s_rs->s_inode_generation; + le32_add_cpu(inode_generation, 1); + } +/* USE_INODE_GENERATION_COUNTER */ +#endif + reiserfs_delete_solid_item(th, inode, INODE_PKEY(inode)); + + return err; +} + +static void unmap_buffers(struct page *page, loff_t pos) +{ + struct buffer_head *bh; + struct buffer_head *head; + struct buffer_head *next; + unsigned long tail_index; + unsigned long cur_index; + + if (page) { + if (page_has_buffers(page)) { + tail_index = pos & (PAGE_CACHE_SIZE - 1); + cur_index = 0; + head = page_buffers(page); + bh = head; + do { + next = bh->b_this_page; + + /* + * we want to unmap the buffers that contain + * the tail, and all the buffers after it + * (since the tail must be at the end of the + * file). We don't want to unmap file data + * before the tail, since it might be dirty + * and waiting to reach disk + */ + cur_index += bh->b_size; + if (cur_index > tail_index) { + reiserfs_unmap_buffer(bh); + } + bh = next; + } while (bh != head); + } + } +} + +static int maybe_indirect_to_direct(struct reiserfs_transaction_handle *th, + struct inode *inode, + struct page *page, + struct treepath *path, + const struct cpu_key *item_key, + loff_t new_file_size, char *mode) +{ + struct super_block *sb = inode->i_sb; + int block_size = sb->s_blocksize; + int cut_bytes; + BUG_ON(!th->t_trans_id); + BUG_ON(new_file_size != inode->i_size); + + /* + * the page being sent in could be NULL if there was an i/o error + * reading in the last block. 
The user will hit problems trying to + * read the file, but for now we just skip the indirect2direct + */ + if (atomic_read(&inode->i_count) > 1 || + !tail_has_to_be_packed(inode) || + !page || (REISERFS_I(inode)->i_flags & i_nopack_mask)) { + /* leave tail in an unformatted node */ + *mode = M_SKIP_BALANCING; + cut_bytes = + block_size - (new_file_size & (block_size - 1)); + pathrelse(path); + return cut_bytes; + } + + /* Perform the conversion to a direct_item. */ + return indirect2direct(th, inode, page, path, item_key, + new_file_size, mode); +} + +/* + * we did indirect_to_direct conversion. And we have inserted direct + * item successesfully, but there were no disk space to cut unfm + * pointer being converted. Therefore we have to delete inserted + * direct item(s) + */ +static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th, + struct inode *inode, struct treepath *path) +{ + struct cpu_key tail_key; + int tail_len; + int removed; + BUG_ON(!th->t_trans_id); + + make_cpu_key(&tail_key, inode, inode->i_size + 1, TYPE_DIRECT, 4); + tail_key.key_length = 4; + + tail_len = + (cpu_key_k_offset(&tail_key) & (inode->i_sb->s_blocksize - 1)) - 1; + while (tail_len) { + /* look for the last byte of the tail */ + if (search_for_position_by_key(inode->i_sb, &tail_key, path) == + POSITION_NOT_FOUND) + reiserfs_panic(inode->i_sb, "vs-5615", + "found invalid item"); + RFALSE(path->pos_in_item != + ih_item_len(tp_item_head(path)) - 1, + "vs-5616: appended bytes found"); + PATH_LAST_POSITION(path)--; + + removed = + reiserfs_delete_item(th, path, &tail_key, inode, + NULL /*unbh not needed */ ); + RFALSE(removed <= 0 + || removed > tail_len, + "vs-5617: there was tail %d bytes, removed item length %d bytes", + tail_len, removed); + tail_len -= removed; + set_cpu_key_k_offset(&tail_key, + cpu_key_k_offset(&tail_key) - removed); + } + reiserfs_warning(inode->i_sb, "reiserfs-5091", "indirect_to_direct " + "conversion has been rolled back due to " + "lack 
of disk space"); + mark_inode_dirty(inode); +} + +/* (Truncate or cut entry) or delete object item. Returns < 0 on failure */ +int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, + struct treepath *path, + struct cpu_key *item_key, + struct inode *inode, + struct page *page, loff_t new_file_size) +{ + struct super_block *sb = inode->i_sb; + /* + * Every function which is going to call do_balance must first + * create a tree_balance structure. Then it must fill up this + * structure by using the init_tb_struct and fix_nodes functions. + * After that we can make tree balancing. + */ + struct tree_balance s_cut_balance; + struct item_head *p_le_ih; + int cut_size = 0; /* Amount to be cut. */ + int ret_value = CARRY_ON; + int removed = 0; /* Number of the removed unformatted nodes. */ + int is_inode_locked = 0; + char mode; /* Mode of the balance. */ + int retval2 = -1; + int quota_cut_bytes; + loff_t tail_pos = 0; + int depth; + + BUG_ON(!th->t_trans_id); + + init_tb_struct(th, &s_cut_balance, inode->i_sb, path, + cut_size); + + /* + * Repeat this loop until we either cut the item without needing + * to balance, or we fix_nodes without schedule occurring + */ + while (1) { + /* + * Determine the balance mode, position of the first byte to + * be cut, and size to be cut. In case of the indirect item + * free unformatted nodes which are pointed to by the cut + * pointers. 
+ */ + + mode = + prepare_for_delete_or_cut(th, inode, path, + item_key, &removed, + &cut_size, new_file_size); + if (mode == M_CONVERT) { + /* + * convert last unformatted node to direct item or + * leave tail in the unformatted node + */ + RFALSE(ret_value != CARRY_ON, + "PAP-5570: can not convert twice"); + + ret_value = + maybe_indirect_to_direct(th, inode, page, + path, item_key, + new_file_size, &mode); + if (mode == M_SKIP_BALANCING) + /* tail has been left in the unformatted node */ + return ret_value; + + is_inode_locked = 1; + + /* + * removing the last unformatted node will + * change the value we have to return to truncate. + * Save it in retval2 + */ + retval2 = ret_value; + + /* + * So, we have performed the first part of the + * conversion: + * inserting the new direct item. Now we are + * removing the last unformatted node pointer. + * Set key to search for it: new_file_size is + * masked down with (s_blocksize - 1), remembered + * in tail_pos, and the key offset is set one + * byte past it. + */ + set_cpu_key_k_type(item_key, TYPE_INDIRECT); + item_key->key_length = 4; + new_file_size -= + (new_file_size & (sb->s_blocksize - 1)); + tail_pos = new_file_size; + set_cpu_key_k_offset(item_key, new_file_size + 1); + if (search_for_position_by_key + (sb, item_key, + path) == POSITION_NOT_FOUND) { + print_block(PATH_PLAST_BUFFER(path), 3, + PATH_LAST_POSITION(path) - 1, + PATH_LAST_POSITION(path) + 1); + reiserfs_panic(sb, "PAP-5580", "item to " + "convert does not exist (%K)", + item_key); + } + continue; + } + if (cut_size == 0) { /* nothing to cut: release the path and return 0 */ + pathrelse(path); + return 0; + } + + s_cut_balance.insert_size[0] = cut_size; + + ret_value = fix_nodes(mode, &s_cut_balance, NULL, NULL); + if (ret_value != REPEAT_SEARCH) + break; + + PROC_INFO_INC(sb, cut_from_item_restarted); /* fix_nodes returned REPEAT_SEARCH: look the item up again */ + + ret_value = + search_for_position_by_key(sb, item_key, path); + if (ret_value == POSITION_FOUND) + continue; + + reiserfs_warning(sb, "PAP-5610", "item %K not found", + item_key); + unfix_nodes(&s_cut_balance); + return (ret_value == IO_ERROR) ?
-EIO : -ENOENT; + } /* while */ + + /* + * check fix_nodes results (IO_ERROR or NO_DISK_SPACE); both are + * reported to the caller as -EIO + */ + if (ret_value != CARRY_ON) { + if (is_inode_locked) { + /* + * FIXME: this seems to be not needed: we are always + * able to cut item + */ + indirect_to_direct_roll_back(th, inode, path); + } + if (ret_value == NO_DISK_SPACE) + reiserfs_warning(sb, "reiserfs-5092", + "NO_DISK_SPACE"); + unfix_nodes(&s_cut_balance); + return -EIO; + } + + /* go ahead and perform balancing */ + + RFALSE(mode == M_PASTE || mode == M_INSERT, "invalid mode"); + + /* + * Calculate number of bytes that need to be cut from the item; + * the value is handed to dquot_free_space_nodirty() below. + */ + quota_cut_bytes = + (mode == + M_DELETE) ? ih_item_len(tp_item_head(path)) : -s_cut_balance. + insert_size[0]; + if (retval2 == -1) /* -1: no value was saved by an M_CONVERT pass */ + ret_value = calc_deleted_bytes_number(&s_cut_balance, mode); + else + ret_value = retval2; + + /* + * For direct items, we only change the quota when deleting the last + * item. + */ + p_le_ih = tp_item_head(s_cut_balance.tb_path); + if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(p_le_ih)) { + if (mode == M_DELETE && + (le_ih_k_offset(p_le_ih) & (sb->s_blocksize - 1)) == + 1) { + /* FIXME: this is to keep 3.5 happy */ + REISERFS_I(inode)->i_first_direct_byte = U32_MAX; + quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE; + } else { + quota_cut_bytes = 0; + } + } +#ifdef CONFIG_REISERFS_CHECK + if (is_inode_locked) { + struct item_head *le_ih = + tp_item_head(s_cut_balance.tb_path); + /* + * we are going to complete indirect2direct conversion.
Make + * sure, that we exactly remove last unformatted node pointer + * of the item + */ + if (!is_indirect_le_ih(le_ih)) + reiserfs_panic(sb, "vs-5652", + "item must be indirect %h", le_ih); + + if (mode == M_DELETE && ih_item_len(le_ih) != UNFM_P_SIZE) + reiserfs_panic(sb, "vs-5653", "completing " + "indirect2direct conversion indirect " + "item %h being deleted must be of " + "4 byte long", le_ih); + + if (mode == M_CUT + && s_cut_balance.insert_size[0] != -UNFM_P_SIZE) { + reiserfs_panic(sb, "vs-5654", "can not complete " + "indirect2direct conversion of %h " + "(CUT, insert_size==%d)", + le_ih, s_cut_balance.insert_size[0]); + } + /* + * it would be useful to make sure, that right neighboring + * item is direct item of this file + */ + } +#endif + + do_balance(&s_cut_balance, NULL, NULL, mode); + if (is_inode_locked) { + /* + * we've done an indirect->direct conversion. when the + * data block was freed, it was removed from the list of + * blocks that must be flushed before the transaction + * commits, make sure to unmap and invalidate it + */ + unmap_buffers(page, tail_pos); + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; + } +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE, + "reiserquota cut_from_item(): freeing %u id=%u type=%c", + quota_cut_bytes, inode->i_uid, '?'); +#endif + depth = reiserfs_write_unlock_nested(sb); /* the quota release runs with the write lock dropped */ + dquot_free_space_nodirty(inode, quota_cut_bytes); + reiserfs_write_lock_nested(sb, depth); + return ret_value; +} +/* + * Delete the directory item that starts at DOT_OFFSET: the inode key is + * pointed at that entry for reiserfs_delete_solid_item(), the stat data is + * updated, and the key is then set back to the stat-data item. A non-zero + * link count is only reported through reiserfs_error(). + */ +static void truncate_directory(struct reiserfs_transaction_handle *th, + struct inode *inode) +{ + BUG_ON(!th->t_trans_id); + if (inode->i_nlink) + reiserfs_error(inode->i_sb, "vs-5655", "link count != 0"); + + set_le_key_k_offset(KEY_FORMAT_3_5, INODE_PKEY(inode), DOT_OFFSET); + set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_DIRENTRY); + reiserfs_delete_solid_item(th, inode, INODE_PKEY(inode)); + reiserfs_update_sd(th, inode); + set_le_key_k_offset(KEY_FORMAT_3_5,
INODE_PKEY(inode), SD_OFFSET); + set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_STAT_DATA); +} + +/* + * Truncate file to the new size. Note, this must be called with a + * transaction already started + */ +int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, + struct inode *inode, /* ->i_size contains new size */ + struct page *page, /* up to date for last block */ + /* + * when it is called by file_release to convert + * the tail - no timestamps should be updated + */ + int update_timestamps + ) +{ + INITIALIZE_PATH(s_search_path); /* Path to the current object item. */ + struct item_head *p_le_ih; /* Pointer to an item header. */ + + /* Key to search for a previous file item. */ + struct cpu_key s_item_key; + loff_t file_size, /* Old file size. */ + new_file_size; /* New file size. */ + int deleted; /* Number of deleted or truncated bytes. */ + int retval; + int err = 0; + + BUG_ON(!th->t_trans_id); + if (! + (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) + || S_ISLNK(inode->i_mode))) + return 0; + + /* deletion of directory - no need to update timestamps */ + if (S_ISDIR(inode->i_mode)) { + truncate_directory(th, inode); + return 0; + } + + /* Get new file size. 
*/ + new_file_size = inode->i_size; + + /* FIXME: note, that key type is unimportant here */ + make_cpu_key(&s_item_key, inode, max_reiserfs_offset(inode), + TYPE_DIRECT, 3); + + retval = + search_for_position_by_key(inode->i_sb, &s_item_key, + &s_search_path); + if (retval == IO_ERROR) { + reiserfs_error(inode->i_sb, "vs-5657", + "i/o failure occurred trying to truncate %K", + &s_item_key); + err = -EIO; + goto out; + } + if (retval == POSITION_FOUND || retval == FILE_NOT_FOUND) { + reiserfs_error(inode->i_sb, "PAP-5660", + "wrong result %d of search for %K", retval, + &s_item_key); + + err = -EIO; + goto out; + } + + s_search_path.pos_in_item--; + + /* Get real file size (total length of all file items) */ + p_le_ih = tp_item_head(&s_search_path); + if (is_statdata_le_ih(p_le_ih)) + file_size = 0; + else { + loff_t offset = le_ih_k_offset(p_le_ih); + int bytes = + op_bytes_number(p_le_ih, inode->i_sb->s_blocksize); + + /* + * this may mismatch with real file size: if last direct item + * had no padding zeros and last unformatted node had no free + * space, this file would have this file size + */ + file_size = offset + bytes - 1; + } + /* + * are we doing a full truncate or delete, if so + * kick in the reada code + */ + if (new_file_size == 0) + s_search_path.reada = PATH_READA | PATH_READA_BACK; + + if (file_size == 0 || file_size < new_file_size) { + goto update_and_out; + } + + /* Update key to search for the last file item. */ + set_cpu_key_k_offset(&s_item_key, file_size); + + do { + /* Cut or delete file item. 
*/ + deleted = + reiserfs_cut_from_item(th, &s_search_path, &s_item_key, + inode, page, new_file_size); + if (deleted < 0) { + reiserfs_warning(inode->i_sb, "vs-5665", + "reiserfs_cut_from_item failed"); + reiserfs_check_path(&s_search_path); + return 0; + } + + RFALSE(deleted > file_size, + "PAP-5670: reiserfs_cut_from_item: too many bytes deleted: deleted %d, file_size %lu, item_key %K", + deleted, file_size, &s_item_key); + + /* Change key to search the last file item. */ + file_size -= deleted; + + set_cpu_key_k_offset(&s_item_key, file_size); + + /* + * While there are bytes to truncate and previous + * file item is presented in the tree. + */ + + /* + * This loop could take a really long time, and could log + * many more blocks than a transaction can hold. So, we do + * a polite journal end here, and if the transaction needs + * ending, we make sure the file is consistent before ending + * the current trans and starting a new one + */ + if (journal_transaction_should_end(th, 0) || + reiserfs_transaction_free_space(th) <= JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD) { + pathrelse(&s_search_path); + + if (update_timestamps) { + inode->i_mtime = CURRENT_TIME_SEC; + inode->i_ctime = CURRENT_TIME_SEC; + } + reiserfs_update_sd(th, inode); + + err = journal_end(th); + if (err) + goto out; + err = journal_begin(th, inode->i_sb, + JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD + JOURNAL_PER_BALANCE_CNT * 4) ; + if (err) + goto out; + reiserfs_update_inode_transaction(inode); + } + } while (file_size > ROUND_UP(new_file_size) && + search_for_position_by_key(inode->i_sb, &s_item_key, + &s_search_path) == POSITION_FOUND); + + RFALSE(file_size > ROUND_UP(new_file_size), + "PAP-5680: truncate did not finish: new_file_size %lld, current %lld, oid %d", + new_file_size, file_size, s_item_key.on_disk_key.k_objectid); + +update_and_out: + if (update_timestamps) { + /* this is truncate, not file closing */ + inode->i_mtime = CURRENT_TIME_SEC; + inode->i_ctime = CURRENT_TIME_SEC; + } + 
reiserfs_update_sd(th, inode); + +out: + pathrelse(&s_search_path); + return err; +} + +#ifdef CONFIG_REISERFS_CHECK +/* + * this makes sure, that we __append__, not overwrite or add holes: + * the found direct or indirect item must end exactly at the key offset and + * the path position must be the end of that item, otherwise we panic + */ +static void check_research_for_paste(struct treepath *path, + const struct cpu_key *key) +{ + struct item_head *found_ih = tp_item_head(path); + + if (is_direct_le_ih(found_ih)) { + if (le_ih_k_offset(found_ih) + + op_bytes_number(found_ih, + get_last_bh(path)->b_size) != + cpu_key_k_offset(key) + || op_bytes_number(found_ih, + get_last_bh(path)->b_size) != + pos_in_item(path)) + reiserfs_panic(NULL, "PAP-5720", "found direct item " + "%h or position (%d) does not match " + "to key %K", found_ih, + pos_in_item(path), key); + } + if (is_indirect_le_ih(found_ih)) { + if (le_ih_k_offset(found_ih) + + op_bytes_number(found_ih, + get_last_bh(path)->b_size) != + cpu_key_k_offset(key) + || I_UNFM_NUM(found_ih) != pos_in_item(path) + || get_ih_free_space(found_ih) != 0) + reiserfs_panic(NULL, "PAP-5730", "found indirect " + "item (%h) or position (%d) does not " + "match to key (%K)", + found_ih, pos_in_item(path), key); + } +} +#endif /* config reiserfs check */ + +/* + * Paste bytes to the existing item. + * + * Quota for pasted_size bytes is charged first and given back again on the + * error_out path. + * + * Returns 0 on success, otherwise the quota allocation error, -EIO, + * -EEXIST (the position already exists) or -ENOSPC. + */ +int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, + /* Path to the pasted item. */ + struct treepath *search_path, + /* Key to search for the needed item. */ + const struct cpu_key *key, + /* Inode item belongs to */ + struct inode *inode, + /* Pointer to the bytes to paste. */ + const char *body, + /* Size of pasted bytes.
*/ + int pasted_size) +{ + struct super_block *sb = inode->i_sb; + struct tree_balance s_paste_balance; + int retval; + int fs_gen; + int depth; + + BUG_ON(!th->t_trans_id); + + fs_gen = get_generation(inode->i_sb); + +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE, + "reiserquota paste_into_item(): allocating %u id=%u type=%c", + pasted_size, inode->i_uid, + key2type(&key->on_disk_key)); +#endif + + depth = reiserfs_write_unlock_nested(sb); + retval = dquot_alloc_space_nodirty(inode, pasted_size); + reiserfs_write_lock_nested(sb, depth); + if (retval) { /* quota allocation failed: nothing was charged */ + pathrelse(search_path); + return retval; + } + init_tb_struct(th, &s_paste_balance, th->t_super, search_path, + pasted_size); +#ifdef DISPLACE_NEW_PACKING_LOCALITIES + s_paste_balance.key = key->on_disk_key; +#endif + + /* DQUOT_* can schedule, must check before the fix_nodes */ + if (fs_changed(fs_gen, inode->i_sb)) { + goto search_again; + } + + while ((retval = + fix_nodes(M_PASTE, &s_paste_balance, NULL, + body)) == REPEAT_SEARCH) { +search_again: + /* file system changed while we were in the fix_nodes */ + PROC_INFO_INC(th->t_super, paste_into_item_restarted); + retval = + search_for_position_by_key(th->t_super, key, + search_path); + if (retval == IO_ERROR) { + retval = -EIO; + goto error_out; + } + if (retval == POSITION_FOUND) { + reiserfs_warning(inode->i_sb, "PAP-5710", + "entry or pasted byte (%K) exists", + key); + retval = -EEXIST; + goto error_out; + } +#ifdef CONFIG_REISERFS_CHECK + check_research_for_paste(search_path, key); +#endif + } + + /* + * Perform balancing after all resources are collected by fix_nodes, + * and accessing them will not risk triggering schedule. + */ + if (retval == CARRY_ON) { + do_balance(&s_paste_balance, NULL /*ih */ , body, M_PASTE); + return 0; + } + retval = (retval == NO_DISK_SPACE) ?
-ENOSPC : -EIO; +error_out: + /* this also releases the path */ + unfix_nodes(&s_paste_balance); +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE, + "reiserquota paste_into_item(): freeing %u id=%u type=%c", + pasted_size, inode->i_uid, + key2type(&key->on_disk_key)); +#endif + depth = reiserfs_write_unlock_nested(sb); + dquot_free_space_nodirty(inode, pasted_size); + reiserfs_write_lock_nested(sb, depth); + return retval; +} + +/* + * Insert new item into the buffer at the path. + * th - active transaction handle + * path - path to the inserted item + * ih - pointer to the item header to insert + * body - pointer to the bytes to insert + */ +int reiserfs_insert_item(struct reiserfs_transaction_handle *th, + struct treepath *path, const struct cpu_key *key, + struct item_head *ih, struct inode *inode, + const char *body) +{ + struct tree_balance s_ins_balance; + int retval; + int fs_gen = 0; + int quota_bytes = 0; + + BUG_ON(!th->t_trans_id); + + if (inode) { /* Do we count quotas for item? */ + int depth; + fs_gen = get_generation(inode->i_sb); + quota_bytes = ih_item_len(ih); + + /* + * hack so the quota code doesn't have to guess + * if the file has a tail, links are always tails, + * so there's no guessing needed + */ + if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(ih)) + quota_bytes = inode->i_sb->s_blocksize + UNFM_P_SIZE; +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE, + "reiserquota insert_item(): allocating %u id=%u type=%c", + quota_bytes, inode->i_uid, head2type(ih)); +#endif + /* + * We can't dirty inode here. It would be immediately + * written but appropriate stat item isn't inserted yet... 
+ */ + depth = reiserfs_write_unlock_nested(inode->i_sb); + retval = dquot_alloc_space_nodirty(inode, quota_bytes); + reiserfs_write_lock_nested(inode->i_sb, depth); + if (retval) { + pathrelse(path); + return retval; + } + } + init_tb_struct(th, &s_ins_balance, th->t_super, path, + IH_SIZE + ih_item_len(ih)); +#ifdef DISPLACE_NEW_PACKING_LOCALITIES + s_ins_balance.key = key->on_disk_key; +#endif + /* + * DQUOT_* can schedule, must check to be sure calling + * fix_nodes is safe + */ + if (inode && fs_changed(fs_gen, inode->i_sb)) { + goto search_again; + } + + while ((retval = + fix_nodes(M_INSERT, &s_ins_balance, ih, + body)) == REPEAT_SEARCH) { +search_again: + /* file system changed while we were in the fix_nodes */ + PROC_INFO_INC(th->t_super, insert_item_restarted); + retval = search_item(th->t_super, key, path); + if (retval == IO_ERROR) { + retval = -EIO; + goto error_out; + } + if (retval == ITEM_FOUND) { + reiserfs_warning(th->t_super, "PAP-5760", + "key %K already exists in the tree", + key); + retval = -EEXIST; + goto error_out; + } + } + + /* make balancing after all resources will be collected at a time */ + if (retval == CARRY_ON) { + do_balance(&s_ins_balance, ih, body, M_INSERT); + return 0; + } + + retval = (retval == NO_DISK_SPACE) ? 
-ENOSPC : -EIO; +error_out: + /* also releases the path */ + unfix_nodes(&s_ins_balance); +#ifdef REISERQUOTA_DEBUG + reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE, + "reiserquota insert_item(): freeing %u id=%u type=%c", + quota_bytes, inode->i_uid, head2type(ih)); +#endif + if (inode) { + int depth = reiserfs_write_unlock_nested(inode->i_sb); + dquot_free_space_nodirty(inode, quota_bytes); + reiserfs_write_lock_nested(inode->i_sb, depth); + } + return retval; +} diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c new file mode 100644 index 000000000..0111ad046 --- /dev/null +++ b/fs/reiserfs/super.c @@ -0,0 +1,2563 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + * + * Trivial changes by Alan Cox to add the LFS fixes + * + * Trivial Changes: + * Rights granted to Hans Reiser to redistribute under other terms providing + * he accepts all liability including but not limited to patent, fitness + * for purpose, and direct or indirect claims arising from failure to perform. 
+ * + * NO WARRANTY + */ + +#include +#include +#include +#include +#include +#include "reiserfs.h" +#include "acl.h" +#include "xattr.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct file_system_type reiserfs_fs_type; + +static const char reiserfs_3_5_magic_string[] = REISERFS_SUPER_MAGIC_STRING; +static const char reiserfs_3_6_magic_string[] = REISER2FS_SUPER_MAGIC_STRING; +static const char reiserfs_jr_magic_string[] = REISER2FS_JR_SUPER_MAGIC_STRING; +/* + * The is_reiserfs_*() helpers return non-zero when rs->s_v1.s_magic starts + * with the corresponding magic string (only strlen(magic) bytes are + * compared). + */ +int is_reiserfs_3_5(struct reiserfs_super_block *rs) +{ + return !strncmp(rs->s_v1.s_magic, reiserfs_3_5_magic_string, + strlen(reiserfs_3_5_magic_string)); +} + +int is_reiserfs_3_6(struct reiserfs_super_block *rs) +{ + return !strncmp(rs->s_v1.s_magic, reiserfs_3_6_magic_string, + strlen(reiserfs_3_6_magic_string)); +} + +int is_reiserfs_jr(struct reiserfs_super_block *rs) +{ + return !strncmp(rs->s_v1.s_magic, reiserfs_jr_magic_string, + strlen(reiserfs_jr_magic_string)); +} +/* Non-zero when the super block carries any of the three magic strings. */ +static int is_any_reiserfs_magic_string(struct reiserfs_super_block *rs) +{ + return (is_reiserfs_3_5(rs) || is_reiserfs_3_6(rs) || + is_reiserfs_jr(rs)); +} + +static int reiserfs_remount(struct super_block *s, int *flags, char *data); +static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf); +/* + * Write back dquots, then, under the write lock, begin a one-block + * transaction, end it synchronously and flush old commits. Journal errors + * only stop the sequence early: the function always returns 0, and the + * wait argument is not used. + */ +static int reiserfs_sync_fs(struct super_block *s, int wait) +{ + struct reiserfs_transaction_handle th; + + /* + * Writeback quota in non-journalled quota case - journalled quota has + * no dirty dquots + */ + dquot_writeback_dquots(s, -1); + reiserfs_write_lock(s); + if (!journal_begin(&th, s, 1)) + if (!journal_end_sync(&th)) + reiserfs_flush_old_commits(s); + reiserfs_write_unlock(s); + return 0; +} +/* + * Delayed-work handler for sbi->old_work: clears work_queued under + * old_work_lock and syncs the super block found in s_journal->j_work_sb. + */ +static void flush_old_commits(struct work_struct *work) +{ + struct reiserfs_sb_info *sbi; + struct super_block *s; + + sbi = container_of(work, struct reiserfs_sb_info, old_work.work); + s = sbi->s_journal->j_work_sb; + + spin_lock(&sbi->old_work_lock); +
sbi->work_queued = 0; + spin_unlock(&sbi->old_work_lock); + + reiserfs_sync_fs(s, 1); +} +/* + * Queue the old_work flush on system_long_wq unless one is already queued; + * work_queued is tested and set under old_work_lock. + */ +void reiserfs_schedule_old_flush(struct super_block *s) +{ + struct reiserfs_sb_info *sbi = REISERFS_SB(s); + unsigned long delay; + + /* + * Avoid scheduling flush when sb is being shut down. It can race + * with journal shutdown and free still queued delayed work. + */ + if (s->s_flags & MS_RDONLY || !(s->s_flags & MS_ACTIVE)) + return; + + spin_lock(&sbi->old_work_lock); + if (!sbi->work_queued) { + delay = msecs_to_jiffies(dirty_writeback_interval * 10); + queue_delayed_work(system_long_wq, &sbi->old_work, delay); + sbi->work_queued = 1; + } + spin_unlock(&sbi->old_work_lock); +} +/* Cancel the delayed flush synchronously and clear work_queued. */ +static void cancel_old_flush(struct super_block *s) +{ + struct reiserfs_sb_info *sbi = REISERFS_SB(s); + + cancel_delayed_work_sync(&REISERFS_SB(s)->old_work); + spin_lock(&sbi->old_work_lock); + sbi->work_queued = 0; + spin_unlock(&sbi->old_work_lock); +} +/* + * Cancel the pending flush and, unless mounted read-only, block writes; + * when journal_begin() succeeds the super block buffer is also logged and + * the transaction ended synchronously. Always returns 0. + */ +static int reiserfs_freeze(struct super_block *s) +{ + struct reiserfs_transaction_handle th; + + cancel_old_flush(s); + + reiserfs_write_lock(s); + if (!(s->s_flags & MS_RDONLY)) { + int err = journal_begin(&th, s, 1); + if (err) { + reiserfs_block_writes(&th); + } else { + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), + 1); + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s)); + reiserfs_block_writes(&th); + journal_end_sync(&th); + } + } + reiserfs_write_unlock(s); + return 0; +} +/* Counterpart of reiserfs_freeze(): allow writes again. Always returns 0. */ +static int reiserfs_unfreeze(struct super_block *s) +{ + reiserfs_allow_writes(s); + return 0; +} + +extern const struct in_core_key MAX_IN_CORE_KEY; + +/* + * this is used to delete "save link" when there are no items of a + * file it points to.
It can happen either if unlink is completed but + * "save unlink" removal is not, or if file has both unlink and truncate + * pending and unlink completes first (because key of "save link" + * protecting unlink is bigger than a key of "save link" which + * protects truncate), so there are no items left to complete the + * truncate on. + * + * Runs in its own transaction; when oid_free is set the object id is + * released as well. Returns the journal_begin()/journal_end() result. + */ +static int remove_save_link_only(struct super_block *s, + struct reiserfs_key *key, int oid_free) +{ + struct reiserfs_transaction_handle th; + int err; + + /* we are going to do one balancing */ + err = journal_begin(&th, s, JOURNAL_PER_BALANCE_CNT); + if (err) + return err; + + reiserfs_delete_solid_item(&th, NULL, key); + if (oid_free) + /* removals are protected by direct items */ + reiserfs_release_objectid(&th, le32_to_cpu(key->k_objectid)); + + return journal_end(&th); +} + +#ifdef CONFIG_QUOTA +static int reiserfs_quota_on_mount(struct super_block *, int); +#endif + +/* + * look for uncompleted unlinks and truncates and complete them: + * repeatedly search for the largest possible key and process the "save" + * link found in front of that position until none is left + */ +static int finish_unfinished(struct super_block *s) +{ + INITIALIZE_PATH(path); + struct cpu_key max_cpu_key, obj_key; + struct reiserfs_key save_link_key, last_inode_key; + int retval = 0; + struct item_head *ih; + struct buffer_head *bh; + int item_pos; + char *item; + int done; + struct inode *inode; + int truncate; +#ifdef CONFIG_QUOTA + int i; + int ms_active_set; + int quota_enabled[REISERFS_MAXQUOTAS]; +#endif + + /* compose key to look for "save" links */ + max_cpu_key.version = KEY_FORMAT_3_5; + max_cpu_key.on_disk_key.k_dir_id = ~0U; + max_cpu_key.on_disk_key.k_objectid = ~0U; + set_cpu_key_k_offset(&max_cpu_key, ~0U); + max_cpu_key.key_length = 3; + + memset(&last_inode_key, 0, sizeof(last_inode_key)); + +#ifdef CONFIG_QUOTA + /* Needed for iput() to work correctly and not trash data */ + if (s->s_flags & MS_ACTIVE) { + ms_active_set = 0; + } else { + ms_active_set = 1; + s->s_flags |= MS_ACTIVE; + } + /* Turn on quotas so that they are updated correctly */ + for (i = 0; i <
REISERFS_MAXQUOTAS; i++) { + quota_enabled[i] = 1; + if (REISERFS_SB(s)->s_qf_names[i]) { + int ret; + + if (sb_has_quota_active(s, i)) { + quota_enabled[i] = 0; /* already active: must not be turned off at the end */ + continue; + } + ret = reiserfs_quota_on_mount(s, i); + if (ret < 0) + reiserfs_warning(s, "reiserfs-2500", + "cannot turn on journaled " + "quota: error %d", ret); + } + } +#endif + + done = 0; + REISERFS_SB(s)->s_is_unlinked_ok = 1; + while (!retval) { + int depth; + retval = search_item(s, &max_cpu_key, &path); + if (retval != ITEM_NOT_FOUND) { + reiserfs_error(s, "vs-2140", + "search_by_key returned %d", retval); + break; + } + + bh = get_last_bh(&path); + item_pos = get_item_pos(&path); + if (item_pos != B_NR_ITEMS(bh)) { + reiserfs_warning(s, "vs-2060", + "wrong position found"); + break; + } + item_pos--; + ih = item_head(bh, item_pos); + + if (le32_to_cpu(ih->ih_key.k_dir_id) != MAX_KEY_OBJECTID) + /* there are no "save" links anymore */ + break; + + save_link_key = ih->ih_key; + if (is_indirect_le_ih(ih)) /* an indirect "save" link stands for a pending truncate */ + truncate = 1; + else + truncate = 0; + + /* + * reiserfs_iget needs k_dirid and k_objectid only: the dir id + * is read from the body of the "save" link item, the object + * id from its key + */ + item = ih_item_body(bh, ih); + obj_key.on_disk_key.k_dir_id = le32_to_cpu(*(__le32 *) item); + obj_key.on_disk_key.k_objectid = + le32_to_cpu(ih->ih_key.k_objectid); + obj_key.on_disk_key.k_offset = 0; + obj_key.on_disk_key.k_type = 0; + + pathrelse(&path); + + inode = reiserfs_iget(s, &obj_key); + if (!inode) { + /* + * the unlink almost completed, it just did not + * manage to remove "save" link and release objectid + */ + reiserfs_warning(s, "vs-2180", "iget failed for %K", + &obj_key); + retval = remove_save_link_only(s, &save_link_key, 1); + continue; + } + + if (!truncate && inode->i_nlink) { + /* file is not unlinked */ + reiserfs_warning(s, "vs-2185", + "file %K is not unlinked", + &obj_key); + retval = remove_save_link_only(s, &save_link_key, 0); + continue; + } + depth = reiserfs_write_unlock_nested(inode->i_sb); + dquot_initialize(inode); + reiserfs_write_lock_nested(inode->i_sb, depth); +
+ if (truncate && S_ISDIR(inode->i_mode)) { + /* + * We got a truncate request for a dir which + * is impossible. The only imaginable way is to + * execute unfinished truncate request then boot + * into old kernel, remove the file and create dir + * with the same key. + */ + reiserfs_warning(s, "green-2101", + "impossible truncate on a " + "directory %k. Please report", + INODE_PKEY(inode)); + retval = remove_save_link_only(s, &save_link_key, 0); + truncate = 0; + iput(inode); + continue; + } + + if (truncate) { + REISERFS_I(inode)->i_flags |= + i_link_saved_truncate_mask; + /* + * not completed truncate found. New size was + * committed together with "save" link + */ + reiserfs_info(s, "Truncating %k to %lld ..", + INODE_PKEY(inode), inode->i_size); + + /* don't update modification time */ + reiserfs_truncate_file(inode, 0); + + retval = remove_save_link(inode, truncate); + } else { + REISERFS_I(inode)->i_flags |= i_link_saved_unlink_mask; + /* not completed unlink (rmdir) found */ + reiserfs_info(s, "Removing %k..", INODE_PKEY(inode)); + if (memcmp(&last_inode_key, INODE_PKEY(inode), + sizeof(last_inode_key))){ + last_inode_key = *INODE_PKEY(inode); + /* removal gets completed in iput */ + retval = 0; + } else { + /* same key as in the previous pass: the iput did not remove it */ + reiserfs_warning(s, "super-2189", "Dead loop " + "in finish_unfinished " + "detected, just remove " + "save link\n"); + retval = remove_save_link_only(s, + &save_link_key, 0); + } + } + + iput(inode); + printk("done\n"); /* NOTE(review): no log level; presumably continues the reiserfs_info() line above */ + done++; + } + REISERFS_SB(s)->s_is_unlinked_ok = 0; + +#ifdef CONFIG_QUOTA + /* Turn off the quotas that were not already active on entry */ + reiserfs_write_unlock(s); + for (i = 0; i < REISERFS_MAXQUOTAS; i++) { + if (sb_dqopt(s)->files[i] && quota_enabled[i]) + dquot_quota_off(s, i); + } + reiserfs_write_lock(s); + if (ms_active_set) + /* Restore the flag back */ + s->s_flags &= ~MS_ACTIVE; +#endif + pathrelse(&path); + if (done) + reiserfs_info(s, "There were %d uncompleted unlinks/truncates.
" + "Completed\n", done); + return retval; +} + +/* + * to protect file being unlinked from getting lost we "safe" link files + * being unlinked. This link will be deleted in the same transaction with last + * item of file. mounting the filesystem we scan all these links and remove + * files which almost got lost + */ +void add_save_link(struct reiserfs_transaction_handle *th, + struct inode *inode, int truncate) +{ + INITIALIZE_PATH(path); + int retval; + struct cpu_key key; + struct item_head ih; + __le32 link; + + BUG_ON(!th->t_trans_id); + + /* file can only get one "save link" of each kind */ + RFALSE(truncate && + (REISERFS_I(inode)->i_flags & i_link_saved_truncate_mask), + "saved link already exists for truncated inode %lx", + (long)inode->i_ino); + RFALSE(!truncate && + (REISERFS_I(inode)->i_flags & i_link_saved_unlink_mask), + "saved link already exists for unlinked inode %lx", + (long)inode->i_ino); + + /* setup key of "save" link */ + key.version = KEY_FORMAT_3_5; + key.on_disk_key.k_dir_id = MAX_KEY_OBJECTID; + key.on_disk_key.k_objectid = inode->i_ino; + if (!truncate) { + /* unlink, rmdir, rename */ + set_cpu_key_k_offset(&key, 1 + inode->i_sb->s_blocksize); + set_cpu_key_k_type(&key, TYPE_DIRECT); + + /* item head of "safe" link */ + make_le_item_head(&ih, &key, key.version, + 1 + inode->i_sb->s_blocksize, TYPE_DIRECT, + 4 /*length */ , 0xffff /*free space */ ); + } else { + /* truncate */ + if (S_ISDIR(inode->i_mode)) + reiserfs_warning(inode->i_sb, "green-2102", + "Adding a truncate savelink for " + "a directory %k!
Please report", + INODE_PKEY(inode)); + set_cpu_key_k_offset(&key, 1); + set_cpu_key_k_type(&key, TYPE_INDIRECT); + + /* item head of "safe" link */ + make_le_item_head(&ih, &key, key.version, 1, TYPE_INDIRECT, + 4 /*length */ , 0 /*free space */ ); + } + key.key_length = 3; + + /* look for its place in the tree */ + retval = search_item(inode->i_sb, &key, &path); + if (retval != ITEM_NOT_FOUND) { + if (retval != -ENOSPC) + reiserfs_error(inode->i_sb, "vs-2100", + "search_by_key (%K) returned %d", &key, + retval); + pathrelse(&path); + return; + } + + /* body of "save" link */ + link = INODE_PKEY(inode)->k_dir_id; + + /* put "save" link into tree, don't charge quota to anyone */ + retval = + reiserfs_insert_item(th, &path, &key, &ih, NULL, (char *)&link); + if (retval) { + if (retval != -ENOSPC) + reiserfs_error(inode->i_sb, "vs-2120", + "insert_item returned %d", retval); + } else { + if (truncate) + REISERFS_I(inode)->i_flags |= + i_link_saved_truncate_mask; + else + REISERFS_I(inode)->i_flags |= i_link_saved_unlink_mask; + } +} + +/* this opens transaction unlike add_save_link */ +int remove_save_link(struct inode *inode, int truncate) +{ + struct reiserfs_transaction_handle th; + struct reiserfs_key key; + int err; + + /* we are going to do one balancing only */ + err = journal_begin(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT); + if (err) + return err; + + /* setup key of "save" link */ + key.k_dir_id = cpu_to_le32(MAX_KEY_OBJECTID); + key.k_objectid = INODE_PKEY(inode)->k_objectid; + if (!truncate) { + /* unlink, rmdir, rename */ + set_le_key_k_offset(KEY_FORMAT_3_5, &key, + 1 + inode->i_sb->s_blocksize); + set_le_key_k_type(KEY_FORMAT_3_5, &key, TYPE_DIRECT); + } else { + /* truncate */ + set_le_key_k_offset(KEY_FORMAT_3_5, &key, 1); + set_le_key_k_type(KEY_FORMAT_3_5, &key, TYPE_INDIRECT); + } + + if ((truncate && + (REISERFS_I(inode)->i_flags & i_link_saved_truncate_mask)) || + (!truncate && + (REISERFS_I(inode)->i_flags & i_link_saved_unlink_mask))) + /*
don't take quota bytes from anywhere */ + reiserfs_delete_solid_item(&th, NULL, &key); + if (!truncate) { + reiserfs_release_objectid(&th, inode->i_ino); + REISERFS_I(inode)->i_flags &= ~i_link_saved_unlink_mask; + } else + REISERFS_I(inode)->i_flags &= ~i_link_saved_truncate_mask; + + return journal_end(&th); +} + +static void reiserfs_kill_sb(struct super_block *s) +{ + if (REISERFS_SB(s)) { + reiserfs_proc_info_done(s); + /* + * Force any pending inode evictions to occur now. Any + * inodes to be removed that have extended attributes + * associated with them need to clean them up before + * we can release the extended attribute root dentries. + * shrink_dcache_for_umount will BUG if we don't release + * those before it's called so ->put_super is too late. + */ + shrink_dcache_sb(s); + + dput(REISERFS_SB(s)->xattr_root); + REISERFS_SB(s)->xattr_root = NULL; + dput(REISERFS_SB(s)->priv_root); + REISERFS_SB(s)->priv_root = NULL; + } + + kill_block_super(s); +} +/* + * Disable quotas, log the saved mount state into the super block when the + * filesystem is not read-only, release the journal, bitmap cache and super + * block buffer, then free the per-super-block info. + */ +static void reiserfs_put_super(struct super_block *s) +{ + struct reiserfs_transaction_handle th; + th.t_trans_id = 0; + + dquot_disable(s, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + + reiserfs_write_lock(s); + + /* + * change file system state to current state if it was mounted + * with read-write permissions + */ + if (!(s->s_flags & MS_RDONLY)) { + if (!journal_begin(&th, s, 10)) { + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), + 1); + set_sb_umount_state(SB_DISK_SUPER_BLOCK(s), + REISERFS_SB(s)->s_mount_state); + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s)); + } + } + + /* + * note, journal_release checks for readonly mount, and can + * decide not to do a journal_end + */ + journal_release(&th, s); + + reiserfs_free_bitmap_cache(s); + + brelse(SB_BUFFER_WITH_SB(s)); + + print_statistics(s); + + if (REISERFS_SB(s)->reserved_blocks != 0) { + reiserfs_warning(s, "green-2005", "reserved blocks left %d", + REISERFS_SB(s)->reserved_blocks); + } + + reiserfs_write_unlock(s); +
mutex_destroy(&REISERFS_SB(s)->lock); + destroy_workqueue(REISERFS_SB(s)->commit_wq); + kfree(s->s_fs_info); + s->s_fs_info = NULL; +} + +static struct kmem_cache *reiserfs_inode_cachep; +/* + * Allocate a reiserfs_inode_info from reiserfs_inode_cachep, reset its + * openers count, tailpack mutex and (with CONFIG_QUOTA) i_dquot array, and + * return the embedded VFS inode; NULL when the allocation fails. + */ +static struct inode *reiserfs_alloc_inode(struct super_block *sb) +{ + struct reiserfs_inode_info *ei; + ei = (struct reiserfs_inode_info *) + kmem_cache_alloc(reiserfs_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + atomic_set(&ei->openers, 0); + mutex_init(&ei->tailpack); +#ifdef CONFIG_QUOTA + memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); +#endif + + return &ei->vfs_inode; +} +/* RCU callback: give the reiserfs_inode_info back to the inode cache. */ +static void reiserfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode)); +} +/* Free the inode through call_rcu(), see reiserfs_i_callback(). */ +static void reiserfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, reiserfs_i_callback); +} +/* Constructor passed to kmem_cache_create() for the inode cache. */ +static void init_once(void *foo) +{ + struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo; + + INIT_LIST_HEAD(&ei->i_prealloc_list); + inode_init_once(&ei->vfs_inode); +} +/* Create the "reiser_inode_cache" slab cache; returns 0 or -ENOMEM. */ +static int __init init_inodecache(void) +{ + reiserfs_inode_cachep = kmem_cache_create("reiser_inode_cache", + sizeof(struct + reiserfs_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (reiserfs_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache.
+ */ + rcu_barrier(); + kmem_cache_destroy(reiserfs_inode_cachep); +} + +/* we don't mark inodes dirty, we just log them */ +static void reiserfs_dirty_inode(struct inode *inode, int flags) +{ + struct reiserfs_transaction_handle th; + + int err = 0; + + if (inode->i_sb->s_flags & MS_RDONLY) { + reiserfs_warning(inode->i_sb, "clm-6006", + "writing inode %lu on readonly FS", + inode->i_ino); + return; + } + reiserfs_write_lock(inode->i_sb); + + /* + * this is really only used for atime updates, so they don't have + * to be included in O_SYNC or fsync + */ + err = journal_begin(&th, inode->i_sb, 1); + if (err) + goto out; + + reiserfs_update_sd(&th, inode); + journal_end(&th); + +out: + reiserfs_write_unlock(inode->i_sb); +} + +static int reiserfs_show_options(struct seq_file *seq, struct dentry *root) +{ + struct super_block *s = root->d_sb; + struct reiserfs_journal *journal = SB_JOURNAL(s); + long opts = REISERFS_SB(s)->s_mount_opt; + + if (opts & (1 << REISERFS_LARGETAIL)) + seq_puts(seq, ",tails=on"); + else if (!(opts & (1 << REISERFS_SMALLTAIL))) + seq_puts(seq, ",notail"); + /* tails=small is default so we don't show it */ + + if (!(opts & (1 << REISERFS_BARRIER_FLUSH))) + seq_puts(seq, ",barrier=none"); + /* barrier=flush is default so we don't show it */ + + if (opts & (1 << REISERFS_ERROR_CONTINUE)) + seq_puts(seq, ",errors=continue"); + else if (opts & (1 << REISERFS_ERROR_PANIC)) + seq_puts(seq, ",errors=panic"); + /* errors=ro is default so we don't show it */ + + if (opts & (1 << REISERFS_DATA_LOG)) + seq_puts(seq, ",data=journal"); + else if (opts & (1 << REISERFS_DATA_WRITEBACK)) + seq_puts(seq, ",data=writeback"); + /* data=ordered is default so we don't show it */ + + if (opts & (1 << REISERFS_ATTRS)) + seq_puts(seq, ",attrs"); + + if (opts & (1 << REISERFS_XATTRS_USER)) + seq_puts(seq, ",user_xattr"); + + if (opts & (1 << REISERFS_EXPOSE_PRIVROOT)) + seq_puts(seq, ",expose_privroot"); + + if (opts & (1 << REISERFS_POSIXACL)) + seq_puts(seq, 
",acl"); + + if (REISERFS_SB(s)->s_jdev) + seq_printf(seq, ",jdev=%s", REISERFS_SB(s)->s_jdev); + + if (journal->j_max_commit_age != journal->j_default_max_commit_age) + seq_printf(seq, ",commit=%d", journal->j_max_commit_age); + +#ifdef CONFIG_QUOTA + if (REISERFS_SB(s)->s_qf_names[USRQUOTA]) + seq_printf(seq, ",usrjquota=%s", REISERFS_SB(s)->s_qf_names[USRQUOTA]); + else if (opts & (1 << REISERFS_USRQUOTA)) + seq_puts(seq, ",usrquota"); + if (REISERFS_SB(s)->s_qf_names[GRPQUOTA]) + seq_printf(seq, ",grpjquota=%s", REISERFS_SB(s)->s_qf_names[GRPQUOTA]); + else if (opts & (1 << REISERFS_GRPQUOTA)) + seq_puts(seq, ",grpquota"); + if (REISERFS_SB(s)->s_jquota_fmt) { + if (REISERFS_SB(s)->s_jquota_fmt == QFMT_VFS_OLD) + seq_puts(seq, ",jqfmt=vfsold"); + else if (REISERFS_SB(s)->s_jquota_fmt == QFMT_VFS_V0) + seq_puts(seq, ",jqfmt=vfsv0"); + } +#endif + + /* Block allocator options */ + if (opts & (1 << REISERFS_NO_BORDER)) + seq_puts(seq, ",block-allocator=noborder"); + if (opts & (1 << REISERFS_NO_UNHASHED_RELOCATION)) + seq_puts(seq, ",block-allocator=no_unhashed_relocation"); + if (opts & (1 << REISERFS_HASHED_RELOCATION)) + seq_puts(seq, ",block-allocator=hashed_relocation"); + if (opts & (1 << REISERFS_TEST4)) + seq_puts(seq, ",block-allocator=test4"); + show_alloc_options(seq, s); + return 0; +} + +#ifdef CONFIG_QUOTA +static ssize_t reiserfs_quota_write(struct super_block *, int, const char *, + size_t, loff_t); +static ssize_t reiserfs_quota_read(struct super_block *, int, char *, size_t, + loff_t); + +static struct dquot **reiserfs_get_dquots(struct inode *inode) +{ + return REISERFS_I(inode)->i_dquot; +} +#endif + +static const struct super_operations reiserfs_sops = { + .alloc_inode = reiserfs_alloc_inode, + .destroy_inode = reiserfs_destroy_inode, + .write_inode = reiserfs_write_inode, + .dirty_inode = reiserfs_dirty_inode, + .evict_inode = reiserfs_evict_inode, + .put_super = reiserfs_put_super, + .sync_fs = reiserfs_sync_fs, + .freeze_fs = 
reiserfs_freeze, + .unfreeze_fs = reiserfs_unfreeze, + .statfs = reiserfs_statfs, + .remount_fs = reiserfs_remount, + .show_options = reiserfs_show_options, +#ifdef CONFIG_QUOTA + .quota_read = reiserfs_quota_read, + .quota_write = reiserfs_quota_write, + .get_dquots = reiserfs_get_dquots, +#endif +}; + +#ifdef CONFIG_QUOTA +#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group") + +static int reiserfs_write_dquot(struct dquot *); +static int reiserfs_acquire_dquot(struct dquot *); +static int reiserfs_release_dquot(struct dquot *); +static int reiserfs_mark_dquot_dirty(struct dquot *); +static int reiserfs_write_info(struct super_block *, int); +static int reiserfs_quota_on(struct super_block *, int, int, struct path *); + +static const struct dquot_operations reiserfs_quota_operations = { + .write_dquot = reiserfs_write_dquot, + .acquire_dquot = reiserfs_acquire_dquot, + .release_dquot = reiserfs_release_dquot, + .mark_dirty = reiserfs_mark_dquot_dirty, + .write_info = reiserfs_write_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, +}; + +static const struct quotactl_ops reiserfs_qctl_operations = { + .quota_on = reiserfs_quota_on, + .quota_off = dquot_quota_off, + .quota_sync = dquot_quota_sync, + .get_state = dquot_get_state, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk, +}; +#endif + +static const struct export_operations reiserfs_export_ops = { + .encode_fh = reiserfs_encode_fh, + .fh_to_dentry = reiserfs_fh_to_dentry, + .fh_to_parent = reiserfs_fh_to_parent, + .get_parent = reiserfs_get_parent, +}; + +/* + * this struct is used in reiserfs_getopt () for containing the value for + * those mount options that have values rather than being toggles. + */ +typedef struct { + char *value; + /* + * bitmask which is to set on mount_options bitmask + * when this value is found, 0 is no bits are to be changed. 
+ */ + int setmask; + /* + * bitmask which is to clear on mount_options bitmask + * when this value is found, 0 is no bits are to be changed. + * This is applied BEFORE setmask + */ + int clrmask; +} arg_desc_t; + +/* Set this bit in arg_required to allow empty arguments */ +#define REISERFS_OPT_ALLOWEMPTY 31 + +/* + * this struct is used in reiserfs_getopt() for describing the + * set of reiserfs mount options + */ +typedef struct { + char *option_name; + + /* 0 if argument is not required, not 0 otherwise */ + int arg_required; + + /* list of values accepted by an option */ + const arg_desc_t *values; + + /* + * bitmask which is to set on mount_options bitmask + * when this value is found, 0 is no bits are to be changed. + */ + int setmask; + + /* + * bitmask which is to clear on mount_options bitmask + * when this value is found, 0 is no bits are to be changed. + * This is applied BEFORE setmask + */ + int clrmask; +} opt_desc_t; + +/* possible values for -o data= */ +static const arg_desc_t logging_mode[] = { + {"ordered", 1 << REISERFS_DATA_ORDERED, + (1 << REISERFS_DATA_LOG | 1 << REISERFS_DATA_WRITEBACK)}, + {"journal", 1 << REISERFS_DATA_LOG, + (1 << REISERFS_DATA_ORDERED | 1 << REISERFS_DATA_WRITEBACK)}, + {"writeback", 1 << REISERFS_DATA_WRITEBACK, + (1 << REISERFS_DATA_ORDERED | 1 << REISERFS_DATA_LOG)}, + {.value = NULL} +}; + +/* possible values for -o barrier= */ +static const arg_desc_t barrier_mode[] = { + {"none", 1 << REISERFS_BARRIER_NONE, 1 << REISERFS_BARRIER_FLUSH}, + {"flush", 1 << REISERFS_BARRIER_FLUSH, 1 << REISERFS_BARRIER_NONE}, + {.value = NULL} +}; + +/* + * possible values for "-o block-allocator=" and bits which are to be set in + * s_mount_opt of reiserfs specific part of in-core super block + */ +static const arg_desc_t balloc[] = { + {"noborder", 1 << REISERFS_NO_BORDER, 0}, + {"border", 0, 1 << REISERFS_NO_BORDER}, + {"no_unhashed_relocation", 1 << REISERFS_NO_UNHASHED_RELOCATION, 0}, + {"hashed_relocation", 1 << 
REISERFS_HASHED_RELOCATION, 0}, + {"test4", 1 << REISERFS_TEST4, 0}, + {"notest4", 0, 1 << REISERFS_TEST4}, + {NULL, 0, 0} +}; + +static const arg_desc_t tails[] = { + {"on", 1 << REISERFS_LARGETAIL, 1 << REISERFS_SMALLTAIL}, + {"off", 0, (1 << REISERFS_LARGETAIL) | (1 << REISERFS_SMALLTAIL)}, + {"small", 1 << REISERFS_SMALLTAIL, 1 << REISERFS_LARGETAIL}, + {NULL, 0, 0} +}; + +static const arg_desc_t error_actions[] = { + {"panic", 1 << REISERFS_ERROR_PANIC, + (1 << REISERFS_ERROR_RO | 1 << REISERFS_ERROR_CONTINUE)}, + {"ro-remount", 1 << REISERFS_ERROR_RO, + (1 << REISERFS_ERROR_PANIC | 1 << REISERFS_ERROR_CONTINUE)}, +#ifdef REISERFS_JOURNAL_ERROR_ALLOWS_NO_LOG + {"continue", 1 << REISERFS_ERROR_CONTINUE, + (1 << REISERFS_ERROR_PANIC | 1 << REISERFS_ERROR_RO)}, +#endif + {NULL, 0, 0}, +}; + +/* + * proceed only one option from a list *cur - string containing of mount + * options + * opts - array of options which are accepted + * opt_arg - if option is found and requires an argument and if it is specifed + * in the input - pointer to the argument is stored here + * bit_flags - if option requires to set a certain bit - it is set here + * return -1 if unknown option is found, opt->arg_required otherwise + */ +static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts, + char **opt_arg, unsigned long *bit_flags) +{ + char *p; + /* + * foo=bar, + * ^ ^ ^ + * | | +-- option_end + * | +-- arg_start + * +-- option_start + */ + const opt_desc_t *opt; + const arg_desc_t *arg; + + p = *cur; + + /* assume argument cannot contain commas */ + *cur = strchr(p, ','); + if (*cur) { + *(*cur) = '\0'; + (*cur)++; + } + + if (!strncmp(p, "alloc=", 6)) { + /* + * Ugly special case, probably we should redo options + * parser so that it can understand several arguments for + * some options, also so that it can fill several bitfields + * with option values. 
+ */ + if (reiserfs_parse_alloc_options(s, p + 6)) { + return -1; + } else { + return 0; + } + } + + /* for every option in the list */ + for (opt = opts; opt->option_name; opt++) { + if (!strncmp(p, opt->option_name, strlen(opt->option_name))) { + if (bit_flags) { + if (opt->clrmask == + (1 << REISERFS_UNSUPPORTED_OPT)) + reiserfs_warning(s, "super-6500", + "%s not supported.\n", + p); + else + *bit_flags &= ~opt->clrmask; + if (opt->setmask == + (1 << REISERFS_UNSUPPORTED_OPT)) + reiserfs_warning(s, "super-6501", + "%s not supported.\n", + p); + else + *bit_flags |= opt->setmask; + } + break; + } + } + if (!opt->option_name) { + reiserfs_warning(s, "super-6502", + "unknown mount option \"%s\"", p); + return -1; + } + + p += strlen(opt->option_name); + switch (*p) { + case '=': + if (!opt->arg_required) { + reiserfs_warning(s, "super-6503", + "the option \"%s\" does not " + "require an argument\n", + opt->option_name); + return -1; + } + break; + + case 0: + if (opt->arg_required) { + reiserfs_warning(s, "super-6504", + "the option \"%s\" requires an " + "argument\n", opt->option_name); + return -1; + } + break; + default: + reiserfs_warning(s, "super-6505", + "head of option \"%s\" is only correct\n", + opt->option_name); + return -1; + } + + /* + * move to the argument, or to next option if argument is not + * required + */ + p++; + + if (opt->arg_required + && !(opt->arg_required & (1 << REISERFS_OPT_ALLOWEMPTY)) + && !strlen(p)) { + /* this catches "option=," if not allowed */ + reiserfs_warning(s, "super-6506", + "empty argument for \"%s\"\n", + opt->option_name); + return -1; + } + + if (!opt->values) { + /* *=NULLopt_arg contains pointer to argument */ + *opt_arg = p; + return opt->arg_required & ~(1 << REISERFS_OPT_ALLOWEMPTY); + } + + /* values possible for this option are listed in opt->values */ + for (arg = opt->values; arg->value; arg++) { + if (!strcmp(p, arg->value)) { + if (bit_flags) { + *bit_flags &= ~arg->clrmask; + *bit_flags |= arg->setmask; + 
} + return opt->arg_required; + } + } + + reiserfs_warning(s, "super-6506", + "bad value \"%s\" for option \"%s\"\n", p, + opt->option_name); + return -1; +} + +/* returns 0 if something is wrong in option string, 1 - otherwise */ +static int reiserfs_parse_options(struct super_block *s, + + /* string given via mount's -o */ + char *options, + + /* + * after the parsing phase, contains the + * collection of bitflags defining what + * mount options were selected. + */ + unsigned long *mount_options, + + /* strtol-ed from NNN of resize=NNN */ + unsigned long *blocks, + char **jdev_name, + unsigned int *commit_max_age, + char **qf_names, + unsigned int *qfmt) +{ + int c; + char *arg = NULL; + char *pos; + opt_desc_t opts[] = { + /* + * Compatibility stuff, so that -o notail for old + * setups still work + */ + {"tails",.arg_required = 't',.values = tails}, + {"notail",.clrmask = + (1 << REISERFS_LARGETAIL) | (1 << REISERFS_SMALLTAIL)}, + {"conv",.setmask = 1 << REISERFS_CONVERT}, + {"attrs",.setmask = 1 << REISERFS_ATTRS}, + {"noattrs",.clrmask = 1 << REISERFS_ATTRS}, + {"expose_privroot", .setmask = 1 << REISERFS_EXPOSE_PRIVROOT}, +#ifdef CONFIG_REISERFS_FS_XATTR + {"user_xattr",.setmask = 1 << REISERFS_XATTRS_USER}, + {"nouser_xattr",.clrmask = 1 << REISERFS_XATTRS_USER}, +#else + {"user_xattr",.setmask = 1 << REISERFS_UNSUPPORTED_OPT}, + {"nouser_xattr",.clrmask = 1 << REISERFS_UNSUPPORTED_OPT}, +#endif +#ifdef CONFIG_REISERFS_FS_POSIX_ACL + {"acl",.setmask = 1 << REISERFS_POSIXACL}, + {"noacl",.clrmask = 1 << REISERFS_POSIXACL}, +#else + {"acl",.setmask = 1 << REISERFS_UNSUPPORTED_OPT}, + {"noacl",.clrmask = 1 << REISERFS_UNSUPPORTED_OPT}, +#endif + {.option_name = "nolog"}, + {"replayonly",.setmask = 1 << REPLAYONLY}, + {"block-allocator",.arg_required = 'a',.values = balloc}, + {"data",.arg_required = 'd',.values = logging_mode}, + {"barrier",.arg_required = 'b',.values = barrier_mode}, + {"resize",.arg_required = 'r',.values = NULL}, + {"jdev",.arg_required = 
'j',.values = NULL}, + {"nolargeio",.arg_required = 'w',.values = NULL}, + {"commit",.arg_required = 'c',.values = NULL}, + {"usrquota",.setmask = 1 << REISERFS_USRQUOTA}, + {"grpquota",.setmask = 1 << REISERFS_GRPQUOTA}, + {"noquota",.clrmask = 1 << REISERFS_USRQUOTA | 1 << REISERFS_GRPQUOTA}, + {"errors",.arg_required = 'e',.values = error_actions}, + {"usrjquota",.arg_required = + 'u' | (1 << REISERFS_OPT_ALLOWEMPTY),.values = NULL}, + {"grpjquota",.arg_required = + 'g' | (1 << REISERFS_OPT_ALLOWEMPTY),.values = NULL}, + {"jqfmt",.arg_required = 'f',.values = NULL}, + {.option_name = NULL} + }; + + *blocks = 0; + if (!options || !*options) + /* + * use default configuration: create tails, journaling on, no + * conversion to newest format + */ + return 1; + + for (pos = options; pos;) { + c = reiserfs_getopt(s, &pos, opts, &arg, mount_options); + if (c == -1) + /* wrong option is given */ + return 0; + + if (c == 'r') { + char *p; + + p = NULL; + /* "resize=NNN" or "resize=auto" */ + + if (!strcmp(arg, "auto")) { + /* From JFS code, to auto-get the size. */ + *blocks = + s->s_bdev->bd_inode->i_size >> s-> + s_blocksize_bits; + } else { + *blocks = simple_strtoul(arg, &p, 0); + if (*p != '\0') { + /* NNN does not look like a number */ + reiserfs_warning(s, "super-6507", + "bad value %s for " + "-oresize\n", arg); + return 0; + } + } + } + + if (c == 'c') { + char *p = NULL; + unsigned long val = simple_strtoul(arg, &p, 0); + /* commit=NNN (time in seconds) */ + if (*p != '\0' || val >= (unsigned int)-1) { + reiserfs_warning(s, "super-6508", + "bad value %s for -ocommit\n", + arg); + return 0; + } + *commit_max_age = (unsigned int)val; + } + + if (c == 'w') { + reiserfs_warning(s, "super-6509", "nolargeio option " + "is no longer supported"); + return 0; + } + + if (c == 'j') { + if (arg && *arg && jdev_name) { + /* Hm, already assigned? 
*/ + if (*jdev_name) { + reiserfs_warning(s, "super-6510", + "journal device was " + "already specified to " + "be %s", *jdev_name); + return 0; + } + *jdev_name = arg; + } + } +#ifdef CONFIG_QUOTA + if (c == 'u' || c == 'g') { + int qtype = c == 'u' ? USRQUOTA : GRPQUOTA; + + if (sb_any_quota_loaded(s) && + (!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) { + reiserfs_warning(s, "super-6511", + "cannot change journaled " + "quota options when quota " + "turned on."); + return 0; + } + if (*arg) { /* Some filename specified? */ + if (REISERFS_SB(s)->s_qf_names[qtype] + && strcmp(REISERFS_SB(s)->s_qf_names[qtype], + arg)) { + reiserfs_warning(s, "super-6512", + "%s quota file " + "already specified.", + QTYPE2NAME(qtype)); + return 0; + } + if (strchr(arg, '/')) { + reiserfs_warning(s, "super-6513", + "quotafile must be " + "on filesystem root."); + return 0; + } + qf_names[qtype] = kstrdup(arg, GFP_KERNEL); + if (!qf_names[qtype]) { + reiserfs_warning(s, "reiserfs-2502", + "not enough memory " + "for storing " + "quotafile name."); + return 0; + } + if (qtype == USRQUOTA) + *mount_options |= 1 << REISERFS_USRQUOTA; + else + *mount_options |= 1 << REISERFS_GRPQUOTA; + } else { + if (qf_names[qtype] != + REISERFS_SB(s)->s_qf_names[qtype]) + kfree(qf_names[qtype]); + qf_names[qtype] = NULL; + if (qtype == USRQUOTA) + *mount_options &= ~(1 << REISERFS_USRQUOTA); + else + *mount_options &= ~(1 << REISERFS_GRPQUOTA); + } + } + if (c == 'f') { + if (!strcmp(arg, "vfsold")) + *qfmt = QFMT_VFS_OLD; + else if (!strcmp(arg, "vfsv0")) + *qfmt = QFMT_VFS_V0; + else { + reiserfs_warning(s, "super-6514", + "unknown quota format " + "specified."); + return 0; + } + if (sb_any_quota_loaded(s) && + *qfmt != REISERFS_SB(s)->s_jquota_fmt) { + reiserfs_warning(s, "super-6515", + "cannot change journaled " + "quota options when quota " + "turned on."); + return 0; + } + } +#else + if (c == 'u' || c == 'g' || c == 'f') { + reiserfs_warning(s, "reiserfs-2503", "journaled " + "quota options 
not supported."); + return 0; + } +#endif + } + +#ifdef CONFIG_QUOTA + if (!REISERFS_SB(s)->s_jquota_fmt && !*qfmt + && (qf_names[USRQUOTA] || qf_names[GRPQUOTA])) { + reiserfs_warning(s, "super-6515", + "journaled quota format not specified."); + return 0; + } + if ((!(*mount_options & (1 << REISERFS_USRQUOTA)) && + sb_has_quota_loaded(s, USRQUOTA)) || + (!(*mount_options & (1 << REISERFS_GRPQUOTA)) && + sb_has_quota_loaded(s, GRPQUOTA))) { + reiserfs_warning(s, "super-6516", "quota options must " + "be present when quota is turned on."); + return 0; + } +#endif + + return 1; +} + +static void switch_data_mode(struct super_block *s, unsigned long mode) +{ + REISERFS_SB(s)->s_mount_opt &= ~((1 << REISERFS_DATA_LOG) | + (1 << REISERFS_DATA_ORDERED) | + (1 << REISERFS_DATA_WRITEBACK)); + REISERFS_SB(s)->s_mount_opt |= (1 << mode); +} + +static void handle_data_mode(struct super_block *s, unsigned long mount_options) +{ + if (mount_options & (1 << REISERFS_DATA_LOG)) { + if (!reiserfs_data_log(s)) { + switch_data_mode(s, REISERFS_DATA_LOG); + reiserfs_info(s, "switching to journaled data mode\n"); + } + } else if (mount_options & (1 << REISERFS_DATA_ORDERED)) { + if (!reiserfs_data_ordered(s)) { + switch_data_mode(s, REISERFS_DATA_ORDERED); + reiserfs_info(s, "switching to ordered data mode\n"); + } + } else if (mount_options & (1 << REISERFS_DATA_WRITEBACK)) { + if (!reiserfs_data_writeback(s)) { + switch_data_mode(s, REISERFS_DATA_WRITEBACK); + reiserfs_info(s, "switching to writeback data mode\n"); + } + } +} + +static void handle_barrier_mode(struct super_block *s, unsigned long bits) +{ + int flush = (1 << REISERFS_BARRIER_FLUSH); + int none = (1 << REISERFS_BARRIER_NONE); + int all_barrier = flush | none; + + if (bits & all_barrier) { + REISERFS_SB(s)->s_mount_opt &= ~all_barrier; + if (bits & flush) { + REISERFS_SB(s)->s_mount_opt |= flush; + printk("reiserfs: enabling write barrier flush mode\n"); + } else if (bits & none) { + REISERFS_SB(s)->s_mount_opt |= 
none; + printk("reiserfs: write barriers turned off\n"); + } + } +} + +static void handle_attrs(struct super_block *s) +{ + struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s); + + if (reiserfs_attrs(s)) { + if (old_format_only(s)) { + reiserfs_warning(s, "super-6517", "cannot support " + "attributes on 3.5.x disk format"); + REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS); + return; + } + if (!(le32_to_cpu(rs->s_flags) & reiserfs_attrs_cleared)) { + reiserfs_warning(s, "super-6518", "cannot support " + "attributes until flag is set in " + "super-block"); + REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS); + } + } +} + +#ifdef CONFIG_QUOTA +static void handle_quota_files(struct super_block *s, char **qf_names, + unsigned int *qfmt) +{ + int i; + + for (i = 0; i < REISERFS_MAXQUOTAS; i++) { + if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i]) + kfree(REISERFS_SB(s)->s_qf_names[i]); + REISERFS_SB(s)->s_qf_names[i] = qf_names[i]; + } + if (*qfmt) + REISERFS_SB(s)->s_jquota_fmt = *qfmt; +} +#endif + +static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) +{ + struct reiserfs_super_block *rs; + struct reiserfs_transaction_handle th; + unsigned long blocks; + unsigned long mount_options = REISERFS_SB(s)->s_mount_opt; + unsigned long safe_mask = 0; + unsigned int commit_max_age = (unsigned int)-1; + struct reiserfs_journal *journal = SB_JOURNAL(s); + char *new_opts = kstrdup(arg, GFP_KERNEL); + int err; + char *qf_names[REISERFS_MAXQUOTAS]; + unsigned int qfmt = 0; +#ifdef CONFIG_QUOTA + int i; +#endif + + sync_filesystem(s); + reiserfs_write_lock(s); + +#ifdef CONFIG_QUOTA + memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names)); +#endif + + rs = SB_DISK_SUPER_BLOCK(s); + + if (!reiserfs_parse_options + (s, arg, &mount_options, &blocks, NULL, &commit_max_age, + qf_names, &qfmt)) { +#ifdef CONFIG_QUOTA + for (i = 0; i < REISERFS_MAXQUOTAS; i++) + if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i]) + kfree(qf_names[i]); +#endif 
+ err = -EINVAL; + goto out_err_unlock; + } +#ifdef CONFIG_QUOTA + handle_quota_files(s, qf_names, &qfmt); +#endif + + handle_attrs(s); + + /* Add options that are safe here */ + safe_mask |= 1 << REISERFS_SMALLTAIL; + safe_mask |= 1 << REISERFS_LARGETAIL; + safe_mask |= 1 << REISERFS_NO_BORDER; + safe_mask |= 1 << REISERFS_NO_UNHASHED_RELOCATION; + safe_mask |= 1 << REISERFS_HASHED_RELOCATION; + safe_mask |= 1 << REISERFS_TEST4; + safe_mask |= 1 << REISERFS_ATTRS; + safe_mask |= 1 << REISERFS_XATTRS_USER; + safe_mask |= 1 << REISERFS_POSIXACL; + safe_mask |= 1 << REISERFS_BARRIER_FLUSH; + safe_mask |= 1 << REISERFS_BARRIER_NONE; + safe_mask |= 1 << REISERFS_ERROR_RO; + safe_mask |= 1 << REISERFS_ERROR_CONTINUE; + safe_mask |= 1 << REISERFS_ERROR_PANIC; + safe_mask |= 1 << REISERFS_USRQUOTA; + safe_mask |= 1 << REISERFS_GRPQUOTA; + + /* + * Update the bitmask, taking care to keep + * the bits we're not allowed to change here + */ + REISERFS_SB(s)->s_mount_opt = + (REISERFS_SB(s)-> + s_mount_opt & ~safe_mask) | (mount_options & safe_mask); + + if (commit_max_age != 0 && commit_max_age != (unsigned int)-1) { + journal->j_max_commit_age = commit_max_age; + journal->j_max_trans_age = commit_max_age; + } else if (commit_max_age == 0) { + /* 0 means restore defaults. 
*/ + journal->j_max_commit_age = journal->j_default_max_commit_age; + journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE; + } + + if (blocks) { + err = reiserfs_resize(s, blocks); + if (err != 0) + goto out_err_unlock; + } + + if (*mount_flags & MS_RDONLY) { + reiserfs_write_unlock(s); + reiserfs_xattr_init(s, *mount_flags); + /* remount read-only */ + if (s->s_flags & MS_RDONLY) + /* it is read-only already */ + goto out_ok_unlocked; + + err = dquot_suspend(s, -1); + if (err < 0) + goto out_err; + + /* try to remount file system with read-only permissions */ + if (sb_umount_state(rs) == REISERFS_VALID_FS + || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) { + goto out_ok_unlocked; + } + + reiserfs_write_lock(s); + + err = journal_begin(&th, s, 10); + if (err) + goto out_err_unlock; + + /* Mounting a rw partition read-only. */ + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); + set_sb_umount_state(rs, REISERFS_SB(s)->s_mount_state); + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s)); + } else { + /* remount read-write */ + if (!(s->s_flags & MS_RDONLY)) { + reiserfs_write_unlock(s); + reiserfs_xattr_init(s, *mount_flags); + goto out_ok_unlocked; /* We are read-write already */ + } + + if (reiserfs_is_journal_aborted(journal)) { + err = journal->j_errno; + goto out_err_unlock; + } + + handle_data_mode(s, mount_options); + handle_barrier_mode(s, mount_options); + REISERFS_SB(s)->s_mount_state = sb_umount_state(rs); + + /* now it is safe to call journal_begin */ + s->s_flags &= ~MS_RDONLY; + err = journal_begin(&th, s, 10); + if (err) + goto out_err_unlock; + + /* Mount a partition which is read-only, read-write */ + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); + REISERFS_SB(s)->s_mount_state = sb_umount_state(rs); + s->s_flags &= ~MS_RDONLY; + set_sb_umount_state(rs, REISERFS_ERROR_FS); + if (!old_format_only(s)) + set_sb_mnt_count(rs, sb_mnt_count(rs) + 1); + /* mark_buffer_dirty (SB_BUFFER_WITH_SB (s), 1); */ + journal_mark_dirty(&th, 
SB_BUFFER_WITH_SB(s)); + REISERFS_SB(s)->s_mount_state = REISERFS_VALID_FS; + } + /* this will force a full flush of all journal lists */ + SB_JOURNAL(s)->j_must_wait = 1; + err = journal_end(&th); + if (err) + goto out_err_unlock; + + reiserfs_write_unlock(s); + if (!(*mount_flags & MS_RDONLY)) { + dquot_resume(s, -1); + reiserfs_write_lock(s); + finish_unfinished(s); + reiserfs_write_unlock(s); + reiserfs_xattr_init(s, *mount_flags); + } + +out_ok_unlocked: + replace_mount_options(s, new_opts); + return 0; + +out_err_unlock: + reiserfs_write_unlock(s); +out_err: + kfree(new_opts); + return err; +} + +static int read_super_block(struct super_block *s, int offset) +{ + struct buffer_head *bh; + struct reiserfs_super_block *rs; + int fs_blocksize; + + bh = sb_bread(s, offset / s->s_blocksize); + if (!bh) { + reiserfs_warning(s, "sh-2006", + "bread failed (dev %s, block %lu, size %lu)", + s->s_id, offset / s->s_blocksize, + s->s_blocksize); + return 1; + } + + rs = (struct reiserfs_super_block *)bh->b_data; + if (!is_any_reiserfs_magic_string(rs)) { + brelse(bh); + return 1; + } + /* + * ok, reiserfs signature (old or new) found in at the given offset + */ + fs_blocksize = sb_blocksize(rs); + brelse(bh); + sb_set_blocksize(s, fs_blocksize); + + bh = sb_bread(s, offset / s->s_blocksize); + if (!bh) { + reiserfs_warning(s, "sh-2007", + "bread failed (dev %s, block %lu, size %lu)", + s->s_id, offset / s->s_blocksize, + s->s_blocksize); + return 1; + } + + rs = (struct reiserfs_super_block *)bh->b_data; + if (sb_blocksize(rs) != s->s_blocksize) { + reiserfs_warning(s, "sh-2011", "can't find a reiserfs " + "filesystem on (dev %s, block %llu, size %lu)", + s->s_id, + (unsigned long long)bh->b_blocknr, + s->s_blocksize); + brelse(bh); + return 1; + } + + if (rs->s_v1.s_root_block == cpu_to_le32(-1)) { + brelse(bh); + reiserfs_warning(s, "super-6519", "Unfinished reiserfsck " + "--rebuild-tree run detected. 
Please run\n" + "reiserfsck --rebuild-tree and wait for a " + "completion. If that fails\n" + "get newer reiserfsprogs package"); + return 1; + } + + SB_BUFFER_WITH_SB(s) = bh; + SB_DISK_SUPER_BLOCK(s) = rs; + + /* + * magic is of non-standard journal filesystem, look at s_version to + * find which format is in use + */ + if (is_reiserfs_jr(rs)) { + if (sb_version(rs) == REISERFS_VERSION_2) + reiserfs_info(s, "found reiserfs format \"3.6\"" + " with non-standard journal\n"); + else if (sb_version(rs) == REISERFS_VERSION_1) + reiserfs_info(s, "found reiserfs format \"3.5\"" + " with non-standard journal\n"); + else { + reiserfs_warning(s, "sh-2012", "found unknown " + "format \"%u\" of reiserfs with " + "non-standard magic", sb_version(rs)); + return 1; + } + } else + /* + * s_version of standard format may contain incorrect + * information, so we just look at the magic string + */ + reiserfs_info(s, + "found reiserfs format \"%s\" with standard journal\n", + is_reiserfs_3_5(rs) ? "3.5" : "3.6"); + + s->s_op = &reiserfs_sops; + s->s_export_op = &reiserfs_export_ops; +#ifdef CONFIG_QUOTA + s->s_qcop = &reiserfs_qctl_operations; + s->dq_op = &reiserfs_quota_operations; + s->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; +#endif + + /* + * new format is limited by the 32 bit wide i_blocks field, want to + * be one full block below that. + */ + s->s_maxbytes = (512LL << 32) - s->s_blocksize; + return 0; +} + +/* after journal replay, reread all bitmap and super blocks */ +static int reread_meta_blocks(struct super_block *s) +{ + ll_rw_block(READ, 1, &SB_BUFFER_WITH_SB(s)); + wait_on_buffer(SB_BUFFER_WITH_SB(s)); + if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) { + reiserfs_warning(s, "reiserfs-2504", "error reading the super"); + return 1; + } + + return 0; +} + +/* hash detection stuff */ + +/* + * if root directory is empty - we set default - Yura's - hash and + * warn about it + * FIXME: we look for only one name in a directory. 
If tea and yura + * both have the same value - we ask user to send report to the + * mailing list + */ +static __u32 find_hash_out(struct super_block *s) +{ + int retval; + struct inode *inode; + struct cpu_key key; + INITIALIZE_PATH(path); + struct reiserfs_dir_entry de; + struct reiserfs_de_head *deh; + __u32 hash = DEFAULT_HASH; + __u32 deh_hashval, teahash, r5hash, yurahash; + + inode = d_inode(s->s_root); + + make_cpu_key(&key, inode, ~0, TYPE_DIRENTRY, 3); + retval = search_by_entry_key(s, &key, &path, &de); + if (retval == IO_ERROR) { + pathrelse(&path); + return UNSET_HASH; + } + if (retval == NAME_NOT_FOUND) + de.de_entry_num--; + + set_de_name_and_namelen(&de); + deh = de.de_deh + de.de_entry_num; + + if (deh_offset(deh) == DOT_DOT_OFFSET) { + /* allow override in this case */ + if (reiserfs_rupasov_hash(s)) + hash = YURA_HASH; + reiserfs_info(s, "FS seems to be empty, autodetect is using the default hash\n"); + goto out; + } + + deh_hashval = GET_HASH_VALUE(deh_offset(deh)); + r5hash = GET_HASH_VALUE(r5_hash(de.de_name, de.de_namelen)); + teahash = GET_HASH_VALUE(keyed_hash(de.de_name, de.de_namelen)); + yurahash = GET_HASH_VALUE(yura_hash(de.de_name, de.de_namelen)); + + if ((teahash == r5hash && deh_hashval == r5hash) || + (teahash == yurahash && deh_hashval == yurahash) || + (r5hash == yurahash && deh_hashval == yurahash)) { + reiserfs_warning(s, "reiserfs-2506", + "Unable to automatically detect hash " + "function. 
Please mount with -o " + "hash={tea,rupasov,r5}"); + hash = UNSET_HASH; + goto out; + } + + if (deh_hashval == yurahash) + hash = YURA_HASH; + else if (deh_hashval == teahash) + hash = TEA_HASH; + else if (deh_hashval == r5hash) + hash = R5_HASH; + else { + reiserfs_warning(s, "reiserfs-2506", + "Unrecognised hash function"); + hash = UNSET_HASH; + } +out: + pathrelse(&path); + return hash; +} + +/* finds out which hash names are sorted with */ +static int what_hash(struct super_block *s) +{ + __u32 code; + + code = sb_hash_function_code(SB_DISK_SUPER_BLOCK(s)); + + /* + * reiserfs_hash_detect() == true if any of the hash mount options + * were used. We must check them to make sure the user isn't + * using a bad hash value + */ + if (code == UNSET_HASH || reiserfs_hash_detect(s)) + code = find_hash_out(s); + + if (code != UNSET_HASH && reiserfs_hash_detect(s)) { + /* + * detection has found the hash, and we must check against the + * mount options + */ + if (reiserfs_rupasov_hash(s) && code != YURA_HASH) { + reiserfs_warning(s, "reiserfs-2507", + "Error, %s hash detected, " + "unable to force rupasov hash", + reiserfs_hashname(code)); + code = UNSET_HASH; + } else if (reiserfs_tea_hash(s) && code != TEA_HASH) { + reiserfs_warning(s, "reiserfs-2508", + "Error, %s hash detected, " + "unable to force tea hash", + reiserfs_hashname(code)); + code = UNSET_HASH; + } else if (reiserfs_r5_hash(s) && code != R5_HASH) { + reiserfs_warning(s, "reiserfs-2509", + "Error, %s hash detected, " + "unable to force r5 hash", + reiserfs_hashname(code)); + code = UNSET_HASH; + } + } else { + /* + * find_hash_out was not called or + * could not determine the hash + */ + if (reiserfs_rupasov_hash(s)) { + code = YURA_HASH; + } else if (reiserfs_tea_hash(s)) { + code = TEA_HASH; + } else if (reiserfs_r5_hash(s)) { + code = R5_HASH; + } + } + + /* + * if we are mounted RW, and we have a new valid hash code, update + * the super + */ + if (code != UNSET_HASH && + !(s->s_flags & MS_RDONLY) && 
+ code != sb_hash_function_code(SB_DISK_SUPER_BLOCK(s))) { + set_sb_hash_function_code(SB_DISK_SUPER_BLOCK(s), code); + } + return code; +} + +/* return pointer to appropriate function */ +static hashf_t hash_function(struct super_block *s) +{ + switch (what_hash(s)) { + case TEA_HASH: + reiserfs_info(s, "Using tea hash to sort names\n"); + return keyed_hash; + case YURA_HASH: + reiserfs_info(s, "Using rupasov hash to sort names\n"); + return yura_hash; + case R5_HASH: + reiserfs_info(s, "Using r5 hash to sort names\n"); + return r5_hash; + } + return NULL; +} + +/* this is used to set up correct value for old partitions */ +static int function2code(hashf_t func) +{ + if (func == keyed_hash) + return TEA_HASH; + if (func == yura_hash) + return YURA_HASH; + if (func == r5_hash) + return R5_HASH; + + BUG(); /* should never happen */ + + return 0; +} + +#define SWARN(silent, s, id, ...) \ + if (!(silent)) \ + reiserfs_warning(s, id, __VA_ARGS__) + +static int reiserfs_fill_super(struct super_block *s, void *data, int silent) +{ + struct inode *root_inode; + struct reiserfs_transaction_handle th; + int old_format = 0; + unsigned long blocks; + unsigned int commit_max_age = 0; + int jinit_done = 0; + struct reiserfs_iget_args args; + struct reiserfs_super_block *rs; + char *jdev_name; + struct reiserfs_sb_info *sbi; + int errval = -EINVAL; + char *qf_names[REISERFS_MAXQUOTAS] = {}; + unsigned int qfmt = 0; + + save_mount_options(s, data); + + sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + s->s_fs_info = sbi; + /* Set default values for options: non-aggressive tails, RO on errors */ + sbi->s_mount_opt |= (1 << REISERFS_SMALLTAIL); + sbi->s_mount_opt |= (1 << REISERFS_ERROR_RO); + sbi->s_mount_opt |= (1 << REISERFS_BARRIER_FLUSH); + /* no preallocation minimum, be smart in reiserfs_file_write instead */ + sbi->s_alloc_options.preallocmin = 0; + /* Preallocate by 16 blocks (17-1) at once */ + sbi->s_alloc_options.preallocsize = 
17; + /* setup default block allocator options */ + reiserfs_init_alloc_options(s); + + spin_lock_init(&sbi->old_work_lock); + INIT_DELAYED_WORK(&sbi->old_work, flush_old_commits); + mutex_init(&sbi->lock); + sbi->lock_depth = -1; + + sbi->commit_wq = alloc_workqueue("reiserfs/%s", WQ_MEM_RECLAIM, 0, + s->s_id); + if (!sbi->commit_wq) { + SWARN(silent, s, "", "Cannot allocate commit workqueue"); + errval = -ENOMEM; + goto error_unlocked; + } + + jdev_name = NULL; + if (reiserfs_parse_options + (s, (char *)data, &sbi->s_mount_opt, &blocks, &jdev_name, + &commit_max_age, qf_names, &qfmt) == 0) { + goto error_unlocked; + } + if (jdev_name && jdev_name[0]) { + sbi->s_jdev = kstrdup(jdev_name, GFP_KERNEL); + if (!sbi->s_jdev) { + SWARN(silent, s, "", "Cannot allocate memory for " + "journal device name"); + goto error; + } + } +#ifdef CONFIG_QUOTA + handle_quota_files(s, qf_names, &qfmt); +#endif + + if (blocks) { + SWARN(silent, s, "jmacd-7", "resize option for remount only"); + goto error_unlocked; + } + + /* + * try old format (undistributed bitmap, super block in 8-th 1k + * block of a device) + */ + if (!read_super_block(s, REISERFS_OLD_DISK_OFFSET_IN_BYTES)) + old_format = 1; + + /* + * try new format (64-th 1k block), which can contain reiserfs + * super block + */ + else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) { + SWARN(silent, s, "sh-2021", "can not find reiserfs on %s", + s->s_id); + goto error_unlocked; + } + + rs = SB_DISK_SUPER_BLOCK(s); + /* + * Let's do basic sanity check to verify that underlying device is not + * smaller than the filesystem. If the check fails then abort and + * scream, because bad stuff will happen otherwise. 
+ */ + if (s->s_bdev && s->s_bdev->bd_inode + && i_size_read(s->s_bdev->bd_inode) < + sb_block_count(rs) * sb_blocksize(rs)) { + SWARN(silent, s, "", "Filesystem cannot be " + "mounted because it is bigger than the device"); + SWARN(silent, s, "", "You may need to run fsck " + "or increase size of your LVM partition"); + SWARN(silent, s, "", "Or may be you forgot to " + "reboot after fdisk when it told you to"); + goto error_unlocked; + } + + sbi->s_mount_state = SB_REISERFS_STATE(s); + sbi->s_mount_state = REISERFS_VALID_FS; + + if ((errval = reiserfs_init_bitmap_cache(s))) { + SWARN(silent, s, "jmacd-8", "unable to read bitmap"); + goto error_unlocked; + } + + errval = -EINVAL; +#ifdef CONFIG_REISERFS_CHECK + SWARN(silent, s, "", "CONFIG_REISERFS_CHECK is set ON"); + SWARN(silent, s, "", "- it is slow mode for debugging."); +#endif + + /* make data=ordered the default */ + if (!reiserfs_data_log(s) && !reiserfs_data_ordered(s) && + !reiserfs_data_writeback(s)) { + sbi->s_mount_opt |= (1 << REISERFS_DATA_ORDERED); + } + + if (reiserfs_data_log(s)) { + reiserfs_info(s, "using journaled data mode\n"); + } else if (reiserfs_data_ordered(s)) { + reiserfs_info(s, "using ordered data mode\n"); + } else { + reiserfs_info(s, "using writeback data mode\n"); + } + if (reiserfs_barrier_flush(s)) { + printk("reiserfs: using flush barriers\n"); + } + + if (journal_init(s, jdev_name, old_format, commit_max_age)) { + SWARN(silent, s, "sh-2022", + "unable to initialize journal space"); + goto error_unlocked; + } else { + /* + * once this is set, journal_release must be called + * if we error out of the mount + */ + jinit_done = 1; + } + + if (reread_meta_blocks(s)) { + SWARN(silent, s, "jmacd-9", + "unable to reread meta blocks after journal init"); + goto error_unlocked; + } + + if (replay_only(s)) + goto error_unlocked; + + if (bdev_read_only(s->s_bdev) && !(s->s_flags & MS_RDONLY)) { + SWARN(silent, s, "clm-7000", + "Detected readonly device, marking FS readonly"); + 
s->s_flags |= MS_RDONLY; + } + args.objectid = REISERFS_ROOT_OBJECTID; + args.dirid = REISERFS_ROOT_PARENT_OBJECTID; + root_inode = + iget5_locked(s, REISERFS_ROOT_OBJECTID, reiserfs_find_actor, + reiserfs_init_locked_inode, (void *)&args); + if (!root_inode) { + SWARN(silent, s, "jmacd-10", "get root inode failed"); + goto error_unlocked; + } + + /* + * This path assumed to be called with the BKL in the old times. + * Now we have inherited the big reiserfs lock from it and many + * reiserfs helpers called in the mount path and elsewhere require + * this lock to be held even if it's not always necessary. Let's be + * conservative and hold it early. The window can be reduced after + * careful review of the code. + */ + reiserfs_write_lock(s); + + if (root_inode->i_state & I_NEW) { + reiserfs_read_locked_inode(root_inode, &args); + unlock_new_inode(root_inode); + } + + s->s_root = d_make_root(root_inode); + if (!s->s_root) + goto error; + /* define and initialize hash function */ + sbi->s_hash_function = hash_function(s); + if (sbi->s_hash_function == NULL) { + dput(s->s_root); + s->s_root = NULL; + goto error; + } + + if (is_reiserfs_3_5(rs) + || (is_reiserfs_jr(rs) && SB_VERSION(s) == REISERFS_VERSION_1)) + set_bit(REISERFS_3_5, &sbi->s_properties); + else if (old_format) + set_bit(REISERFS_OLD_FORMAT, &sbi->s_properties); + else + set_bit(REISERFS_3_6, &sbi->s_properties); + + if (!(s->s_flags & MS_RDONLY)) { + + errval = journal_begin(&th, s, 1); + if (errval) { + dput(s->s_root); + s->s_root = NULL; + goto error; + } + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); + + set_sb_umount_state(rs, REISERFS_ERROR_FS); + set_sb_fs_state(rs, 0); + + /* + * Clear out s_bmap_nr if it would wrap. We can handle this + * case, but older revisions can't. This will cause the + * file system to fail mount on those older implementations, + * avoiding corruption. 
-jeffm + */ + if (bmap_would_wrap(reiserfs_bmap_count(s)) && + sb_bmap_nr(rs) != 0) { + reiserfs_warning(s, "super-2030", "This file system " + "claims to use %u bitmap blocks in " + "its super block, but requires %u. " + "Clearing to zero.", sb_bmap_nr(rs), + reiserfs_bmap_count(s)); + + set_sb_bmap_nr(rs, 0); + } + + if (old_format_only(s)) { + /* + * filesystem of format 3.5 either with standard + * or non-standard journal + */ + if (convert_reiserfs(s)) { + /* and -o conv is given */ + if (!silent) + reiserfs_info(s, + "converting 3.5 filesystem to the 3.6 format"); + + if (is_reiserfs_3_5(rs)) + /* + * put magic string of 3.6 format. + * 2.2 will not be able to + * mount this filesystem anymore + */ + memcpy(rs->s_v1.s_magic, + reiserfs_3_6_magic_string, + sizeof + (reiserfs_3_6_magic_string)); + + set_sb_version(rs, REISERFS_VERSION_2); + reiserfs_convert_objectid_map_v1(s); + set_bit(REISERFS_3_6, &sbi->s_properties); + clear_bit(REISERFS_3_5, &sbi->s_properties); + } else if (!silent) { + reiserfs_info(s, "using 3.5.x disk format\n"); + } + } else + set_sb_mnt_count(rs, sb_mnt_count(rs) + 1); + + + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s)); + errval = journal_end(&th); + if (errval) { + dput(s->s_root); + s->s_root = NULL; + goto error; + } + + reiserfs_write_unlock(s); + if ((errval = reiserfs_lookup_privroot(s)) || + (errval = reiserfs_xattr_init(s, s->s_flags))) { + dput(s->s_root); + s->s_root = NULL; + goto error_unlocked; + } + reiserfs_write_lock(s); + + /* + * look for files which were to be removed in previous session + */ + finish_unfinished(s); + } else { + if (old_format_only(s) && !silent) { + reiserfs_info(s, "using 3.5.x disk format\n"); + } + + reiserfs_write_unlock(s); + if ((errval = reiserfs_lookup_privroot(s)) || + (errval = reiserfs_xattr_init(s, s->s_flags))) { + dput(s->s_root); + s->s_root = NULL; + goto error_unlocked; + } + reiserfs_write_lock(s); + } + /* + * mark hash in super block: it could be unset. 
overwrite should be ok + */ + set_sb_hash_function_code(rs, function2code(sbi->s_hash_function)); + + handle_attrs(s); + + reiserfs_proc_info_init(s); + + init_waitqueue_head(&(sbi->s_wait)); + spin_lock_init(&sbi->bitmap_lock); + + reiserfs_write_unlock(s); + + return (0); + +error: + reiserfs_write_unlock(s); + +error_unlocked: + /* kill the commit thread, free journal ram */ + if (jinit_done) { + reiserfs_write_lock(s); + journal_release_error(NULL, s); + reiserfs_write_unlock(s); + } + + if (sbi->commit_wq) + destroy_workqueue(sbi->commit_wq); + + cancel_delayed_work_sync(&REISERFS_SB(s)->old_work); + + reiserfs_free_bitmap_cache(s); + if (SB_BUFFER_WITH_SB(s)) + brelse(SB_BUFFER_WITH_SB(s)); +#ifdef CONFIG_QUOTA + { + int j; + for (j = 0; j < REISERFS_MAXQUOTAS; j++) + kfree(qf_names[j]); + } +#endif + kfree(sbi); + + s->s_fs_info = NULL; + return errval; +} + +static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(dentry->d_sb); + + buf->f_namelen = (REISERFS_MAX_NAME(s->s_blocksize)); + buf->f_bfree = sb_free_blocks(rs); + buf->f_bavail = buf->f_bfree; + buf->f_blocks = sb_block_count(rs) - sb_bmap_nr(rs) - 1; + buf->f_bsize = dentry->d_sb->s_blocksize; + /* changed to accommodate gcc folks. 
*/ + buf->f_type = REISERFS_SUPER_MAGIC; + buf->f_fsid.val[0] = (u32)crc32_le(0, rs->s_uuid, sizeof(rs->s_uuid)/2); + buf->f_fsid.val[1] = (u32)crc32_le(0, rs->s_uuid + sizeof(rs->s_uuid)/2, + sizeof(rs->s_uuid)/2); + + return 0; +} + +#ifdef CONFIG_QUOTA +static int reiserfs_write_dquot(struct dquot *dquot) +{ + struct reiserfs_transaction_handle th; + int ret, err; + int depth; + + reiserfs_write_lock(dquot->dq_sb); + ret = + journal_begin(&th, dquot->dq_sb, + REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); + if (ret) + goto out; + depth = reiserfs_write_unlock_nested(dquot->dq_sb); + ret = dquot_commit(dquot); + reiserfs_write_lock_nested(dquot->dq_sb, depth); + err = journal_end(&th); + if (!ret && err) + ret = err; +out: + reiserfs_write_unlock(dquot->dq_sb); + return ret; +} + +static int reiserfs_acquire_dquot(struct dquot *dquot) +{ + struct reiserfs_transaction_handle th; + int ret, err; + int depth; + + reiserfs_write_lock(dquot->dq_sb); + ret = + journal_begin(&th, dquot->dq_sb, + REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb)); + if (ret) + goto out; + depth = reiserfs_write_unlock_nested(dquot->dq_sb); + ret = dquot_acquire(dquot); + reiserfs_write_lock_nested(dquot->dq_sb, depth); + err = journal_end(&th); + if (!ret && err) + ret = err; +out: + reiserfs_write_unlock(dquot->dq_sb); + return ret; +} + +static int reiserfs_release_dquot(struct dquot *dquot) +{ + struct reiserfs_transaction_handle th; + int ret, err; + + reiserfs_write_lock(dquot->dq_sb); + ret = + journal_begin(&th, dquot->dq_sb, + REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb)); + reiserfs_write_unlock(dquot->dq_sb); + if (ret) { + /* Release dquot anyway to avoid endless cycle in dqput() */ + dquot_release(dquot); + goto out; + } + ret = dquot_release(dquot); + reiserfs_write_lock(dquot->dq_sb); + err = journal_end(&th); + if (!ret && err) + ret = err; + reiserfs_write_unlock(dquot->dq_sb); +out: + return ret; +} + +static int reiserfs_mark_dquot_dirty(struct dquot *dquot) +{ + /* Are we journaling 
quotas? */ + if (REISERFS_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || + REISERFS_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { + dquot_mark_dquot_dirty(dquot); + return reiserfs_write_dquot(dquot); + } else + return dquot_mark_dquot_dirty(dquot); +} + +static int reiserfs_write_info(struct super_block *sb, int type) +{ + struct reiserfs_transaction_handle th; + int ret, err; + int depth; + + /* Data block + inode block */ + reiserfs_write_lock(sb); + ret = journal_begin(&th, sb, 2); + if (ret) + goto out; + depth = reiserfs_write_unlock_nested(sb); + ret = dquot_commit_info(sb, type); + reiserfs_write_lock_nested(sb, depth); + err = journal_end(&th); + if (!ret && err) + ret = err; +out: + reiserfs_write_unlock(sb); + return ret; +} + +/* + * Turn on quotas during mount time - we need to find the quota file and such... + */ +static int reiserfs_quota_on_mount(struct super_block *sb, int type) +{ + return dquot_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type], + REISERFS_SB(sb)->s_jquota_fmt, type); +} + +/* + * Standard function to be called on quota_on + */ +static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, + struct path *path) +{ + int err; + struct inode *inode; + struct reiserfs_transaction_handle th; + int opt = type == USRQUOTA ? REISERFS_USRQUOTA : REISERFS_GRPQUOTA; + + reiserfs_write_lock(sb); + if (!(REISERFS_SB(sb)->s_mount_opt & (1 << opt))) { + err = -EINVAL; + goto out; + } + + /* Quotafile not on the same filesystem? */ + if (path->dentry->d_sb != sb) { + err = -EXDEV; + goto out; + } + inode = d_inode(path->dentry); + /* + * We must not pack tails for quota files on reiserfs for quota + * IO to work + */ + if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) { + err = reiserfs_unpack(inode, NULL); + if (err) { + reiserfs_warning(sb, "super-6520", + "Unpacking tail of quota file failed" + " (%d). Cannot turn on quotas.", err); + err = -EINVAL; + goto out; + } + mark_inode_dirty(inode); + } + /* Journaling quota? 
*/ + if (REISERFS_SB(sb)->s_qf_names[type]) { + /* Quotafile not of fs root? */ + if (path->dentry->d_parent != sb->s_root) + reiserfs_warning(sb, "super-6521", + "Quota file not on filesystem root. " + "Journalled quota will not work."); + } + + /* + * When we journal data on quota file, we have to flush journal to see + * all updates to the file when we bypass pagecache... + */ + if (reiserfs_file_data_log(inode)) { + /* Just start temporary transaction and finish it */ + err = journal_begin(&th, sb, 1); + if (err) + goto out; + err = journal_end_sync(&th); + if (err) + goto out; + } + reiserfs_write_unlock(sb); + return dquot_quota_on(sb, type, format_id, path); +out: + reiserfs_write_unlock(sb); + return err; +} + +/* + * Read data from quotafile - avoid pagecache and such because we cannot afford + * acquiring the locks... As quota files are never truncated and quota code + * itself serializes the operations (and no one else should touch the files) + * we don't have to be afraid of races + */ +static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + unsigned long blk = off >> sb->s_blocksize_bits; + int err = 0, offset = off & (sb->s_blocksize - 1), tocopy; + size_t toread; + struct buffer_head tmp_bh, *bh; + loff_t i_size = i_size_read(inode); + + if (off > i_size) + return 0; + if (off + len > i_size) + len = i_size - off; + toread = len; + while (toread > 0) { + tocopy = + sb->s_blocksize - offset < + toread ? sb->s_blocksize - offset : toread; + tmp_bh.b_state = 0; + /* + * Quota files are without tails so we can safely + * use this function + */ + reiserfs_write_lock(sb); + err = reiserfs_get_block(inode, blk, &tmp_bh, 0); + reiserfs_write_unlock(sb); + if (err) + return err; + if (!buffer_mapped(&tmp_bh)) /* A hole? 
*/ + memset(data, 0, tocopy); + else { + bh = sb_bread(sb, tmp_bh.b_blocknr); + if (!bh) + return -EIO; + memcpy(data, bh->b_data + offset, tocopy); + brelse(bh); + } + offset = 0; + toread -= tocopy; + data += tocopy; + blk++; + } + return len; +} + +/* + * Write to quotafile (we know the transaction is already started and has + * enough credits) + */ +static ssize_t reiserfs_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + unsigned long blk = off >> sb->s_blocksize_bits; + int err = 0, offset = off & (sb->s_blocksize - 1), tocopy; + int journal_quota = REISERFS_SB(sb)->s_qf_names[type] != NULL; + size_t towrite = len; + struct buffer_head tmp_bh, *bh; + + if (!current->journal_info) { + printk(KERN_WARNING "reiserfs: Quota write (off=%llu, len=%llu) cancelled because transaction is not started.\n", + (unsigned long long)off, (unsigned long long)len); + return -EIO; + } + while (towrite > 0) { + tocopy = sb->s_blocksize - offset < towrite ? 
+ sb->s_blocksize - offset : towrite; + tmp_bh.b_state = 0; + reiserfs_write_lock(sb); + err = reiserfs_get_block(inode, blk, &tmp_bh, GET_BLOCK_CREATE); + reiserfs_write_unlock(sb); + if (err) + goto out; + if (offset || tocopy != sb->s_blocksize) + bh = sb_bread(sb, tmp_bh.b_blocknr); + else + bh = sb_getblk(sb, tmp_bh.b_blocknr); + if (!bh) { + err = -EIO; + goto out; + } + lock_buffer(bh); + memcpy(bh->b_data + offset, data, tocopy); + flush_dcache_page(bh->b_page); + set_buffer_uptodate(bh); + unlock_buffer(bh); + reiserfs_write_lock(sb); + reiserfs_prepare_for_journal(sb, bh, 1); + journal_mark_dirty(current->journal_info, bh); + if (!journal_quota) + reiserfs_add_ordered_list(inode, bh); + reiserfs_write_unlock(sb); + brelse(bh); + offset = 0; + towrite -= tocopy; + data += tocopy; + blk++; + } +out: + if (len == towrite) + return err; + if (inode->i_size < off + len - towrite) + i_size_write(inode, off + len - towrite); + inode->i_version++; + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(inode); + return len - towrite; +} + +#endif + +static struct dentry *get_super_block(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super); +} + +static int __init init_reiserfs_fs(void) +{ + int ret; + + ret = init_inodecache(); + if (ret) + return ret; + + reiserfs_proc_info_global_init(); + + ret = register_filesystem(&reiserfs_fs_type); + if (ret) + goto out; + + return 0; +out: + reiserfs_proc_info_global_done(); + destroy_inodecache(); + + return ret; +} + +static void __exit exit_reiserfs_fs(void) +{ + reiserfs_proc_info_global_done(); + unregister_filesystem(&reiserfs_fs_type); + destroy_inodecache(); +} + +struct file_system_type reiserfs_fs_type = { + .owner = THIS_MODULE, + .name = "reiserfs", + .mount = get_super_block, + .kill_sb = reiserfs_kill_sb, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("reiserfs"); + 
+MODULE_DESCRIPTION("ReiserFS journaled filesystem"); +MODULE_AUTHOR("Hans Reiser "); +MODULE_LICENSE("GPL"); + +module_init(init_reiserfs_fs); +module_exit(exit_reiserfs_fs); diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c new file mode 100644 index 000000000..f41e19b4b --- /dev/null +++ b/fs/reiserfs/tail_conversion.c @@ -0,0 +1,317 @@ +/* + * Copyright 1999 Hans Reiser, see reiserfs/README for licensing and copyright + * details + */ + +#include +#include +#include +#include "reiserfs.h" + +/* + * access to tail : when one is going to read tail it must make sure, that is + * not running. direct2indirect and indirect2direct can not run concurrently + */ + +/* + * Converts direct items to an unformatted node. Panics if file has no + * tail. -ENOSPC if no disk space for conversion + */ +/* + * path points to first direct item of the file regardless of how many of + * them are there + */ +int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode, + struct treepath *path, struct buffer_head *unbh, + loff_t tail_offset) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *up_to_date_bh; + struct item_head *p_le_ih = tp_item_head(path); + unsigned long total_tail = 0; + + /* Key to search for the last byte of the converted item. */ + struct cpu_key end_key; + + /* + * new indirect item to be inserted or key + * of unfm pointer to be pasted + */ + struct item_head ind_ih; + int blk_size; + /* returned value for reiserfs_insert_item and clones */ + int retval; + /* Handle on an unformatted node that will be inserted in the tree. */ + unp_t unfm_ptr; + + BUG_ON(!th->t_trans_id); + + REISERFS_SB(sb)->s_direct2indirect++; + + blk_size = sb->s_blocksize; + + /* + * and key to search for append or insert pointer to the new + * unformatted node. 
+ */ + copy_item_head(&ind_ih, p_le_ih); + set_le_ih_k_offset(&ind_ih, tail_offset); + set_le_ih_k_type(&ind_ih, TYPE_INDIRECT); + + /* Set the key to search for the place for new unfm pointer */ + make_cpu_key(&end_key, inode, tail_offset, TYPE_INDIRECT, 4); + + /* FIXME: we could avoid this */ + if (search_for_position_by_key(sb, &end_key, path) == POSITION_FOUND) { + reiserfs_error(sb, "PAP-14030", + "pasted or inserted byte exists in " + "the tree %K. Use fsck to repair.", &end_key); + pathrelse(path); + return -EIO; + } + + p_le_ih = tp_item_head(path); + + unfm_ptr = cpu_to_le32(unbh->b_blocknr); + + if (is_statdata_le_ih(p_le_ih)) { + /* Insert new indirect item. */ + set_ih_free_space(&ind_ih, 0); /* delete at nearest future */ + put_ih_item_len(&ind_ih, UNFM_P_SIZE); + PATH_LAST_POSITION(path)++; + retval = + reiserfs_insert_item(th, path, &end_key, &ind_ih, inode, + (char *)&unfm_ptr); + } else { + /* Paste into last indirect item of an object. */ + retval = reiserfs_paste_into_item(th, path, &end_key, inode, + (char *)&unfm_ptr, + UNFM_P_SIZE); + } + if (retval) { + return retval; + } + /* + * note: from here there are two keys which have matching first + * three key components. They only differ by the fourth one. + */ + + /* Set the key to search for the direct items of the file */ + make_cpu_key(&end_key, inode, max_reiserfs_offset(inode), TYPE_DIRECT, + 4); + + /* + * Move bytes from the direct items to the new unformatted node + * and delete them. 
+ */ + while (1) { + int tail_size; + + /* + * end_key.k_offset is set so, that we will always have found + * last item of the file + */ + if (search_for_position_by_key(sb, &end_key, path) == + POSITION_FOUND) + reiserfs_panic(sb, "PAP-14050", + "direct item (%K) not found", &end_key); + p_le_ih = tp_item_head(path); + RFALSE(!is_direct_le_ih(p_le_ih), + "vs-14055: direct item expected(%K), found %h", + &end_key, p_le_ih); + tail_size = (le_ih_k_offset(p_le_ih) & (blk_size - 1)) + + ih_item_len(p_le_ih) - 1; + + /* + * we only send the unbh pointer if the buffer is not + * up to date. this avoids overwriting good data from + * writepage() with old data from the disk or buffer cache + * Special case: unbh->b_page will be NULL if we are coming + * through DIRECT_IO handler here. + */ + if (!unbh->b_page || buffer_uptodate(unbh) + || PageUptodate(unbh->b_page)) { + up_to_date_bh = NULL; + } else { + up_to_date_bh = unbh; + } + retval = reiserfs_delete_item(th, path, &end_key, inode, + up_to_date_bh); + + total_tail += retval; + + /* done: file does not have direct items anymore */ + if (tail_size == retval) + break; + + } + /* + * if we've copied bytes from disk into the page, we need to zero + * out the unused part of the block (it was not up to date before) + */ + if (up_to_date_bh) { + unsigned pgoff = + (tail_offset + total_tail - 1) & (PAGE_CACHE_SIZE - 1); + char *kaddr = kmap_atomic(up_to_date_bh->b_page); + memset(kaddr + pgoff, 0, blk_size - total_tail); + kunmap_atomic(kaddr); + } + + REISERFS_I(inode)->i_first_direct_byte = U32_MAX; + + return 0; +} + +/* stolen from fs/buffer.c */ +void reiserfs_unmap_buffer(struct buffer_head *bh) +{ + lock_buffer(bh); + if (buffer_journaled(bh) || buffer_journal_dirty(bh)) { + BUG(); + } + clear_buffer_dirty(bh); + /* + * Remove the buffer from whatever list it belongs to. 
We are mostly + * interested in removing it from per-sb j_dirty_buffers list, to avoid + * BUG() on attempt to write not mapped buffer + */ + if ((!list_empty(&bh->b_assoc_buffers) || bh->b_private) && bh->b_page) { + struct inode *inode = bh->b_page->mapping->host; + struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); + spin_lock(&j->j_dirty_buffers_lock); + list_del_init(&bh->b_assoc_buffers); + reiserfs_free_jh(bh); + spin_unlock(&j->j_dirty_buffers_lock); + } + clear_buffer_mapped(bh); + clear_buffer_req(bh); + clear_buffer_new(bh); + bh->b_bdev = NULL; + unlock_buffer(bh); +} + +/* + * this first locks inode (neither reads nor sync are permitted), + * reads tail through page cache, insert direct item. When direct item + * inserted successfully inode is left locked. Return value is always + * what we expect from it (number of cut bytes). But when tail remains + * in the unformatted node, we set mode to SKIP_BALANCING and unlock + * inode + */ +int indirect2direct(struct reiserfs_transaction_handle *th, + struct inode *inode, struct page *page, + struct treepath *path, /* path to the indirect item. */ + const struct cpu_key *item_key, /* Key to look for + * unformatted node + * pointer to be cut. */ + loff_t n_new_file_size, /* New file size. */ + char *mode) +{ + struct super_block *sb = inode->i_sb; + struct item_head s_ih; + unsigned long block_size = sb->s_blocksize; + char *tail; + int tail_len, round_tail_len; + loff_t pos, pos1; /* position of first byte of the tail */ + struct cpu_key key; + + BUG_ON(!th->t_trans_id); + + REISERFS_SB(sb)->s_indirect2direct++; + + *mode = M_SKIP_BALANCING; + + /* store item head path points to. 
*/ + copy_item_head(&s_ih, tp_item_head(path)); + + tail_len = (n_new_file_size & (block_size - 1)); + if (get_inode_sd_version(inode) == STAT_DATA_V2) + round_tail_len = ROUND_UP(tail_len); + else + round_tail_len = tail_len; + + pos = + le_ih_k_offset(&s_ih) - 1 + (ih_item_len(&s_ih) / UNFM_P_SIZE - + 1) * sb->s_blocksize; + pos1 = pos; + + /* + * we are protected by i_mutex. The tail can not disapper, not + * append can be done either + * we are in truncate or packing tail in file_release + */ + + tail = (char *)kmap(page); /* this can schedule */ + + if (path_changed(&s_ih, path)) { + /* re-search indirect item */ + if (search_for_position_by_key(sb, item_key, path) + == POSITION_NOT_FOUND) + reiserfs_panic(sb, "PAP-5520", + "item to be converted %K does not exist", + item_key); + copy_item_head(&s_ih, tp_item_head(path)); +#ifdef CONFIG_REISERFS_CHECK + pos = le_ih_k_offset(&s_ih) - 1 + + (ih_item_len(&s_ih) / UNFM_P_SIZE - + 1) * sb->s_blocksize; + if (pos != pos1) + reiserfs_panic(sb, "vs-5530", "tail position " + "changed while we were reading it"); +#endif + } + + /* Set direct item header to insert. */ + make_le_item_head(&s_ih, NULL, get_inode_item_key_version(inode), + pos1 + 1, TYPE_DIRECT, round_tail_len, + 0xffff /*ih_free_space */ ); + + /* + * we want a pointer to the first byte of the tail in the page. + * the page was locked and this part of the page was up to date when + * indirect2direct was called, so we know the bytes are still valid + */ + tail = tail + (pos & (PAGE_CACHE_SIZE - 1)); + + PATH_LAST_POSITION(path)++; + + key = *item_key; + set_cpu_key_k_type(&key, TYPE_DIRECT); + key.key_length = 4; + /* Insert tail as new direct item in the tree */ + if (reiserfs_insert_item(th, path, &key, &s_ih, inode, + tail ? tail : NULL) < 0) { + /* + * No disk memory. So we can not convert last unformatted node + * to the direct item. In this case we used to adjust + * indirect items's ih_free_space. 
Now ih_free_space is not + * used, it would be ideal to write zeros to corresponding + * unformatted node. For now i_size is considered as guard for + * going out of file size + */ + kunmap(page); + return block_size - round_tail_len; + } + kunmap(page); + + /* make sure to get the i_blocks changes from reiserfs_insert_item */ + reiserfs_update_sd(th, inode); + + /* + * note: we have now the same as in above direct2indirect + * conversion: there are two keys which have matching first three + * key components. They only differ by the fourth one. + */ + + /* + * We have inserted new direct item and must remove last + * unformatted node. + */ + *mode = M_CUT; + + /* we store position of first direct item in the in-core inode */ + /* mark_file_with_tail (inode, pos1 + 1); */ + REISERFS_I(inode)->i_first_direct_byte = pos1 + 1; + + return block_size - round_tail_len; +} diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c new file mode 100644 index 000000000..e87f9b52b --- /dev/null +++ b/fs/reiserfs/xattr.c @@ -0,0 +1,1064 @@ +/* + * linux/fs/reiserfs/xattr.c + * + * Copyright (c) 2002 by Jeff Mahoney, + * + */ + +/* + * In order to implement EA/ACLs in a clean, backwards compatible manner, + * they are implemented as files in a "private" directory. + * Each EA is in its own file, with the directory layout like so (/ is assumed + * to be relative to fs root). Inside the /.reiserfs_priv/xattrs directory, + * directories named using the capital-hex form of the objectid and + * generation number are used. Inside each directory are individual files + * named with the name of the extended attribute. + * + * So, for objectid 12648430, we could have: + * /.reiserfs_priv/xattrs/C0FFEE.0/system.posix_acl_access + * /.reiserfs_priv/xattrs/C0FFEE.0/system.posix_acl_default + * /.reiserfs_priv/xattrs/C0FFEE.0/user.Content-Type + * .. or similar. + * + * The file contents are the text of the EA. The size is known based on the + * stat data describing the file.
+ * + * In the case of system.posix_acl_access and system.posix_acl_default, since + * these are special cases for filesystem ACLs, they are interpreted by the + * kernel. In addition, they are negatively and positively cached and attached + * to the inode so that unnecessary lookups are avoided. + * + * Locking works like so: + * Directory components (xattr root, xattr dir) are protected by their i_mutex. + * The xattrs themselves are protected by the xattr_sem. + */ + +#include "reiserfs.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "xattr.h" +#include "acl.h" +#include +#include +#include +#include +#include +#include + +#define PRIVROOT_NAME ".reiserfs_priv" +#define XAROOT_NAME "xattrs" + + +/* + * Helpers for inode ops. We do this so that we don't have all the VFS + * overhead and also for proper i_mutex annotation. + * dir->i_mutex must be held for all of them. + */ +#ifdef CONFIG_REISERFS_FS_XATTR +static int xattr_create(struct inode *dir, struct dentry *dentry, int mode) +{ + BUG_ON(!mutex_is_locked(&dir->i_mutex)); + return dir->i_op->create(dir, dentry, mode, true); +} +#endif + +static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + BUG_ON(!mutex_is_locked(&dir->i_mutex)); + return dir->i_op->mkdir(dir, dentry, mode); +} + +/* + * We use I_MUTEX_CHILD here to silence lockdep. It's safe because xattr + * mutation ops aren't called during rename or splice, which are the + * only other users of I_MUTEX_CHILD. It violates the ordering, but that's + * better than allocating another subclass just for this code.
+ */ +static int xattr_unlink(struct inode *dir, struct dentry *dentry) +{ + int error; + + BUG_ON(!mutex_is_locked(&dir->i_mutex)); + + mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); + error = dir->i_op->unlink(dir, dentry); + mutex_unlock(&d_inode(dentry)->i_mutex); + + if (!error) + d_delete(dentry); + return error; +} + +static int xattr_rmdir(struct inode *dir, struct dentry *dentry) +{ + int error; + + BUG_ON(!mutex_is_locked(&dir->i_mutex)); + + mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); + error = dir->i_op->rmdir(dir, dentry); + if (!error) + d_inode(dentry)->i_flags |= S_DEAD; + mutex_unlock(&d_inode(dentry)->i_mutex); + if (!error) + d_delete(dentry); + + return error; +} + +#define xattr_may_create(flags) (!flags || flags & XATTR_CREATE) + +static struct dentry *open_xa_root(struct super_block *sb, int flags) +{ + struct dentry *privroot = REISERFS_SB(sb)->priv_root; + struct dentry *xaroot; + + if (d_really_is_negative(privroot)) + return ERR_PTR(-ENODATA); + + mutex_lock_nested(&d_inode(privroot)->i_mutex, I_MUTEX_XATTR); + + xaroot = dget(REISERFS_SB(sb)->xattr_root); + if (!xaroot) + xaroot = ERR_PTR(-ENODATA); + else if (d_really_is_negative(xaroot)) { + int err = -ENODATA; + + if (xattr_may_create(flags)) + err = xattr_mkdir(d_inode(privroot), xaroot, 0700); + if (err) { + dput(xaroot); + xaroot = ERR_PTR(err); + } + } + + mutex_unlock(&d_inode(privroot)->i_mutex); + return xaroot; +} + +static struct dentry *open_xa_dir(const struct inode *inode, int flags) +{ + struct dentry *xaroot, *xadir; + char namebuf[17]; + + xaroot = open_xa_root(inode->i_sb, flags); + if (IS_ERR(xaroot)) + return xaroot; + + snprintf(namebuf, sizeof(namebuf), "%X.%X", + le32_to_cpu(INODE_PKEY(inode)->k_objectid), + inode->i_generation); + + mutex_lock_nested(&d_inode(xaroot)->i_mutex, I_MUTEX_XATTR); + + xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf)); + if (!IS_ERR(xadir) && d_really_is_negative(xadir)) { + int err = -ENODATA; + +
if (xattr_may_create(flags)) + err = xattr_mkdir(d_inode(xaroot), xadir, 0700); + if (err) { + dput(xadir); + xadir = ERR_PTR(err); + } + } + + mutex_unlock(&d_inode(xaroot)->i_mutex); + dput(xaroot); + return xadir; +} + +/* + * The following are side effects of other operations that aren't explicitly + * modifying extended attributes. This includes operations such as permissions + * or ownership changes, object deletions, etc. + */ +struct reiserfs_dentry_buf { + struct dir_context ctx; + struct dentry *xadir; + int count; + struct dentry *dentries[8]; +}; + +static int +fill_with_dentries(struct dir_context *ctx, const char *name, int namelen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct reiserfs_dentry_buf *dbuf = + container_of(ctx, struct reiserfs_dentry_buf, ctx); + struct dentry *dentry; + + WARN_ON_ONCE(!mutex_is_locked(&d_inode(dbuf->xadir)->i_mutex)); + + if (dbuf->count == ARRAY_SIZE(dbuf->dentries)) + return -ENOSPC; + + if (name[0] == '.' && (namelen < 2 || + (namelen == 2 && name[1] == '.'))) + return 0; + + dentry = lookup_one_len(name, dbuf->xadir, namelen); + if (IS_ERR(dentry)) { + return PTR_ERR(dentry); + } else if (d_really_is_negative(dentry)) { + /* A directory entry exists, but no file? 
*/ + reiserfs_error(dentry->d_sb, "xattr-20003", + "Corrupted directory: xattr %pd listed but " + "not found for file %pd.\n", + dentry, dbuf->xadir); + dput(dentry); + return -EIO; + } + + dbuf->dentries[dbuf->count++] = dentry; + return 0; +} + +static void +cleanup_dentry_buf(struct reiserfs_dentry_buf *buf) +{ + int i; + + for (i = 0; i < buf->count; i++) + if (buf->dentries[i]) + dput(buf->dentries[i]); +} + +static int reiserfs_for_each_xattr(struct inode *inode, + int (*action)(struct dentry *, void *), + void *data) +{ + struct dentry *dir; + int i, err = 0; + struct reiserfs_dentry_buf buf = { + .ctx.actor = fill_with_dentries, + }; + + /* Skip out, an xattr has no xattrs associated with it */ + if (IS_PRIVATE(inode) || get_inode_sd_version(inode) == STAT_DATA_V1) + return 0; + + dir = open_xa_dir(inode, XATTR_REPLACE); + if (IS_ERR(dir)) { + err = PTR_ERR(dir); + goto out; + } else if (d_really_is_negative(dir)) { + err = 0; + goto out_dir; + } + + mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_XATTR); + + buf.xadir = dir; + while (1) { + err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx); + if (err) + break; + if (!buf.count) + break; + for (i = 0; !err && i < buf.count && buf.dentries[i]; i++) { + struct dentry *dentry = buf.dentries[i]; + + if (!d_is_dir(dentry)) + err = action(dentry, data); + + dput(dentry); + buf.dentries[i] = NULL; + } + if (err) + break; + buf.count = 0; + } + mutex_unlock(&d_inode(dir)->i_mutex); + + cleanup_dentry_buf(&buf); + + if (!err) { + /* + * We start a transaction here to avoid an ABBA situation + * between the xattr root's i_mutex and the journal lock. + * This doesn't incur much additional overhead since the + * new transaction will just nest inside the + * outer transaction.
+ */ + int blocks = JOURNAL_PER_BALANCE_CNT * 2 + 2 + + 4 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb); + struct reiserfs_transaction_handle th; + + reiserfs_write_lock(inode->i_sb); + err = journal_begin(&th, inode->i_sb, blocks); + reiserfs_write_unlock(inode->i_sb); + if (!err) { + int jerror; + + mutex_lock_nested(&d_inode(dir->d_parent)->i_mutex, + I_MUTEX_XATTR); + err = action(dir, data); + reiserfs_write_lock(inode->i_sb); + jerror = journal_end(&th); + reiserfs_write_unlock(inode->i_sb); + mutex_unlock(&d_inode(dir->d_parent)->i_mutex); + err = jerror ?: err; + } + } +out_dir: + dput(dir); +out: + /* -ENODATA isn't an error */ + if (err == -ENODATA) + err = 0; + return err; +} + +static int delete_one_xattr(struct dentry *dentry, void *data) +{ + struct inode *dir = d_inode(dentry->d_parent); + + /* This is the xattr dir, handle specially. */ + if (d_is_dir(dentry)) + return xattr_rmdir(dir, dentry); + + return xattr_unlink(dir, dentry); +} + +static int chown_one_xattr(struct dentry *dentry, void *data) +{ + struct iattr *attrs = data; + int ia_valid = attrs->ia_valid; + int err; + + /* + * We only want the ownership bits. Otherwise, we'll do + * things like change a directory to a regular file if + * ATTR_MODE is set. + */ + attrs->ia_valid &= (ATTR_UID|ATTR_GID); + err = reiserfs_setattr(dentry, attrs); + attrs->ia_valid = ia_valid; + + return err; +} + +/* No i_mutex, but the inode is unconnected.
*/ +int reiserfs_delete_xattrs(struct inode *inode) +{ + int err = reiserfs_for_each_xattr(inode, delete_one_xattr, NULL); + + if (err) + reiserfs_warning(inode->i_sb, "jdm-20004", + "Couldn't delete all xattrs (%d)\n", err); + return err; +} + +/* inode->i_mutex: down */ +int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs) +{ + int err = reiserfs_for_each_xattr(inode, chown_one_xattr, attrs); + + if (err) + reiserfs_warning(inode->i_sb, "jdm-20007", + "Couldn't chown all xattrs (%d)\n", err); + return err; +} + +#ifdef CONFIG_REISERFS_FS_XATTR +/* + * Returns a dentry corresponding to a specific extended attribute file + * for the inode. If flags allow, the file is created. Otherwise, a + * valid or negative dentry, or an error is returned. + */ +static struct dentry *xattr_lookup(struct inode *inode, const char *name, + int flags) +{ + struct dentry *xadir, *xafile; + int err = 0; + + xadir = open_xa_dir(inode, flags); + if (IS_ERR(xadir)) + return ERR_CAST(xadir); + + mutex_lock_nested(&d_inode(xadir)->i_mutex, I_MUTEX_XATTR); + xafile = lookup_one_len(name, xadir, strlen(name)); + if (IS_ERR(xafile)) { + err = PTR_ERR(xafile); + goto out; + } + + if (d_really_is_positive(xafile) && (flags & XATTR_CREATE)) + err = -EEXIST; + + if (d_really_is_negative(xafile)) { + err = -ENODATA; + if (xattr_may_create(flags)) + err = xattr_create(d_inode(xadir), xafile, + 0700|S_IFREG); + } + + if (err) + dput(xafile); +out: + mutex_unlock(&d_inode(xadir)->i_mutex); + dput(xadir); + if (err) + return ERR_PTR(err); + return xafile; +} + +/* Internal operations on file data */ +static inline void reiserfs_put_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +static struct page *reiserfs_get_page(struct inode *dir, size_t n) +{ + struct address_space *mapping = dir->i_mapping; + struct page *page; + /* + * We can deadlock if we try to free dentries, + * and an unlink/rmdir has just occurred - GFP_NOFS avoids this + */ + 
mapping_set_gfp_mask(mapping, GFP_NOFS); + page = read_mapping_page(mapping, n >> PAGE_CACHE_SHIFT, NULL); + if (!IS_ERR(page)) { + kmap(page); + if (PageError(page)) + goto fail; + } + return page; + +fail: + reiserfs_put_page(page); + return ERR_PTR(-EIO); +} + +static inline __u32 xattr_hash(const char *msg, int len) +{ + return csum_partial(msg, len, 0); +} + +int reiserfs_commit_write(struct file *f, struct page *page, + unsigned from, unsigned to); + +static void update_ctime(struct inode *inode) +{ + struct timespec now = current_fs_time(inode->i_sb); + + if (inode_unhashed(inode) || !inode->i_nlink || + timespec_equal(&inode->i_ctime, &now)) + return; + + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); +} + +static int lookup_and_delete_xattr(struct inode *inode, const char *name) +{ + int err = 0; + struct dentry *dentry, *xadir; + + xadir = open_xa_dir(inode, XATTR_REPLACE); + if (IS_ERR(xadir)) + return PTR_ERR(xadir); + + mutex_lock_nested(&d_inode(xadir)->i_mutex, I_MUTEX_XATTR); + dentry = lookup_one_len(name, xadir, strlen(name)); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out_dput; + } + + if (d_really_is_positive(dentry)) { + err = xattr_unlink(d_inode(xadir), dentry); + update_ctime(inode); + } + + dput(dentry); +out_dput: + mutex_unlock(&d_inode(xadir)->i_mutex); + dput(xadir); + return err; +} + + +/* Generic extended attribute operations that can be used by xa plugins */ + +/* + * inode->i_mutex: down + */ +int +reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, + struct inode *inode, const char *name, + const void *buffer, size_t buffer_size, int flags) +{ + int err = 0; + struct dentry *dentry; + struct page *page; + char *data; + size_t file_pos = 0; + size_t buffer_pos = 0; + size_t new_size; + __u32 xahash = 0; + + if (get_inode_sd_version(inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + if (!buffer) { + err = lookup_and_delete_xattr(inode, name); + return err; + } + + dentry = xattr_lookup(inode, 
name, flags); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + down_write(&REISERFS_I(inode)->i_xattr_sem); + + xahash = xattr_hash(buffer, buffer_size); + while (buffer_pos < buffer_size || buffer_pos == 0) { + size_t chunk; + size_t skip = 0; + size_t page_offset = (file_pos & (PAGE_CACHE_SIZE - 1)); + + if (buffer_size - buffer_pos > PAGE_CACHE_SIZE) + chunk = PAGE_CACHE_SIZE; + else + chunk = buffer_size - buffer_pos; + + page = reiserfs_get_page(d_inode(dentry), file_pos); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out_unlock; + } + + lock_page(page); + data = page_address(page); + + if (file_pos == 0) { + struct reiserfs_xattr_header *rxh; + + skip = file_pos = sizeof(struct reiserfs_xattr_header); + if (chunk + skip > PAGE_CACHE_SIZE) + chunk = PAGE_CACHE_SIZE - skip; + rxh = (struct reiserfs_xattr_header *)data; + rxh->h_magic = cpu_to_le32(REISERFS_XATTR_MAGIC); + rxh->h_hash = cpu_to_le32(xahash); + } + + reiserfs_write_lock(inode->i_sb); + err = __reiserfs_write_begin(page, page_offset, chunk + skip); + if (!err) { + if (buffer) + memcpy(data + skip, buffer + buffer_pos, chunk); + err = reiserfs_commit_write(NULL, page, page_offset, + page_offset + chunk + + skip); + } + reiserfs_write_unlock(inode->i_sb); + unlock_page(page); + reiserfs_put_page(page); + buffer_pos += chunk; + file_pos += chunk; + skip = 0; + if (err || buffer_size == 0 || !buffer) + break; + } + + new_size = buffer_size + sizeof(struct reiserfs_xattr_header); + if (!err && new_size < i_size_read(d_inode(dentry))) { + struct iattr newattrs = { + .ia_ctime = current_fs_time(inode->i_sb), + .ia_size = new_size, + .ia_valid = ATTR_SIZE | ATTR_CTIME, + }; + + mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_XATTR); + inode_dio_wait(d_inode(dentry)); + + err = reiserfs_setattr(dentry, &newattrs); + mutex_unlock(&d_inode(dentry)->i_mutex); + } else + update_ctime(inode); +out_unlock: + up_write(&REISERFS_I(inode)->i_xattr_sem); + dput(dentry); + return err; +} + +/* We need to 
start a transaction to maintain lock ordering */ +int reiserfs_xattr_set(struct inode *inode, const char *name, + const void *buffer, size_t buffer_size, int flags) +{ + + struct reiserfs_transaction_handle th; + int error, error2; + size_t jbegin_count = reiserfs_xattr_nblocks(inode, buffer_size); + + if (!(flags & XATTR_REPLACE)) + jbegin_count += reiserfs_xattr_jcreate_nblocks(inode); + + reiserfs_write_lock(inode->i_sb); + error = journal_begin(&th, inode->i_sb, jbegin_count); + reiserfs_write_unlock(inode->i_sb); + if (error) { + return error; + } + + error = reiserfs_xattr_set_handle(&th, inode, name, + buffer, buffer_size, flags); + + reiserfs_write_lock(inode->i_sb); + error2 = journal_end(&th); + reiserfs_write_unlock(inode->i_sb); + if (error == 0) + error = error2; + + return error; +} + +/* + * inode->i_mutex: down + */ +int +reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer, + size_t buffer_size) +{ + ssize_t err = 0; + struct dentry *dentry; + size_t isize; + size_t file_pos = 0; + size_t buffer_pos = 0; + struct page *page; + __u32 hash = 0; + + if (name == NULL) + return -EINVAL; + + /* + * We can't have xattrs attached to v1 items since they don't have + * generation numbers + */ + if (get_inode_sd_version(inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + dentry = xattr_lookup(inode, name, XATTR_REPLACE); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out; + } + + down_read(&REISERFS_I(inode)->i_xattr_sem); + + isize = i_size_read(d_inode(dentry)); + + /* Just return the size needed */ + if (buffer == NULL) { + err = isize - sizeof(struct reiserfs_xattr_header); + goto out_unlock; + } + + if (buffer_size < isize - sizeof(struct reiserfs_xattr_header)) { + err = -ERANGE; + goto out_unlock; + } + + while (file_pos < isize) { + size_t chunk; + char *data; + size_t skip = 0; + + if (isize - file_pos > PAGE_CACHE_SIZE) + chunk = PAGE_CACHE_SIZE; + else + chunk = isize - file_pos; + + page = 
reiserfs_get_page(d_inode(dentry), file_pos); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out_unlock; + } + + lock_page(page); + data = page_address(page); + if (file_pos == 0) { + struct reiserfs_xattr_header *rxh = + (struct reiserfs_xattr_header *)data; + skip = file_pos = sizeof(struct reiserfs_xattr_header); + chunk -= skip; + /* Magic doesn't match up.. */ + if (rxh->h_magic != cpu_to_le32(REISERFS_XATTR_MAGIC)) { + unlock_page(page); + reiserfs_put_page(page); + reiserfs_warning(inode->i_sb, "jdm-20001", + "Invalid magic for xattr (%s) " + "associated with %k", name, + INODE_PKEY(inode)); + err = -EIO; + goto out_unlock; + } + hash = le32_to_cpu(rxh->h_hash); + } + memcpy(buffer + buffer_pos, data + skip, chunk); + unlock_page(page); + reiserfs_put_page(page); + file_pos += chunk; + buffer_pos += chunk; + skip = 0; + } + err = isize - sizeof(struct reiserfs_xattr_header); + + if (xattr_hash(buffer, isize - sizeof(struct reiserfs_xattr_header)) != + hash) { + reiserfs_warning(inode->i_sb, "jdm-20002", + "Invalid hash for xattr (%s) associated " + "with %k", name, INODE_PKEY(inode)); + err = -EIO; + } + +out_unlock: + up_read(&REISERFS_I(inode)->i_xattr_sem); + dput(dentry); + +out: + return err; +} + +/* + * In order to implement different sets of xattr operations for each xattr + * prefix with the generic xattr API, a filesystem should create a + * null-terminated array of struct xattr_handler (one for each prefix) and + * hang a pointer to it off of the s_xattr field of the superblock. + * + * The generic_fooxattr() functions will use this list to dispatch xattr + * operations to the correct xattr_handler. 
+ */ +#define for_each_xattr_handler(handlers, handler) \ + for ((handler) = *(handlers)++; \ + (handler) != NULL; \ + (handler) = *(handlers)++) + +/* This is the implementation for the xattr plugin infrastructure */ +static inline const struct xattr_handler * +find_xattr_handler_prefix(const struct xattr_handler **handlers, + const char *name) +{ + const struct xattr_handler *xah; + + if (!handlers) + return NULL; + + for_each_xattr_handler(handlers, xah) { + if (strncmp(xah->prefix, name, strlen(xah->prefix)) == 0) + break; + } + + return xah; +} + + +/* + * Inode operation getxattr() + */ +ssize_t +reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer, + size_t size) +{ + const struct xattr_handler *handler; + + handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); + + if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1) + return -EOPNOTSUPP; + + return handler->get(dentry, name, buffer, size, handler->flags); +} + +/* + * Inode operation setxattr() + * + * d_inode(dentry)->i_mutex down + */ +int +reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + const struct xattr_handler *handler; + + handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); + + if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1) + return -EOPNOTSUPP; + + return handler->set(dentry, name, value, size, flags, handler->flags); +} + +/* + * Inode operation removexattr() + * + * d_inode(dentry)->i_mutex down + */ +int reiserfs_removexattr(struct dentry *dentry, const char *name) +{ + const struct xattr_handler *handler; + + handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); + + if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1) + return -EOPNOTSUPP; + + return handler->set(dentry, name, NULL, 0, XATTR_REPLACE, handler->flags); +} + +struct listxattr_buf { + struct dir_context ctx; + size_t size; + size_t pos; + char *buf; + 
struct dentry *dentry; +}; + +static int listxattr_filler(struct dir_context *ctx, const char *name, + int namelen, loff_t offset, u64 ino, + unsigned int d_type) +{ + struct listxattr_buf *b = + container_of(ctx, struct listxattr_buf, ctx); + size_t size; + + if (name[0] != '.' || + (namelen != 1 && (name[1] != '.' || namelen != 2))) { + const struct xattr_handler *handler; + + handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr, + name); + if (!handler) /* Unsupported xattr name */ + return 0; + if (b->buf) { + size = handler->list(b->dentry, b->buf + b->pos, + b->size, name, namelen, + handler->flags); + if (size > b->size) + return -ERANGE; + } else { + size = handler->list(b->dentry, NULL, 0, name, + namelen, handler->flags); + } + + b->pos += size; + } + return 0; +} + +/* + * Inode operation listxattr() + * + * We totally ignore the generic listxattr here because it would be stupid + * not to. Since the xattrs are organized in a directory, we can just + * readdir to find them. + */ +ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) +{ + struct dentry *dir; + int err = 0; + struct listxattr_buf buf = { + .ctx.actor = listxattr_filler, + .dentry = dentry, + .buf = buffer, + .size = buffer ? 
size : 0, + }; + + if (d_really_is_negative(dentry)) + return -EINVAL; + + if (!dentry->d_sb->s_xattr || + get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1) + return -EOPNOTSUPP; + + dir = open_xa_dir(d_inode(dentry), XATTR_REPLACE); + if (IS_ERR(dir)) { + err = PTR_ERR(dir); + if (err == -ENODATA) + err = 0; /* Not an error if there aren't any xattrs */ + goto out; + } + + mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_XATTR); + err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx); + mutex_unlock(&d_inode(dir)->i_mutex); + + if (!err) + err = buf.pos; + + dput(dir); +out: + return err; +} + +static int create_privroot(struct dentry *dentry) +{ + int err; + struct inode *inode = d_inode(dentry->d_parent); + + WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); + + err = xattr_mkdir(inode, dentry, 0700); + if (err || d_really_is_negative(dentry)) { + reiserfs_warning(dentry->d_sb, "jdm-20006", + "xattrs/ACLs enabled and couldn't " + "find/create .reiserfs_priv. " + "Failing mount."); + return -EOPNOTSUPP; + } + + d_inode(dentry)->i_flags |= S_PRIVATE; + reiserfs_info(dentry->d_sb, "Created %s - reserved for xattr " + "storage.\n", PRIVROOT_NAME); + + return 0; +} + +#else +int __init reiserfs_xattr_register_handlers(void) { return 0; } +void reiserfs_xattr_unregister_handlers(void) {} +static int create_privroot(struct dentry *dentry) { return 0; } +#endif + +/* Actual operations that are exported to VFS-land */ +static const struct xattr_handler *reiserfs_xattr_handlers[] = { +#ifdef CONFIG_REISERFS_FS_XATTR + &reiserfs_xattr_user_handler, + &reiserfs_xattr_trusted_handler, +#endif +#ifdef CONFIG_REISERFS_FS_SECURITY + &reiserfs_xattr_security_handler, +#endif +#ifdef CONFIG_REISERFS_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + NULL +}; + +static int xattr_mount_check(struct super_block *s) +{ + /* + * We need generation numbers to ensure that the oid mapping is correct + * v3.5 filesystems don't have them. 
+ */ + if (old_format_only(s)) { + if (reiserfs_xattrs_optional(s)) { + /* + * Old format filesystem, but optional xattrs have + * been enabled. Error out. + */ + reiserfs_warning(s, "jdm-2005", + "xattrs/ACLs not supported " + "on pre-v3.6 format filesystems. " + "Failing mount."); + return -EOPNOTSUPP; + } + } + + return 0; +} + +int reiserfs_permission(struct inode *inode, int mask) +{ + /* + * We don't do permission checks on the internal objects. + * Permissions are determined by the "owning" object. + */ + if (IS_PRIVATE(inode)) + return 0; + + return generic_permission(inode, mask); +} + +static int xattr_hide_revalidate(struct dentry *dentry, unsigned int flags) +{ + return -EPERM; +} + +static const struct dentry_operations xattr_lookup_poison_ops = { + .d_revalidate = xattr_hide_revalidate, +}; + +int reiserfs_lookup_privroot(struct super_block *s) +{ + struct dentry *dentry; + int err = 0; + + /* If we don't have the privroot located yet - go find it */ + mutex_lock(&d_inode(s->s_root)->i_mutex); + dentry = lookup_one_len(PRIVROOT_NAME, s->s_root, + strlen(PRIVROOT_NAME)); + if (!IS_ERR(dentry)) { + REISERFS_SB(s)->priv_root = dentry; + d_set_d_op(dentry, &xattr_lookup_poison_ops); + if (d_really_is_positive(dentry)) + d_inode(dentry)->i_flags |= S_PRIVATE; + } else + err = PTR_ERR(dentry); + mutex_unlock(&d_inode(s->s_root)->i_mutex); + + return err; +} + +/* + * We need to take a copy of the mount flags since things like + * MS_RDONLY don't get set until *after* we're called. 
+ * mount_flags != mount_options + */ +int reiserfs_xattr_init(struct super_block *s, int mount_flags) +{ + int err = 0; + struct dentry *privroot = REISERFS_SB(s)->priv_root; + + err = xattr_mount_check(s); + if (err) + goto error; + + if (d_really_is_negative(privroot) && !(mount_flags & MS_RDONLY)) { + mutex_lock(&d_inode(s->s_root)->i_mutex); + err = create_privroot(REISERFS_SB(s)->priv_root); + mutex_unlock(&d_inode(s->s_root)->i_mutex); + } + + if (d_really_is_positive(privroot)) { + s->s_xattr = reiserfs_xattr_handlers; + mutex_lock(&d_inode(privroot)->i_mutex); + if (!REISERFS_SB(s)->xattr_root) { + struct dentry *dentry; + + dentry = lookup_one_len(XAROOT_NAME, privroot, + strlen(XAROOT_NAME)); + if (!IS_ERR(dentry)) + REISERFS_SB(s)->xattr_root = dentry; + else + err = PTR_ERR(dentry); + } + mutex_unlock(&d_inode(privroot)->i_mutex); + } + +error: + if (err) { + clear_bit(REISERFS_XATTRS_USER, &REISERFS_SB(s)->s_mount_opt); + clear_bit(REISERFS_POSIXACL, &REISERFS_SB(s)->s_mount_opt); + } + + /* The super_block MS_POSIXACL must mirror the (no)acl mount option. 
*/ + if (reiserfs_posixacl(s)) + s->s_flags |= MS_POSIXACL; + else + s->s_flags &= ~MS_POSIXACL; + + return err; +} diff --git a/fs/reiserfs/xattr.h b/fs/reiserfs/xattr.h new file mode 100644 index 000000000..15dde6262 --- /dev/null +++ b/fs/reiserfs/xattr.h @@ -0,0 +1,122 @@ +#include +#include +#include +#include + +struct inode; +struct dentry; +struct iattr; +struct super_block; + +int reiserfs_xattr_register_handlers(void) __init; +void reiserfs_xattr_unregister_handlers(void); +int reiserfs_xattr_init(struct super_block *sb, int mount_flags); +int reiserfs_lookup_privroot(struct super_block *sb); +int reiserfs_delete_xattrs(struct inode *inode); +int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs); +int reiserfs_permission(struct inode *inode, int mask); + +#ifdef CONFIG_REISERFS_FS_XATTR +#define has_xattr_dir(inode) (REISERFS_I(inode)->i_flags & i_has_xattr_dir) +ssize_t reiserfs_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size); +int reiserfs_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags); +ssize_t reiserfs_listxattr(struct dentry *dentry, char *buffer, size_t size); +int reiserfs_removexattr(struct dentry *dentry, const char *name); + +int reiserfs_xattr_get(struct inode *, const char *, void *, size_t); +int reiserfs_xattr_set(struct inode *, const char *, const void *, size_t, int); +int reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *, + struct inode *, const char *, const void *, + size_t, int); + +extern const struct xattr_handler reiserfs_xattr_user_handler; +extern const struct xattr_handler reiserfs_xattr_trusted_handler; +extern const struct xattr_handler reiserfs_xattr_security_handler; +#ifdef CONFIG_REISERFS_FS_SECURITY +int reiserfs_security_init(struct inode *dir, struct inode *inode, + const struct qstr *qstr, + struct reiserfs_security_handle *sec); +int reiserfs_security_write(struct reiserfs_transaction_handle *th, + struct inode 
*inode, + struct reiserfs_security_handle *sec); +void reiserfs_security_free(struct reiserfs_security_handle *sec); +#endif + +static inline int reiserfs_xattrs_initialized(struct super_block *sb) +{ + return REISERFS_SB(sb)->priv_root != NULL; +} + +#define xattr_size(size) ((size) + sizeof(struct reiserfs_xattr_header)) +static inline loff_t reiserfs_xattr_nblocks(struct inode *inode, loff_t size) +{ + loff_t ret = 0; + if (reiserfs_file_data_log(inode)) { + ret = _ROUND_UP(xattr_size(size), inode->i_sb->s_blocksize); + ret >>= inode->i_sb->s_blocksize_bits; + } + return ret; +} + +/* + * We may have to create up to 3 objects: xattr root, xattr dir, xattr file. + * Let's try to be smart about it. + * xattr root: We cache it. If it's not cached, we may need to create it. + * xattr dir: If anything has been loaded for this inode, we can set a flag + * saying so. + * xattr file: Since we don't cache xattrs, we can't tell. We always include + * blocks for it. + * + * However, since root and dir can be created between calls - YOU MUST SAVE + * THIS VALUE. 
+ */ +static inline size_t reiserfs_xattr_jcreate_nblocks(struct inode *inode) +{ + size_t nblocks = JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb); + + if ((REISERFS_I(inode)->i_flags & i_has_xattr_dir) == 0) { + nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb); + if (d_really_is_negative(REISERFS_SB(inode->i_sb)->xattr_root)) + nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb); + } + + return nblocks; +} + +static inline void reiserfs_init_xattr_rwsem(struct inode *inode) +{ + init_rwsem(&REISERFS_I(inode)->i_xattr_sem); +} + +#else + +#define reiserfs_getxattr NULL +#define reiserfs_setxattr NULL +#define reiserfs_listxattr NULL +#define reiserfs_removexattr NULL + +static inline void reiserfs_init_xattr_rwsem(struct inode *inode) +{ +} +#endif /* CONFIG_REISERFS_FS_XATTR */ + +#ifndef CONFIG_REISERFS_FS_SECURITY +static inline int reiserfs_security_init(struct inode *dir, + struct inode *inode, + const struct qstr *qstr, + struct reiserfs_security_handle *sec) +{ + return 0; +} +static inline int +reiserfs_security_write(struct reiserfs_transaction_handle *th, + struct inode *inode, + struct reiserfs_security_handle *sec) +{ + return 0; +} +static inline void reiserfs_security_free(struct reiserfs_security_handle *sec) +{} +#endif diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c new file mode 100644 index 000000000..4b34b9dc0 --- /dev/null +++ b/fs/reiserfs/xattr_acl.c @@ -0,0 +1,407 @@ +#include +#include +#include +#include "reiserfs.h" +#include +#include +#include +#include +#include +#include "xattr.h" +#include "acl.h" +#include + +static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th, + struct inode *inode, int type, + struct posix_acl *acl); + + +int +reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int error, error2; + struct reiserfs_transaction_handle th; + size_t jcreate_blocks; + int size = acl ? 
posix_acl_xattr_size(acl->a_count) : 0; + + + /* + * Pessimism: We can't assume that anything from the xattr root up + * has been created. + */ + + jcreate_blocks = reiserfs_xattr_jcreate_nblocks(inode) + + reiserfs_xattr_nblocks(inode, size) * 2; + + reiserfs_write_lock(inode->i_sb); + error = journal_begin(&th, inode->i_sb, jcreate_blocks); + reiserfs_write_unlock(inode->i_sb); + if (error == 0) { + error = __reiserfs_set_acl(&th, inode, type, acl); + reiserfs_write_lock(inode->i_sb); + error2 = journal_end(&th); + reiserfs_write_unlock(inode->i_sb); + if (error2) + error = error2; + } + + return error; +} + +/* + * Convert from filesystem to in-memory representation. + */ +static struct posix_acl *reiserfs_posix_acl_from_disk(const void *value, size_t size) +{ + const char *end = (char *)value + size; + int n, count; + struct posix_acl *acl; + + if (!value) + return NULL; + if (size < sizeof(reiserfs_acl_header)) + return ERR_PTR(-EINVAL); + if (((reiserfs_acl_header *) value)->a_version != + cpu_to_le32(REISERFS_ACL_VERSION)) + return ERR_PTR(-EINVAL); + value = (char *)value + sizeof(reiserfs_acl_header); + count = reiserfs_acl_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + acl = posix_acl_alloc(count, GFP_NOFS); + if (!acl) + return ERR_PTR(-ENOMEM); + for (n = 0; n < count; n++) { + reiserfs_acl_entry *entry = (reiserfs_acl_entry *) value; + if ((char *)value + sizeof(reiserfs_acl_entry_short) > end) + goto fail; + acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); + acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); + switch (acl->a_entries[n].e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + value = (char *)value + + sizeof(reiserfs_acl_entry_short); + break; + + case ACL_USER: + value = (char *)value + sizeof(reiserfs_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_uid = + make_kuid(&init_user_ns, + le32_to_cpu(entry->e_id)); + break; + case 
ACL_GROUP: + value = (char *)value + sizeof(reiserfs_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_gid = + make_kgid(&init_user_ns, + le32_to_cpu(entry->e_id)); + break; + + default: + goto fail; + } + } + if (value != end) + goto fail; + return acl; + +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} + +/* + * Convert from in-memory to filesystem representation. + */ +static void *reiserfs_posix_acl_to_disk(const struct posix_acl *acl, size_t * size) +{ + reiserfs_acl_header *ext_acl; + char *e; + int n; + + *size = reiserfs_acl_size(acl->a_count); + ext_acl = kmalloc(sizeof(reiserfs_acl_header) + + acl->a_count * + sizeof(reiserfs_acl_entry), + GFP_NOFS); + if (!ext_acl) + return ERR_PTR(-ENOMEM); + ext_acl->a_version = cpu_to_le32(REISERFS_ACL_VERSION); + e = (char *)ext_acl + sizeof(reiserfs_acl_header); + for (n = 0; n < acl->a_count; n++) { + const struct posix_acl_entry *acl_e = &acl->a_entries[n]; + reiserfs_acl_entry *entry = (reiserfs_acl_entry *) e; + entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); + entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); + switch (acl->a_entries[n].e_tag) { + case ACL_USER: + entry->e_id = cpu_to_le32( + from_kuid(&init_user_ns, acl_e->e_uid)); + e += sizeof(reiserfs_acl_entry); + break; + case ACL_GROUP: + entry->e_id = cpu_to_le32( + from_kgid(&init_user_ns, acl_e->e_gid)); + e += sizeof(reiserfs_acl_entry); + break; + + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + e += sizeof(reiserfs_acl_entry_short); + break; + + default: + goto fail; + } + } + return (char *)ext_acl; + +fail: + kfree(ext_acl); + return ERR_PTR(-EINVAL); +} + +/* + * Inode operation get_posix_acl(). 
+ * + * inode->i_mutex: down + * BKL held [before 2.5.x] + * + * Reads the xattr selected by @type (POSIX_ACL_XATTR_ACCESS or + * POSIX_ACL_XATTR_DEFAULT), converts it with + * reiserfs_posix_acl_from_disk() and stores every non-error result, + * including NULL, through set_cached_acl(). Any other @type hits BUG(). + * + * Returns the ACL, NULL when the xattr lookup reports -ENODATA or + * -ENOSYS, or an ERR_PTR() for other failures (-ENOMEM when the + * temporary buffer cannot be allocated). + */ +struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) +{ + char *name, *value; + struct posix_acl *acl; + int size; + int retval; + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + BUG(); + } + /* first call passes no buffer; its result is used as the length to allocate */ + size = reiserfs_xattr_get(inode, name, NULL, 0); + if (size < 0) { + if (size == -ENODATA || size == -ENOSYS) { + set_cached_acl(inode, type, NULL); + return NULL; + } + return ERR_PTR(size); + } + + value = kmalloc(size, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + + retval = reiserfs_xattr_get(inode, name, value, size); + if (retval == -ENODATA || retval == -ENOSYS) { + /* + * This shouldn't actually happen as it should have + * been caught above.. but just in case + */ + acl = NULL; + } else if (retval < 0) { + acl = ERR_PTR(retval); + } else { + acl = reiserfs_posix_acl_from_disk(value, retval); + } + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); + + kfree(value); + return acl; +} + +/* + * Inode operation set_posix_acl(). + * + * inode->i_mutex: down + * BKL held [before 2.5.x] + * + * Stores @acl as the xattr selected by @type inside the caller's + * transaction @th. For ACL_TYPE_ACCESS the ACL is first passed to + * posix_acl_equiv_mode() together with &inode->i_mode; a negative result + * is returned as is, and a result of 0 drops the ACL (acl = NULL) so that + * a NULL value is handed to reiserfs_xattr_set_handle(). For + * ACL_TYPE_DEFAULT on a non-directory the function returns -EACCES when + * @acl is set and 0 otherwise. Any other @type returns -EINVAL. + */ +static int +__reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, + int type, struct posix_acl *acl) +{ + char *name; + void *value = NULL; + size_t size = 0; + int error; + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + if (acl) { + error = posix_acl_equiv_mode(acl, &inode->i_mode); + if (error < 0) + return error; + else { + if (error == 0) + acl = NULL; + } + } + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ?
-EACCES : 0; + break; + default: + return -EINVAL; + } + /* value stays NULL and size stays 0 when there is no ACL to store */ + if (acl) { + value = reiserfs_posix_acl_to_disk(acl, &size); + if (IS_ERR(value)) + return (int)PTR_ERR(value); + } + + error = reiserfs_xattr_set_handle(th, inode, name, value, size, 0); + + /* + * Ensure that the inode gets dirtied if we're only using + * the mode bits and an old ACL didn't exist. We don't need + * to check if the inode is hashed here since we won't get + * called by reiserfs_inherit_default_acl(). + * + * NOTE(review): reiserfs_inherit_default_acl() in this file does + * call this function, so the last sentence looks stale - confirm. + */ + if (error == -ENODATA) { + error = 0; + if (type == ACL_TYPE_ACCESS) { + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + } + } + + kfree(value); + /* the cache is only updated when the xattr write succeeded */ + if (!error) + set_cached_acl(inode, type, acl); + + return error; +} + +/* + * dir->i_mutex: locked, + * inode is new and not released into the wild yet + * + * Applies the ACLs that @inode inherits from @dir inside transaction @th. + * Symlinks return 0 immediately. When @dir uses STAT_DATA_V1 or is + * private, nothing is inherited and the current umask is applied to + * inode->i_mode instead (a private @dir also marks @inode S_PRIVATE). + * Otherwise posix_acl_create() supplies the default and access ACLs, + * which are stored through __reiserfs_set_acl(). + * + * Returns 0 or a negative errno. @dentry is not used by this function. + */ +int +reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, + struct inode *dir, struct dentry *dentry, + struct inode *inode) +{ + struct posix_acl *default_acl, *acl; + int err = 0; + + /* ACLs only get applied to files and directories */ + if (S_ISLNK(inode->i_mode)) + return 0; + + /* + * ACLs can only be used on "new" objects, so if it's an old object + * there is nothing to inherit from + */ + if (get_inode_sd_version(dir) == STAT_DATA_V1) + goto apply_umask; + + /* + * Don't apply ACLs to objects in the .reiserfs_priv tree..
This + * would be useless since permissions are ignored, and a pain because + * it introduces locking cycles + */ + if (IS_PRIVATE(dir)) { + inode->i_flags |= S_PRIVATE; + goto apply_umask; + } + + err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (err) + return err; + /* + * Store the default ACL first; the access ACL is only written when + * that did not fail. Both references are released regardless of + * the outcome. + */ + if (default_acl) { + err = __reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT, + default_acl); + posix_acl_release(default_acl); + } + if (acl) { + if (!err) + err = __reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS, + acl); + posix_acl_release(acl); + } + + return err; + +apply_umask: + /* no ACL, apply umask */ + inode->i_mode &= ~current_umask(); + return err; +} + +/* This is used to cache the default acl before a new object is created. + * The biggest reason for this is to get an idea of how many blocks will + * actually be required for the create operation if we must inherit an ACL. + * An ACL write can add up to 3 object creations and an additional file write + * so we'd prefer not to reserve that many blocks in the journal if we can. + * It also has the advantage of not loading the ACL with a transaction open, + * this may seem silly, but if the owner of the directory is doing the + * creation, the ACL may not be loaded since the permissions wouldn't require + * it. + * We return the number of blocks required for the transaction. + * Private inodes, and inodes whose default ACL is absent or could not be + * loaded, yield 0. + */ +int reiserfs_cache_default_acl(struct inode *inode) +{ + struct posix_acl *acl; + int nblocks = 0; + + if (IS_PRIVATE(inode)) + return 0; + + acl = reiserfs_get_acl(inode, ACL_TYPE_DEFAULT); + /* both NULL (no default ACL) and ERR_PTR results leave nblocks at 0 */ + if (acl && !IS_ERR(acl)) { + int size = reiserfs_acl_size(acl->a_count); + + /* Other xattrs can be created during inode creation. We don't + * want to claim too many blocks, so we check to see if we + * need to create the tree to the xattrs, and then we + * just want two files.
*/ + nblocks = reiserfs_xattr_jcreate_nblocks(inode); + nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb); + + REISERFS_I(inode)->i_flags |= i_has_xattr_dir; + + /* We need to account for writes + bitmaps for two files */ + nblocks += reiserfs_xattr_nblocks(inode, size) * 4; + posix_acl_release(acl); + } + + return nblocks; +} + +/* + * Called under i_mutex + * + * Returns 0 without calling posix_acl_chmod() for private inodes, for + * inodes whose stat data version is STAT_DATA_V1, and when + * reiserfs_posixacl() is false for the superblock; otherwise returns the + * result of posix_acl_chmod(inode, inode->i_mode). + */ +int reiserfs_acl_chmod(struct inode *inode) +{ + if (IS_PRIVATE(inode)) + return 0; + if (get_inode_sd_version(inode) == STAT_DATA_V1 || + !reiserfs_posixacl(inode->i_sb)) + return 0; + + return posix_acl_chmod(inode, inode->i_mode); +} diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c new file mode 100644 index 000000000..9a3b0616f --- /dev/null +++ b/fs/reiserfs/xattr_security.c @@ -0,0 +1,120 @@ +#include "reiserfs.h" +#include +#include +#include +#include +#include +#include "xattr.h" +#include +#include + /* + * xattr_handler .get for the XATTR_SECURITY_PREFIX namespace. + * + * Returns -EINVAL when strlen(name) is below sizeof(XATTR_SECURITY_PREFIX) + * (presumably so that something follows the prefix - the macro is defined + * outside this file), -EPERM for private inodes, and otherwise the result + * of reiserfs_xattr_get(). @handler_flags is unused. + */ +static int +security_get(struct dentry *dentry, const char *name, void *buffer, size_t size, + int handler_flags) +{ + if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) + return -EINVAL; + + if (IS_PRIVATE(d_inode(dentry))) + return -EPERM; + + return reiserfs_xattr_get(d_inode(dentry), name, buffer, size); +} + /* + * xattr_handler .set for the XATTR_SECURITY_PREFIX namespace. + * + * Applies the same name-length and private-inode checks as security_get() + * and then forwards to reiserfs_xattr_set(). @handler_flags is unused. + */ +static int +security_set(struct dentry *dentry, const char *name, const void *buffer, + size_t size, int flags, int handler_flags) +{ + if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) + return -EINVAL; + + if (IS_PRIVATE(d_inode(dentry))) + return -EPERM; + + return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags); +} + /* + * xattr_handler .list for the XATTR_SECURITY_PREFIX namespace. + * + * Returns 0 for private inodes. Otherwise returns namelen + 1 and, when + * @list is non-NULL and has room for that many bytes, copies @name into + * it followed by a NUL. The length is returned even if nothing was copied. + */ +static size_t security_list(struct dentry *dentry, char *list, size_t list_len, + const char *name, size_t namelen, int handler_flags) +{ + const size_t len = namelen + 1; + + if (IS_PRIVATE(d_inode(dentry))) + return 0; + + if (list && len <= list_len) { + memcpy(list, name, namelen); + list[namelen] = '\0'; + } + + return len; +} + +/* Initializes the security context for a new inode and returns the number + * of blocks needed for
the transaction. If successful, reiserfs_security + * must be released using reiserfs_security_free when the caller is done. */ +int reiserfs_security_init(struct inode *dir, struct inode *inode, + const struct qstr *qstr, + struct reiserfs_security_handle *sec) +{ + int blocks = 0; + int error; + + sec->name = NULL; + + /* Don't add selinux attributes on xattrs - they'll never get used */ + if (IS_PRIVATE(dir)) + return 0; + + error = security_old_inode_init_security(inode, dir, qstr, &sec->name, + &sec->value, &sec->length); + if (error) { + if (error == -EOPNOTSUPP) + error = 0; + + sec->name = NULL; + sec->value = NULL; + sec->length = 0; + return error; + } + + if (sec->length && reiserfs_xattrs_initialized(inode->i_sb)) { + blocks = reiserfs_xattr_jcreate_nblocks(inode) + + reiserfs_xattr_nblocks(inode, sec->length); + /* We don't want to count the directories twice if we have + * a default ACL. */ + REISERFS_I(inode)->i_flags |= i_has_xattr_dir; + } + return blocks; +} + +int reiserfs_security_write(struct reiserfs_transaction_handle *th, + struct inode *inode, + struct reiserfs_security_handle *sec) +{ + int error; + if (strlen(sec->name) < sizeof(XATTR_SECURITY_PREFIX)) + return -EINVAL; + + error = reiserfs_xattr_set_handle(th, inode, sec->name, sec->value, + sec->length, XATTR_CREATE); + if (error == -ENODATA || error == -EOPNOTSUPP) + error = 0; + + return error; +} + +void reiserfs_security_free(struct reiserfs_security_handle *sec) +{ + kfree(sec->name); + kfree(sec->value); + sec->name = NULL; + sec->value = NULL; +} + +const struct xattr_handler reiserfs_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .get = security_get, + .set = security_set, + .list = security_list, +}; diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c new file mode 100644 index 000000000..e4f134371 --- /dev/null +++ b/fs/reiserfs/xattr_trusted.c @@ -0,0 +1,56 @@ +#include "reiserfs.h" +#include +#include +#include +#include +#include 
+#include "xattr.h" +#include + /* + * xattr_handler .get for the XATTR_TRUSTED_PREFIX namespace. + * + * Returns -EINVAL when strlen(name) is below sizeof(XATTR_TRUSTED_PREFIX) + * (presumably so that something follows the prefix - the macro is defined + * outside this file), -EPERM when the caller lacks CAP_SYS_ADMIN or the + * inode is private, and otherwise the result of reiserfs_xattr_get(). + * @handler_flags is unused. + */ +static int +trusted_get(struct dentry *dentry, const char *name, void *buffer, size_t size, + int handler_flags) +{ + if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry))) + return -EPERM; + + return reiserfs_xattr_get(d_inode(dentry), name, buffer, size); +} + /* + * xattr_handler .set for the XATTR_TRUSTED_PREFIX namespace. + * + * Applies the same checks as trusted_get() and then forwards to + * reiserfs_xattr_set(). @handler_flags is unused. + */ +static int +trusted_set(struct dentry *dentry, const char *name, const void *buffer, + size_t size, int flags, int handler_flags) +{ + if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry))) + return -EPERM; + + return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags); +} + /* + * xattr_handler .list for the XATTR_TRUSTED_PREFIX namespace. + * + * Returns 0 when the caller lacks CAP_SYS_ADMIN or the inode is private. + * Otherwise returns name_len + 1 and, when @list is non-NULL and has room + * for that many bytes, copies @name into it followed by a NUL. + */ +static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int handler_flags) +{ + const size_t len = name_len + 1; + + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry))) + return 0; + + if (list && len <= list_size) { + memcpy(list, name, name_len); + list[name_len] = '\0'; + } + return len; +} + /* Handler table for the XATTR_TRUSTED_PREFIX namespace. */ +const struct xattr_handler reiserfs_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .get = trusted_get, + .set = trusted_set, + .list = trusted_list, +}; diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c new file mode 100644 index 000000000..d0b08d3e5 --- /dev/null +++ b/fs/reiserfs/xattr_user.c @@ -0,0 +1,52 @@ +#include "reiserfs.h" +#include +#include +#include +#include +#include "xattr.h" +#include + /* + * xattr_handler .get for the XATTR_USER_PREFIX namespace. + * + * Returns -EINVAL when strlen(name) is below sizeof(XATTR_USER_PREFIX), + * -EOPNOTSUPP when reiserfs_xattrs_user() is false for the superblock + * (presumably the user-xattr mount setting - confirm), and otherwise the + * result of reiserfs_xattr_get(). Unlike trusted_get() there is no + * IS_PRIVATE() check here. @handler_flags is unused. + */ +static int +user_get(struct dentry *dentry, const char *name, void *buffer, size_t size, + int handler_flags) +{ + + if (strlen(name) < sizeof(XATTR_USER_PREFIX)) + return -EINVAL; + if (!reiserfs_xattrs_user(dentry->d_sb)) + return -EOPNOTSUPP; + return reiserfs_xattr_get(d_inode(dentry), name, buffer, size); +} + /* + * xattr_handler .set for the XATTR_USER_PREFIX namespace; performs the + * same checks as user_get() before forwarding to reiserfs_xattr_set(). + */ +static int +user_set(struct dentry *dentry, const char *name, const void *buffer, + size_t size, int flags, int handler_flags) +{ + if
(strlen(name) < sizeof(XATTR_USER_PREFIX)) + return -EINVAL; + + if (!reiserfs_xattrs_user(dentry->d_sb)) + return -EOPNOTSUPP; + return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags); +} + /* + * xattr_handler .list for the XATTR_USER_PREFIX namespace. + * + * Returns 0 when reiserfs_xattrs_user() is false for the superblock. + * Otherwise returns name_len + 1 and, when @list is non-NULL and has room + * for that many bytes, copies @name into it followed by a NUL. The length + * is returned even if nothing was copied. @handler_flags is unused. + */ +static size_t user_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int handler_flags) +{ + const size_t len = name_len + 1; + + if (!reiserfs_xattrs_user(dentry->d_sb)) + return 0; + if (list && len <= list_size) { + memcpy(list, name, name_len); + list[name_len] = '\0'; + } + return len; +} + /* Handler table for the XATTR_USER_PREFIX namespace. */ +const struct xattr_handler reiserfs_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .get = user_get, + .set = user_set, + .list = user_list, +}; -- cgit v1.2.3