diff options
Diffstat (limited to 'fs/ocfs2')
35 files changed, 2015 insertions, 1017 deletions
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index ce210d495..e27e65279 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -41,7 +41,8 @@ ocfs2-objs := \ quota_local.o \ quota_global.o \ xattr.o \ - acl.o + acl.o \ + filecheck.o ocfs2_stackglue-objs := stackglue.o ocfs2_stack_o2cb-objs := stack_o2cb.o diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 0cdf497c9..216243472 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -322,3 +322,90 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) brelse(di_bh); return acl; } + +int ocfs2_acl_chmod(struct inode *inode, struct buffer_head *bh) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct posix_acl *acl; + int ret; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return 0; + + acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, bh); + if (IS_ERR(acl) || !acl) + return PTR_ERR(acl); + ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + if (ret) + return ret; + ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS, + acl, NULL, NULL); + posix_acl_release(acl); + return ret; +} + +/* + * Initialize the ACLs of a new inode. If parent directory has default ACL, + * then clone to new inode. Called from ocfs2_mknod. + */ +int ocfs2_init_acl(handle_t *handle, + struct inode *inode, + struct inode *dir, + struct buffer_head *di_bh, + struct buffer_head *dir_bh, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct posix_acl *acl = NULL; + int ret = 0, ret2; + umode_t mode; + + if (!S_ISLNK(inode->i_mode)) { + if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { + acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT, + dir_bh); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + if (!acl) { + mode = inode->i_mode & ~current_umask(); + ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode); + if (ret) { + mlog_errno(ret); + goto cleanup; + } + } + } + if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { + if (S_ISDIR(inode->i_mode)) { + ret = ocfs2_set_acl(handle, inode, di_bh, + ACL_TYPE_DEFAULT, acl, + meta_ac, data_ac); + if (ret) + goto cleanup; + } + mode = inode->i_mode; + ret = __posix_acl_create(&acl, GFP_NOFS, &mode); + if (ret < 0) + return ret; + + ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode); + if (ret2) { + mlog_errno(ret2); + ret = ret2; + goto cleanup; + } + if (ret > 0) { + ret = ocfs2_set_acl(handle, inode, + di_bh, ACL_TYPE_ACCESS, + acl, meta_ac, data_ac); + } + } +cleanup: + posix_acl_release(acl); + return ret; +} diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index 3fce68d08..2783a75b3 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -35,5 +35,10 @@ int ocfs2_set_acl(handle_t *handle, struct posix_acl *acl, struct ocfs2_alloc_context *meta_ac, struct ocfs2_alloc_context *data_ac); +extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *); +extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, + struct buffer_head *, struct buffer_head *, + struct ocfs2_alloc_context *, + struct ocfs2_alloc_context *); #endif /* OCFS2_ACL_H */ diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index d002579c6..e361d1a0c 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -2516,21 +2516,6 @@ static int ocfs2_update_edge_lengths(handle_t *handle, struct ocfs2_extent_block *eb; u32 range; - /* - * In normal tree rotation process, we will never touch the - * tree branch above subtree_index and ocfs2_extend_rotate_transaction - * doesn't reserve the credits for them either. - * - * But we do have a special case here which will update the rightmost - * records for all the bh in the path. - * So we have to allocate extra credits and access them. - */ - ret = ocfs2_extend_trans(handle, subtree_index); - if (ret) { - mlog_errno(ret); - goto out; - } - ret = ocfs2_journal_access_path(et->et_ci, handle, path); if (ret) { mlog_errno(ret); @@ -2956,7 +2941,7 @@ static int __ocfs2_rotate_tree_left(handle_t *handle, right_path->p_node[subtree_root].bh->b_blocknr, right_path->p_tree_depth); - ret = ocfs2_extend_rotate_transaction(handle, subtree_root, + ret = ocfs2_extend_rotate_transaction(handle, 0, orig_credits, left_path); if (ret) { mlog_errno(ret); @@ -3029,21 +3014,9 @@ static int ocfs2_remove_rightmost_path(handle_t *handle, struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; - ret = ocfs2_et_sanity_check(et); if (ret) goto out; - /* - * There's two ways we handle this depending on - * whether path is the only existing one. - */ - ret = ocfs2_extend_rotate_transaction(handle, 0, - handle->h_buffer_credits, - path); - if (ret) { - mlog_errno(ret); - goto out; - } ret = ocfs2_journal_access_path(et->et_ci, handle, path); if (ret) { @@ -3641,6 +3614,14 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path, */ if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 && le16_to_cpu(el->l_next_free_rec) == 1) { + /* extend credit for ocfs2_remove_rightmost_path */ + ret = ocfs2_extend_rotate_transaction(handle, 0, + handle->h_buffer_credits, + right_path); + if (ret) { + mlog_errno(ret); + goto out; + } ret = ocfs2_remove_rightmost_path(handle, et, right_path, @@ -3679,6 +3660,14 @@ static int ocfs2_try_to_merge_extent(handle_t *handle, BUG_ON(ctxt->c_contig_type == CONTIG_NONE); if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) { + /* extend credit for ocfs2_remove_rightmost_path */ + ret = ocfs2_extend_rotate_transaction(handle, 0, + handle->h_buffer_credits, + path); + if (ret) { + mlog_errno(ret); + goto out; + } /* * The merge code will need to create an empty * extent to take the place of the newly @@ -3727,6 +3716,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle, */ BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); + /* extend credit for ocfs2_remove_rightmost_path */ + ret = ocfs2_extend_rotate_transaction(handle, 0, + handle->h_buffer_credits, + path); + if (ret) { + mlog_errno(ret); + goto out; + } + /* The merge left us with an empty extent, remove it. */ ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); if (ret) { @@ -3748,6 +3746,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle, goto out; } + /* extend credit for ocfs2_remove_rightmost_path */ + ret = ocfs2_extend_rotate_transaction(handle, 0, + handle->h_buffer_credits, + path); + if (ret) { + mlog_errno(ret); + goto out; + } + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); /* * Error from this last rotate is not critical, so @@ -3783,6 +3790,16 @@ static int ocfs2_try_to_merge_extent(handle_t *handle, } if (ctxt->c_split_covers_rec) { + /* extend credit for ocfs2_remove_rightmost_path */ + ret = ocfs2_extend_rotate_transaction(handle, 0, + handle->h_buffer_credits, + path); + if (ret) { + mlog_errno(ret); + ret = 0; + goto out; + } + /* * The merge may have left an empty extent in * our leaf. Try to rotate it away. @@ -5342,6 +5359,15 @@ static int ocfs2_truncate_rec(handle_t *handle, struct ocfs2_extent_block *eb; if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) { + /* extend credit for ocfs2_remove_rightmost_path */ + ret = ocfs2_extend_rotate_transaction(handle, 0, + handle->h_buffer_credits, + path); + if (ret) { + mlog_errno(ret); + goto out; + } + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); if (ret) { mlog_errno(ret); @@ -5928,16 +5954,6 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, ocfs2_journal_dirty(handle, tl_bh); - /* TODO: Perhaps we can calculate the bulk of the - * credits up front rather than extending like - * this. */ - status = ocfs2_extend_trans(handle, - OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); - if (status < 0) { - mlog_errno(status); - goto bail; - } - rec = tl->tl_recs[i]; start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb, le32_to_cpu(rec.t_start)); @@ -5958,6 +5974,13 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, goto bail; } } + + status = ocfs2_extend_trans(handle, + OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); + if (status < 0) { + mlog_errno(status); + goto bail; + } i--; } @@ -6016,7 +6039,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) goto out_mutex; } - handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); + handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); @@ -6079,7 +6102,7 @@ void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, if (cancel) cancel_delayed_work(&osb->osb_truncate_log_wq); - queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq, + queue_delayed_work(osb->ocfs2_wq, &osb->osb_truncate_log_wq, OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL); } } @@ -6253,7 +6276,7 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb) if (tl_inode) { cancel_delayed_work(&osb->osb_truncate_log_wq); - flush_workqueue(ocfs2_wq); + flush_workqueue(osb->ocfs2_wq); status = ocfs2_flush_truncate_log(osb); if (status < 0) @@ -6648,7 +6671,7 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start, { int i; struct page *page; - unsigned int from, to = PAGE_CACHE_SIZE; + unsigned int from, to = PAGE_SIZE; struct super_block *sb = inode->i_sb; BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); @@ -6656,21 +6679,21 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start, if (numpages == 0) goto out; - to = PAGE_CACHE_SIZE; + to = PAGE_SIZE; for(i = 0; i < numpages; i++) { page = pages[i]; - from = start & (PAGE_CACHE_SIZE - 1); - if ((end >> PAGE_CACHE_SHIFT) == page->index) - to = end & (PAGE_CACHE_SIZE - 1); + from = start & (PAGE_SIZE - 1); + if ((end >> PAGE_SHIFT) == page->index) + to = end & (PAGE_SIZE - 1); - BUG_ON(from > PAGE_CACHE_SIZE); - BUG_ON(to > PAGE_CACHE_SIZE); + BUG_ON(from > PAGE_SIZE); + BUG_ON(to > PAGE_SIZE); ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1, &phys); - start = (page->index + 1) << PAGE_CACHE_SHIFT; + start = (page->index + 1) << PAGE_SHIFT; } out: if (pages) @@ -6689,7 +6712,7 @@ int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end, numpages = 0; last_page_bytes = PAGE_ALIGN(end); - index = start >> PAGE_CACHE_SHIFT; + index = start >> PAGE_SHIFT; do { pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS); if (!pages[numpages]) { @@ -6700,7 +6723,7 @@ int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end, numpages++; index++; - } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT)); + } while (index < (last_page_bytes >> PAGE_SHIFT)); out: if (ret != 0) { @@ -6927,8 +6950,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, * to do that now. */ if (!ocfs2_sparse_alloc(osb) && - PAGE_CACHE_SIZE < osb->s_clustersize) - end = PAGE_CACHE_SIZE; + PAGE_SIZE < osb->s_clustersize) + end = PAGE_SIZE; ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages); if (ret) { @@ -6948,8 +6971,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, goto out_unlock; } - page_end = PAGE_CACHE_SIZE; - if (PAGE_CACHE_SIZE > osb->s_clustersize) + page_end = PAGE_SIZE; + if (PAGE_SIZE > osb->s_clustersize) page_end = osb->s_clustersize; for (i = 0; i < num_pages; i++) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index cda0361e9..ad1577348 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -234,7 +234,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page, size = i_size_read(inode); - if (size > PAGE_CACHE_SIZE || + if (size > PAGE_SIZE || size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) { ocfs2_error(inode->i_sb, "Inode %llu has with inline data has bad size: %Lu\n", @@ -247,7 +247,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page, if (size) memcpy(kaddr, di->id2.i_data.id_data, size); /* Clear the remaining part of the page */ - memset(kaddr + size, 0, PAGE_CACHE_SIZE - size); + memset(kaddr + size, 0, PAGE_SIZE - size); flush_dcache_page(page); kunmap_atomic(kaddr); @@ -282,7 +282,7 @@ static int ocfs2_readpage(struct file *file, struct page *page) { struct inode *inode = page->mapping->host; struct ocfs2_inode_info *oi = OCFS2_I(inode); - loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT; + loff_t start = (loff_t)page->index << PAGE_SHIFT; int ret, unlock = 1; trace_ocfs2_readpage((unsigned long long)oi->ip_blkno, @@ -385,7 +385,7 @@ static int ocfs2_readpages(struct file *filp, struct address_space *mapping, * drop out in that case as it's not worth handling here. */ last = list_entry(pages->prev, struct page, lru); - start = (loff_t)last->index << PAGE_CACHE_SHIFT; + start = (loff_t)last->index << PAGE_SHIFT; if (start >= i_size_read(inode)) goto out_unlock; @@ -499,153 +499,6 @@ bail: return status; } -/* - * TODO: Make this into a generic get_blocks function. - * - * From do_direct_io in direct-io.c: - * "So what we do is to permit the ->get_blocks function to populate - * bh.b_size with the size of IO which is permitted at this offset and - * this i_blkbits." - * - * This function is called directly from get_more_blocks in direct-io.c. - * - * called like this: dio->get_blocks(dio->inode, fs_startblk, - * fs_count, map_bh, dio->rw == WRITE); - */ -static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - int ret; - u32 cpos = 0; - int alloc_locked = 0; - u64 p_blkno, inode_blocks, contig_blocks; - unsigned int ext_flags; - unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; - unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; - unsigned long len = bh_result->b_size; - unsigned int clusters_to_alloc = 0, contig_clusters = 0; - - cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock); - - /* This function won't even be called if the request isn't all - * nicely aligned and of the right size, so there's no need - * for us to check any of that. */ - - inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); - - down_read(&OCFS2_I(inode)->ip_alloc_sem); - - /* This figures out the size of the next contiguous block, and - * our logical offset */ - ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, - &contig_blocks, &ext_flags); - up_read(&OCFS2_I(inode)->ip_alloc_sem); - - if (ret) { - mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", - (unsigned long long)iblock); - ret = -EIO; - goto bail; - } - - /* We should already CoW the refcounted extent in case of create. */ - BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED)); - - /* allocate blocks if no p_blkno is found, and create == 1 */ - if (!p_blkno && create) { - ret = ocfs2_inode_lock(inode, NULL, 1); - if (ret < 0) { - mlog_errno(ret); - goto bail; - } - - alloc_locked = 1; - - down_write(&OCFS2_I(inode)->ip_alloc_sem); - - /* fill hole, allocate blocks can't be larger than the size - * of the hole */ - clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len); - contig_clusters = ocfs2_clusters_for_blocks(inode->i_sb, - contig_blocks); - if (clusters_to_alloc > contig_clusters) - clusters_to_alloc = contig_clusters; - - /* allocate extent and insert them into the extent tree */ - ret = ocfs2_extend_allocation(inode, cpos, - clusters_to_alloc, 0); - if (ret < 0) { - up_write(&OCFS2_I(inode)->ip_alloc_sem); - mlog_errno(ret); - goto bail; - } - - ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, - &contig_blocks, &ext_flags); - if (ret < 0) { - up_write(&OCFS2_I(inode)->ip_alloc_sem); - mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", - (unsigned long long)iblock); - ret = -EIO; - goto bail; - } - set_buffer_new(bh_result); - up_write(&OCFS2_I(inode)->ip_alloc_sem); - } - - /* - * get_more_blocks() expects us to describe a hole by clearing - * the mapped bit on bh_result(). - * - * Consider an unwritten extent as a hole. - */ - if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) - map_bh(bh_result, inode->i_sb, p_blkno); - else - clear_buffer_mapped(bh_result); - - /* make sure we don't map more than max_blocks blocks here as - that's all the kernel will handle at this point. */ - if (max_blocks < contig_blocks) - contig_blocks = max_blocks; - bh_result->b_size = contig_blocks << blocksize_bits; -bail: - if (alloc_locked) - ocfs2_inode_unlock(inode, 1); - return ret; -} - -/* - * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're - * particularly interested in the aio/dio case. We use the rw_lock DLM lock - * to protect io on one node from truncation on another. - */ -static void ocfs2_dio_end_io(struct kiocb *iocb, - loff_t offset, - ssize_t bytes, - void *private) -{ - struct inode *inode = file_inode(iocb->ki_filp); - int level; - - /* this io's submitter should not have unlocked this before we could */ - BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); - - if (ocfs2_iocb_is_unaligned_aio(iocb)) { - ocfs2_iocb_clear_unaligned_aio(iocb); - - mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); - } - - /* Let rw unlock to be done later to protect append direct io write */ - if (offset + bytes <= i_size_read(inode)) { - ocfs2_iocb_clear_rw_locked(iocb); - - level = ocfs2_iocb_rw_locked_level(iocb); - ocfs2_rw_unlock(inode, level); - } -} - static int ocfs2_releasepage(struct page *page, gfp_t wait) { if (!page_has_buffers(page)) @@ -653,374 +506,17 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait) return try_to_free_buffers(page); } -static int ocfs2_is_overwrite(struct ocfs2_super *osb, - struct inode *inode, loff_t offset) -{ - int ret = 0; - u32 v_cpos = 0; - u32 p_cpos = 0; - unsigned int num_clusters = 0; - unsigned int ext_flags = 0; - - v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset); - ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, - &num_clusters, &ext_flags); - if (ret < 0) { - mlog_errno(ret); - return ret; - } - - if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) - return 1; - - return 0; -} - -static int ocfs2_direct_IO_zero_extend(struct ocfs2_super *osb, - struct inode *inode, loff_t offset, - u64 zero_len, int cluster_align) -{ - u32 p_cpos = 0; - u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode)); - unsigned int num_clusters = 0; - unsigned int ext_flags = 0; - int ret = 0; - - if (offset <= i_size_read(inode) || cluster_align) - return 0; - - ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters, - &ext_flags); - if (ret < 0) { - mlog_errno(ret); - return ret; - } - - if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) { - u64 s = i_size_read(inode); - sector_t sector = ((u64)p_cpos << (osb->s_clustersize_bits - 9)) + - (do_div(s, osb->s_clustersize) >> 9); - - ret = blkdev_issue_zeroout(osb->sb->s_bdev, sector, - zero_len >> 9, GFP_NOFS, false); - if (ret < 0) - mlog_errno(ret); - } - - return ret; -} - -static int ocfs2_direct_IO_extend_no_holes(struct ocfs2_super *osb, - struct inode *inode, loff_t offset) -{ - u64 zero_start, zero_len, total_zero_len; - u32 p_cpos = 0, clusters_to_add; - u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode)); - unsigned int num_clusters = 0; - unsigned int ext_flags = 0; - u32 size_div, offset_div; - int ret = 0; - - { - u64 o = offset; - u64 s = i_size_read(inode); - - offset_div = do_div(o, osb->s_clustersize); - size_div = do_div(s, osb->s_clustersize); - } - - if (offset <= i_size_read(inode)) - return 0; - - clusters_to_add = ocfs2_bytes_to_clusters(inode->i_sb, offset) - - ocfs2_bytes_to_clusters(inode->i_sb, i_size_read(inode)); - total_zero_len = offset - i_size_read(inode); - if (clusters_to_add) - total_zero_len -= offset_div; - - /* Allocate clusters to fill out holes, and this is only needed - * when we add more than one clusters. Otherwise the cluster will - * be allocated during direct IO */ - if (clusters_to_add > 1) { - ret = ocfs2_extend_allocation(inode, - OCFS2_I(inode)->ip_clusters, - clusters_to_add - 1, 0); - if (ret) { - mlog_errno(ret); - goto out; - } - } - - while (total_zero_len) { - ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters, - &ext_flags); - if (ret < 0) { - mlog_errno(ret); - goto out; - } - - zero_start = ocfs2_clusters_to_bytes(osb->sb, p_cpos) + - size_div; - zero_len = ocfs2_clusters_to_bytes(osb->sb, num_clusters) - - size_div; - zero_len = min(total_zero_len, zero_len); - - if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) { - ret = blkdev_issue_zeroout(osb->sb->s_bdev, - zero_start >> 9, zero_len >> 9, - GFP_NOFS, false); - if (ret < 0) { - mlog_errno(ret); - goto out; - } - } - - total_zero_len -= zero_len; - v_cpos += ocfs2_bytes_to_clusters(osb->sb, zero_len + size_div); - - /* Only at first iteration can be cluster not aligned. - * So set size_div to 0 for the rest */ - size_div = 0; - } - -out: - return ret; -} - -static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, - struct iov_iter *iter, - loff_t offset) -{ - ssize_t ret = 0; - ssize_t written = 0; - bool orphaned = false; - int is_overwrite = 0; - struct file *file = iocb->ki_filp; - struct inode *inode = file_inode(file)->i_mapping->host; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct buffer_head *di_bh = NULL; - size_t count = iter->count; - journal_t *journal = osb->journal->j_journal; - u64 zero_len_head, zero_len_tail; - int cluster_align_head, cluster_align_tail; - loff_t final_size = offset + count; - int append_write = offset >= i_size_read(inode) ? 1 : 0; - unsigned int num_clusters = 0; - unsigned int ext_flags = 0; - - { - u64 o = offset; - u64 s = i_size_read(inode); - - zero_len_head = do_div(o, 1 << osb->s_clustersize_bits); - cluster_align_head = !zero_len_head; - - zero_len_tail = osb->s_clustersize - - do_div(s, osb->s_clustersize); - if ((offset - i_size_read(inode)) < zero_len_tail) - zero_len_tail = offset - i_size_read(inode); - cluster_align_tail = !zero_len_tail; - } - - /* - * when final_size > inode->i_size, inode->i_size will be - * updated after direct write, so add the inode to orphan - * dir first. - */ - if (final_size > i_size_read(inode)) { - ret = ocfs2_add_inode_to_orphan(osb, inode); - if (ret < 0) { - mlog_errno(ret); - goto out; - } - orphaned = true; - } - - if (append_write) { - ret = ocfs2_inode_lock(inode, NULL, 1); - if (ret < 0) { - mlog_errno(ret); - goto clean_orphan; - } - - /* zeroing out the previously allocated cluster tail - * that but not zeroed */ - if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { - down_read(&OCFS2_I(inode)->ip_alloc_sem); - ret = ocfs2_direct_IO_zero_extend(osb, inode, offset, - zero_len_tail, cluster_align_tail); - up_read(&OCFS2_I(inode)->ip_alloc_sem); - } else { - down_write(&OCFS2_I(inode)->ip_alloc_sem); - ret = ocfs2_direct_IO_extend_no_holes(osb, inode, - offset); - up_write(&OCFS2_I(inode)->ip_alloc_sem); - } - if (ret < 0) { - mlog_errno(ret); - ocfs2_inode_unlock(inode, 1); - goto clean_orphan; - } - - is_overwrite = ocfs2_is_overwrite(osb, inode, offset); - if (is_overwrite < 0) { - mlog_errno(is_overwrite); - ret = is_overwrite; - ocfs2_inode_unlock(inode, 1); - goto clean_orphan; - } - - ocfs2_inode_unlock(inode, 1); - } - - written = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, - offset, ocfs2_direct_IO_get_blocks, - ocfs2_dio_end_io, NULL, 0); - /* overwrite aio may return -EIOCBQUEUED, and it is not an error */ - if ((written < 0) && (written != -EIOCBQUEUED)) { - loff_t i_size = i_size_read(inode); - - if (offset + count > i_size) { - ret = ocfs2_inode_lock(inode, &di_bh, 1); - if (ret < 0) { - mlog_errno(ret); - goto clean_orphan; - } - - if (i_size == i_size_read(inode)) { - ret = ocfs2_truncate_file(inode, di_bh, - i_size); - if (ret < 0) { - if (ret != -ENOSPC) - mlog_errno(ret); - - ocfs2_inode_unlock(inode, 1); - brelse(di_bh); - di_bh = NULL; - goto clean_orphan; - } - } - - ocfs2_inode_unlock(inode, 1); - brelse(di_bh); - di_bh = NULL; - - ret = jbd2_journal_force_commit(journal); - if (ret < 0) - mlog_errno(ret); - } - } else if (written > 0 && append_write && !is_overwrite && - !cluster_align_head) { - /* zeroing out the allocated cluster head */ - u32 p_cpos = 0; - u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset); - - ret = ocfs2_inode_lock(inode, NULL, 0); - if (ret < 0) { - mlog_errno(ret); - goto clean_orphan; - } - - ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, - &num_clusters, &ext_flags); - if (ret < 0) { - mlog_errno(ret); - ocfs2_inode_unlock(inode, 0); - goto clean_orphan; - } - - BUG_ON(!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN)); - - ret = blkdev_issue_zeroout(osb->sb->s_bdev, - (u64)p_cpos << (osb->s_clustersize_bits - 9), - zero_len_head >> 9, GFP_NOFS, false); - if (ret < 0) - mlog_errno(ret); - - ocfs2_inode_unlock(inode, 0); - } - -clean_orphan: - if (orphaned) { - int tmp_ret; - int update_isize = written > 0 ? 1 : 0; - loff_t end = update_isize ? offset + written : 0; - - tmp_ret = ocfs2_inode_lock(inode, &di_bh, 1); - if (tmp_ret < 0) { - ret = tmp_ret; - mlog_errno(ret); - goto out; - } - - tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, - update_isize, end); - if (tmp_ret < 0) { - ocfs2_inode_unlock(inode, 1); - ret = tmp_ret; - mlog_errno(ret); - brelse(di_bh); - goto out; - } - - ocfs2_inode_unlock(inode, 1); - brelse(di_bh); - - tmp_ret = jbd2_journal_force_commit(journal); - if (tmp_ret < 0) { - ret = tmp_ret; - mlog_errno(tmp_ret); - } - } - -out: - if (ret >= 0) - ret = written; - return ret; -} - -static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file_inode(file)->i_mapping->host; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - int full_coherency = !(osb->s_mount_opt & - OCFS2_MOUNT_COHERENCY_BUFFERED); - - /* - * Fallback to buffered I/O if we see an inode without - * extents. - */ - if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) - return 0; - - /* Fallback to buffered I/O if we are appending and - * concurrent O_DIRECT writes are allowed. - */ - if (i_size_read(inode) <= offset && !full_coherency) - return 0; - - if (iov_iter_rw(iter) == READ) - return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, - iter, offset, - ocfs2_direct_IO_get_blocks, - ocfs2_dio_end_io, NULL, 0); - else - return ocfs2_direct_IO_write(iocb, iter, offset); -} - static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, u32 cpos, unsigned int *start, unsigned int *end) { - unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE; + unsigned int cluster_start = 0, cluster_end = PAGE_SIZE; - if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) { + if (unlikely(PAGE_SHIFT > osb->s_clustersize_bits)) { unsigned int cpp; - cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits); + cpp = 1 << (PAGE_SHIFT - osb->s_clustersize_bits); cluster_start = cpos % cpp; cluster_start = cluster_start << osb->s_clustersize_bits; @@ -1188,13 +684,20 @@ next_bh: return ret; } -#if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE) +#if (PAGE_SIZE >= OCFS2_MAX_CLUSTERSIZE) #define OCFS2_MAX_CTXT_PAGES 1 #else -#define OCFS2_MAX_CTXT_PAGES (OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE) +#define OCFS2_MAX_CTXT_PAGES (OCFS2_MAX_CLUSTERSIZE / PAGE_SIZE) #endif -#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE) +#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_SIZE / OCFS2_MIN_CLUSTERSIZE) + +struct ocfs2_unwritten_extent { + struct list_head ue_node; + struct list_head ue_ip_node; + u32 ue_cpos; + u32 ue_phys; +}; /* * Describe the state of a single cluster to be written to. @@ -1207,7 +710,7 @@ struct ocfs2_write_cluster_desc { * filled. */ unsigned c_new; - unsigned c_unwritten; + unsigned c_clear_unwritten; unsigned c_needs_zero; }; @@ -1219,6 +722,9 @@ struct ocfs2_write_ctxt { /* First cluster allocated in a nonsparse extend */ u32 w_first_new_cpos; + /* Type of caller. Must be one of buffer, mmap, direct. */ + ocfs2_write_type_t w_type; + struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE]; /* @@ -1267,6 +773,8 @@ struct ocfs2_write_ctxt { struct buffer_head *w_di_bh; struct ocfs2_cached_dealloc_ctxt w_dealloc; + + struct list_head w_unwritten_list; }; void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) @@ -1277,7 +785,7 @@ void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) if (pages[i]) { unlock_page(pages[i]); mark_page_accessed(pages[i]); - page_cache_release(pages[i]); + put_page(pages[i]); } } } @@ -1300,13 +808,30 @@ static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc) } } mark_page_accessed(wc->w_target_page); - page_cache_release(wc->w_target_page); + put_page(wc->w_target_page); } ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); } -static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) +static void ocfs2_free_unwritten_list(struct inode *inode, + struct list_head *head) { + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_unwritten_extent *ue = NULL, *tmp = NULL; + + list_for_each_entry_safe(ue, tmp, head, ue_node) { + list_del(&ue->ue_node); + spin_lock(&oi->ip_lock); + list_del(&ue->ue_ip_node); + spin_unlock(&oi->ip_lock); + kfree(ue); + } +} + +static void ocfs2_free_write_ctxt(struct inode *inode, + struct ocfs2_write_ctxt *wc) +{ + ocfs2_free_unwritten_list(inode, &wc->w_unwritten_list); ocfs2_unlock_pages(wc); brelse(wc->w_di_bh); kfree(wc); @@ -1314,7 +839,8 @@ static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, struct ocfs2_super *osb, loff_t pos, - unsigned len, struct buffer_head *di_bh) + unsigned len, ocfs2_write_type_t type, + struct buffer_head *di_bh) { u32 cend; struct ocfs2_write_ctxt *wc; @@ -1329,13 +855,15 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, wc->w_clen = cend - wc->w_cpos + 1; get_bh(di_bh); wc->w_di_bh = di_bh; + wc->w_type = type; - if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) + if (unlikely(PAGE_SHIFT > osb->s_clustersize_bits)) wc->w_large_pages = 1; else wc->w_large_pages = 0; ocfs2_init_dealloc_ctxt(&wc->w_dealloc); + INIT_LIST_HEAD(&wc->w_unwritten_list); *wcp = wc; @@ -1392,16 +920,17 @@ static void ocfs2_write_failure(struct inode *inode, loff_t user_pos, unsigned user_len) { int i; - unsigned from = user_pos & (PAGE_CACHE_SIZE - 1), + unsigned from = user_pos & (PAGE_SIZE - 1), to = user_pos + user_len; struct page *tmppage; - ocfs2_zero_new_buffers(wc->w_target_page, from, to); + if (wc->w_target_page) + ocfs2_zero_new_buffers(wc->w_target_page, from, to); for(i = 0; i < wc->w_num_pages; i++) { tmppage = wc->w_pages[i]; - if (page_has_buffers(tmppage)) { + if (tmppage && page_has_buffers(tmppage)) { if (ocfs2_should_order_data(inode)) ocfs2_jbd2_file_inode(wc->w_handle, inode); @@ -1431,7 +960,7 @@ static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno, (page_offset(page) <= user_pos)); if (page == wc->w_target_page) { - map_from = user_pos & (PAGE_CACHE_SIZE - 1); + map_from = user_pos & (PAGE_SIZE - 1); map_to = map_from + user_len; if (new) @@ -1505,7 +1034,7 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, struct inode *inode = mapping->host; loff_t last_byte; - target_index = user_pos >> PAGE_CACHE_SHIFT; + target_index = user_pos >> PAGE_SHIFT; /* * Figure out how many pages we'll be manipulating here. For @@ -1524,18 +1053,20 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, */ last_byte = max(user_pos + user_len, i_size_read(inode)); BUG_ON(last_byte < 1); - end_index = ((last_byte - 1) >> PAGE_CACHE_SHIFT) + 1; + end_index = ((last_byte - 1) >> PAGE_SHIFT) + 1; if ((start + wc->w_num_pages) > end_index) wc->w_num_pages = end_index - start; } else { wc->w_num_pages = 1; start = target_index; } + end_index = (user_pos + user_len - 1) >> PAGE_SHIFT; for(i = 0; i < wc->w_num_pages; i++) { index = start + i; - if (index == target_index && mmap_page) { + if (index >= target_index && index <= end_index && + wc->w_type == OCFS2_WRITE_MMAP) { /* * ocfs2_pagemkwrite() is a little different * and wants us to directly use the page @@ -1551,9 +1082,14 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, goto out; } - page_cache_get(mmap_page); + get_page(mmap_page); wc->w_pages[i] = mmap_page; wc->w_target_locked = true; + } else if (index >= target_index && index <= end_index && + wc->w_type == OCFS2_WRITE_DIRECT) { + /* Direct write has no mapping page. */ + wc->w_pages[i] = NULL; + continue; } else { wc->w_pages[i] = find_or_create_page(mapping, index, GFP_NOFS); @@ -1578,19 +1114,20 @@ out: * Prepare a single cluster for write one cluster into the file. */ static int ocfs2_write_cluster(struct address_space *mapping, - u32 phys, unsigned int unwritten, + u32 *phys, unsigned int new, + unsigned int clear_unwritten, unsigned int should_zero, struct ocfs2_alloc_context *data_ac, struct ocfs2_alloc_context *meta_ac, struct ocfs2_write_ctxt *wc, u32 cpos, loff_t user_pos, unsigned user_len) { - int ret, i, new; - u64 v_blkno, p_blkno; + int ret, i; + u64 p_blkno; struct inode *inode = mapping->host; struct ocfs2_extent_tree et; + int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); - new = phys == 0 ? 1 : 0; if (new) { u32 tmp_pos; @@ -1600,9 +1137,9 @@ static int ocfs2_write_cluster(struct address_space *mapping, */ tmp_pos = cpos; ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode, - &tmp_pos, 1, 0, wc->w_di_bh, - wc->w_handle, data_ac, - meta_ac, NULL); + &tmp_pos, 1, !clear_unwritten, + wc->w_di_bh, wc->w_handle, + data_ac, meta_ac, NULL); /* * This shouldn't happen because we must have already * calculated the correct meta data allocation required. The @@ -1619,11 +1156,11 @@ static int ocfs2_write_cluster(struct address_space *mapping, mlog_errno(ret); goto out; } - } else if (unwritten) { + } else if (clear_unwritten) { ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), wc->w_di_bh); ret = ocfs2_mark_extent_written(inode, &et, - wc->w_handle, cpos, 1, phys, + wc->w_handle, cpos, 1, *phys, meta_ac, &wc->w_dealloc); if (ret < 0) { mlog_errno(ret); @@ -1631,30 +1168,33 @@ static int ocfs2_write_cluster(struct address_space *mapping, } } - if (should_zero) - v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos); - else - v_blkno = user_pos >> inode->i_sb->s_blocksize_bits; - /* * The only reason this should fail is due to an inability to * find the extent added. */ - ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, - NULL); + ret = ocfs2_get_clusters(inode, cpos, phys, NULL, NULL); if (ret < 0) { mlog(ML_ERROR, "Get physical blkno failed for inode %llu, " - "at logical block %llu", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - (unsigned long long)v_blkno); + "at logical cluster %u", + (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos); goto out; } - BUG_ON(p_blkno == 0); + BUG_ON(*phys == 0); + + p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *phys); + if (!should_zero) + p_blkno += (user_pos >> inode->i_sb->s_blocksize_bits) & (u64)(bpc - 1); for(i = 0; i < wc->w_num_pages; i++) { int tmpret; + /* This is the direct io target page. */ + if (wc->w_pages[i] == NULL) { + p_blkno++; + continue; + } + tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc, wc->w_pages[i], cpos, user_pos, user_len, @@ -1701,8 +1241,9 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping, if ((cluster_off + local_len) > osb->s_clustersize) local_len = osb->s_clustersize - cluster_off; - ret = ocfs2_write_cluster(mapping, desc->c_phys, - desc->c_unwritten, + ret = ocfs2_write_cluster(mapping, &desc->c_phys, + desc->c_new, + desc->c_clear_unwritten, desc->c_needs_zero, data_ac, meta_ac, wc, desc->c_cpos, pos, local_len); @@ -1731,7 +1272,7 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, { struct ocfs2_write_cluster_desc *desc; - wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1); + wc->w_target_from = pos & (PAGE_SIZE - 1); wc->w_target_to = wc->w_target_from + len; if (alloc == 0) @@ -1768,8 +1309,68 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, &wc->w_target_to); } else { wc->w_target_from = 0; - wc->w_target_to = PAGE_CACHE_SIZE; + wc->w_target_to = PAGE_SIZE; + } +} + +/* + * Check if this extent is marked UNWRITTEN by direct io. If so, we need not to + * do the zero work. And should not to clear UNWRITTEN since it will be cleared + * by the direct io procedure. + * If this is a new extent that allocated by direct io, we should mark it in + * the ip_unwritten_list. + */ +static int ocfs2_unwritten_check(struct inode *inode, + struct ocfs2_write_ctxt *wc, + struct ocfs2_write_cluster_desc *desc) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_unwritten_extent *ue = NULL, *new = NULL; + int ret = 0; + + if (!desc->c_needs_zero) + return 0; + +retry: + spin_lock(&oi->ip_lock); + /* Needs not to zero no metter buffer or direct. The one who is zero + * the cluster is doing zero. And he will clear unwritten after all + * cluster io finished. */ + list_for_each_entry(ue, &oi->ip_unwritten_list, ue_ip_node) { + if (desc->c_cpos == ue->ue_cpos) { + BUG_ON(desc->c_new); + desc->c_needs_zero = 0; + desc->c_clear_unwritten = 0; + goto unlock; + } } + + if (wc->w_type != OCFS2_WRITE_DIRECT) + goto unlock; + + if (new == NULL) { + spin_unlock(&oi->ip_lock); + new = kmalloc(sizeof(struct ocfs2_unwritten_extent), + GFP_NOFS); + if (new == NULL) { + ret = -ENOMEM; + goto out; + } + goto retry; + } + /* This direct write will doing zero. */ + new->ue_cpos = desc->c_cpos; + new->ue_phys = desc->c_phys; + desc->c_clear_unwritten = 0; + list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list); + list_add_tail(&new->ue_node, &wc->w_unwritten_list); + new = NULL; +unlock: + spin_unlock(&oi->ip_lock); +out: + if (new) + kfree(new); + return ret; } /* @@ -1847,14 +1448,21 @@ static int ocfs2_populate_write_desc(struct inode *inode, if (phys == 0) { desc->c_new = 1; desc->c_needs_zero = 1; + desc->c_clear_unwritten = 1; *clusters_to_alloc = *clusters_to_alloc + 1; } if (ext_flags & OCFS2_EXT_UNWRITTEN) { - desc->c_unwritten = 1; + desc->c_clear_unwritten = 1; desc->c_needs_zero = 1; } + ret = ocfs2_unwritten_check(inode, wc, desc); + if (ret) { + mlog_errno(ret); + goto out; + } + num_clusters--; } @@ -2017,8 +1625,10 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, if (ret) mlog_errno(ret); - wc->w_first_new_cpos = - ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)); + /* There is no wc if this is call from direct. */ + if (wc) + wc->w_first_new_cpos = + ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)); return ret; } @@ -2072,9 +1682,8 @@ out: return ret; } -int ocfs2_write_begin_nolock(struct file *filp, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, +int ocfs2_write_begin_nolock(struct address_space *mapping, + loff_t pos, unsigned len, ocfs2_write_type_t type, struct page **pagep, void **fsdata, struct buffer_head *di_bh, struct page *mmap_page) { @@ -2091,7 +1700,7 @@ int ocfs2_write_begin_nolock(struct file *filp, int try_free = 1, ret1; try_again: - ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); + ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, type, di_bh); if (ret) { mlog_errno(ret); return ret; @@ -2110,14 +1719,17 @@ try_again: } } - if (ocfs2_sparse_alloc(osb)) - ret = ocfs2_zero_tail(inode, di_bh, pos); - else - ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len, - wc); - if (ret) { - mlog_errno(ret); - goto out; + /* Direct io change i_size late, should not zero tail here. */ + if (type != OCFS2_WRITE_DIRECT) { + if (ocfs2_sparse_alloc(osb)) + ret = ocfs2_zero_tail(inode, di_bh, pos); + else + ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, + len, wc); + if (ret) { + mlog_errno(ret); + goto out; + } } ret = ocfs2_check_range_for_refcount(inode, pos, len); @@ -2148,7 +1760,7 @@ try_again: (unsigned long long)OCFS2_I(inode)->ip_blkno, (long long)i_size_read(inode), le32_to_cpu(di->i_clusters), - pos, len, flags, mmap_page, + pos, len, type, mmap_page, clusters_to_alloc, extents_to_split); /* @@ -2178,17 +1790,17 @@ try_again: credits = ocfs2_calc_extend_credits(inode->i_sb, &di->id2.i_list); - - } + } else if (type == OCFS2_WRITE_DIRECT) + /* direct write needs not to start trans if no extents alloc. */ + goto success; /* * We have to zero sparse allocated clusters, unwritten extent clusters, * and non-sparse clusters we just extended. For non-sparse writes, * we know zeros will only be needed in the first and/or last cluster. */ - if (clusters_to_alloc || extents_to_split || - (wc->w_clen && (wc->w_desc[0].c_needs_zero || - wc->w_desc[wc->w_clen - 1].c_needs_zero))) + if (wc->w_clen && (wc->w_desc[0].c_needs_zero || + wc->w_desc[wc->w_clen - 1].c_needs_zero)) cluster_of_pages = 1; else cluster_of_pages = 0; @@ -2255,7 +1867,8 @@ try_again: ocfs2_free_alloc_context(meta_ac); success: - *pagep = wc->w_target_page; + if (pagep) + *pagep = wc->w_target_page; *fsdata = wc; return 0; out_quota: @@ -2266,7 +1879,7 @@ out_commit: ocfs2_commit_trans(osb, handle); out: - ocfs2_free_write_ctxt(wc); + ocfs2_free_write_ctxt(inode, wc); if (data_ac) { ocfs2_free_alloc_context(data_ac); @@ -2318,8 +1931,8 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping, */ down_write(&OCFS2_I(inode)->ip_alloc_sem); - ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep, - fsdata, di_bh, NULL); + ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_BUFFER, + pagep, fsdata, di_bh, NULL); if (ret) { mlog_errno(ret); goto out_fail; @@ -2368,7 +1981,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping, struct page *page, void *fsdata) { int i, ret; - unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1); + unsigned from, to, start = pos & (PAGE_SIZE - 1); struct inode *inode = mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_write_ctxt *wc = fsdata; @@ -2376,12 +1989,16 @@ int ocfs2_write_end_nolock(struct address_space *mapping, handle_t *handle = wc->w_handle; struct page *tmppage; - ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret) { - copied = ret; - mlog_errno(ret); - goto out; + BUG_ON(!list_empty(&wc->w_unwritten_list)); + + if (handle) { + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), + wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + copied = ret; + mlog_errno(ret); + goto out; + } } if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { @@ -2389,24 +2006,29 @@ int ocfs2_write_end_nolock(struct address_space *mapping, goto out_write_size; } - if (unlikely(copied < len)) { + if (unlikely(copied < len) && wc->w_target_page) { if (!PageUptodate(wc->w_target_page)) copied = 0; ocfs2_zero_new_buffers(wc->w_target_page, start+copied, start+len); } - flush_dcache_page(wc->w_target_page); + if (wc->w_target_page) + flush_dcache_page(wc->w_target_page); for(i = 0; i < wc->w_num_pages; i++) { tmppage = wc->w_pages[i]; + /* This is the direct io target page. */ + if (tmppage == NULL) + continue; + if (tmppage == wc->w_target_page) { from = wc->w_target_from; to = wc->w_target_to; - BUG_ON(from > PAGE_CACHE_SIZE || - to > PAGE_CACHE_SIZE || + BUG_ON(from > PAGE_SIZE || + to > PAGE_SIZE || to < from); } else { /* @@ -2415,29 +2037,33 @@ int ocfs2_write_end_nolock(struct address_space *mapping, * to flush their entire range. */ from = 0; - to = PAGE_CACHE_SIZE; + to = PAGE_SIZE; } if (page_has_buffers(tmppage)) { - if (ocfs2_should_order_data(inode)) - ocfs2_jbd2_file_inode(wc->w_handle, inode); + if (handle && ocfs2_should_order_data(inode)) + ocfs2_jbd2_file_inode(handle, inode); block_commit_write(tmppage, from, to); } } out_write_size: - pos += copied; - if (pos > i_size_read(inode)) { - i_size_write(inode, pos); - mark_inode_dirty(inode); - } - inode->i_blocks = ocfs2_inode_sector_count(inode); - di->i_size = cpu_to_le64((u64)i_size_read(inode)); - inode->i_mtime = inode->i_ctime = CURRENT_TIME; - di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); - di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); - ocfs2_update_inode_fsync_trans(handle, inode, 1); - ocfs2_journal_dirty(handle, wc->w_di_bh); + /* Direct io do not update i_size here. */ + if (wc->w_type != OCFS2_WRITE_DIRECT) { + pos += copied; + if (pos > i_size_read(inode)) { + i_size_write(inode, pos); + mark_inode_dirty(inode); + } + inode->i_blocks = ocfs2_inode_sector_count(inode); + di->i_size = cpu_to_le64((u64)i_size_read(inode)); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); + di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, inode, 1); + } + if (handle) + ocfs2_journal_dirty(handle, wc->w_di_bh); out: /* unlock pages before dealloc since it needs acquiring j_trans_barrier @@ -2447,7 +2073,8 @@ out: */ ocfs2_unlock_pages(wc); - ocfs2_commit_trans(osb, handle); + if (handle) + ocfs2_commit_trans(osb, handle); ocfs2_run_deallocs(osb, &wc->w_dealloc); @@ -2472,6 +2099,360 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping, return ret; } +struct ocfs2_dio_write_ctxt { + struct list_head dw_zero_list; + unsigned dw_zero_count; + int dw_orphaned; + pid_t dw_writer_pid; +}; + +static struct ocfs2_dio_write_ctxt * +ocfs2_dio_alloc_write_ctx(struct buffer_head *bh, int *alloc) +{ + struct ocfs2_dio_write_ctxt *dwc = NULL; + + if (bh->b_private) + return bh->b_private; + + dwc = kmalloc(sizeof(struct ocfs2_dio_write_ctxt), GFP_NOFS); + if (dwc == NULL) + return NULL; + INIT_LIST_HEAD(&dwc->dw_zero_list); + dwc->dw_zero_count = 0; + dwc->dw_orphaned = 0; + dwc->dw_writer_pid = task_pid_nr(current); + bh->b_private = dwc; + *alloc = 1; + + return dwc; +} + +static void ocfs2_dio_free_write_ctx(struct inode *inode, + struct ocfs2_dio_write_ctxt *dwc) +{ + ocfs2_free_unwritten_list(inode, &dwc->dw_zero_list); + kfree(dwc); +} + +/* + * TODO: Make this into a generic get_blocks function. + * + * From do_direct_io in direct-io.c: + * "So what we do is to permit the ->get_blocks function to populate + * bh.b_size with the size of IO which is permitted at this offset and + * this i_blkbits." + * + * This function is called directly from get_more_blocks in direct-io.c. + * + * called like this: dio->get_blocks(dio->inode, fs_startblk, + * fs_count, map_bh, dio->rw == WRITE); + */ +static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_write_ctxt *wc; + struct ocfs2_write_cluster_desc *desc = NULL; + struct ocfs2_dio_write_ctxt *dwc = NULL; + struct buffer_head *di_bh = NULL; + u64 p_blkno; + loff_t pos = iblock << inode->i_sb->s_blocksize_bits; + unsigned len, total_len = bh_result->b_size; + int ret = 0, first_get_block = 0; + + len = osb->s_clustersize - (pos & (osb->s_clustersize - 1)); + len = min(total_len, len); + + mlog(0, "get block of %lu at %llu:%u req %u\n", + inode->i_ino, pos, len, total_len); + + /* + * Because we need to change file size in ocfs2_dio_end_io_write(), or + * we may need to add it to orphan dir. So can not fall to fast path + * while file size will be changed. + */ + if (pos + total_len <= i_size_read(inode)) { + down_read(&oi->ip_alloc_sem); + /* This is the fast path for re-write. */ + ret = ocfs2_get_block(inode, iblock, bh_result, create); + + up_read(&oi->ip_alloc_sem); + + if (buffer_mapped(bh_result) && + !buffer_new(bh_result) && + ret == 0) + goto out; + + /* Clear state set by ocfs2_get_block. */ + bh_result->b_state = 0; + } + + dwc = ocfs2_dio_alloc_write_ctx(bh_result, &first_get_block); + if (unlikely(dwc == NULL)) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + if (ocfs2_clusters_for_bytes(inode->i_sb, pos + total_len) > + ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)) && + !dwc->dw_orphaned) { + /* + * when we are going to alloc extents beyond file size, add the + * inode to orphan dir, so we can recall those spaces when + * system crashed during write. + */ + ret = ocfs2_add_inode_to_orphan(osb, inode); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + dwc->dw_orphaned = 1; + } + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret) { + mlog_errno(ret); + goto out; + } + + down_write(&oi->ip_alloc_sem); + + if (first_get_block) { + if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) + ret = ocfs2_zero_tail(inode, di_bh, pos); + else + ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, + total_len, NULL); + if (ret < 0) { + mlog_errno(ret); + goto unlock; + } + } + + ret = ocfs2_write_begin_nolock(inode->i_mapping, pos, len, + OCFS2_WRITE_DIRECT, NULL, + (void **)&wc, di_bh, NULL); + if (ret) { + mlog_errno(ret); + goto unlock; + } + + desc = &wc->w_desc[0]; + + p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, desc->c_phys); + BUG_ON(p_blkno == 0); + p_blkno += iblock & (u64)(ocfs2_clusters_to_blocks(inode->i_sb, 1) - 1); + + map_bh(bh_result, inode->i_sb, p_blkno); + bh_result->b_size = len; + if (desc->c_needs_zero) + set_buffer_new(bh_result); + + /* May sleep in end_io. It should not happen in a irq context. So defer + * it to dio work queue. */ + set_buffer_defer_completion(bh_result); + + if (!list_empty(&wc->w_unwritten_list)) { + struct ocfs2_unwritten_extent *ue = NULL; + + ue = list_first_entry(&wc->w_unwritten_list, + struct ocfs2_unwritten_extent, + ue_node); + BUG_ON(ue->ue_cpos != desc->c_cpos); + /* The physical address may be 0, fill it. */ + ue->ue_phys = desc->c_phys; + + list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list); + dwc->dw_zero_count++; + } + + ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, NULL, wc); + BUG_ON(ret != len); + ret = 0; +unlock: + up_write(&oi->ip_alloc_sem); + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); +out: + if (ret < 0) + ret = -EIO; + return ret; +} + +static void ocfs2_dio_end_io_write(struct inode *inode, + struct ocfs2_dio_write_ctxt *dwc, + loff_t offset, + ssize_t bytes) +{ + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_extent_tree et; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_unwritten_extent *ue = NULL; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + handle_t *handle = NULL; + loff_t end = offset + bytes; + int ret = 0, credits = 0, locked = 0; + + ocfs2_init_dealloc_ctxt(&dealloc); + + /* We do clear unwritten, delete orphan, change i_size here. If neither + * of these happen, we can skip all this. */ + if (list_empty(&dwc->dw_zero_list) && + end <= i_size_read(inode) && + !dwc->dw_orphaned) + goto out; + + /* ocfs2_file_write_iter will get i_mutex, so we need not lock if we + * are in that context. */ + if (dwc->dw_writer_pid != task_pid_nr(current)) { + mutex_lock(&inode->i_mutex); + locked = 1; + } + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + down_write(&oi->ip_alloc_sem); + + /* Delete orphan before acquire i_mutex. */ + if (dwc->dw_orphaned) { + BUG_ON(dwc->dw_writer_pid != task_pid_nr(current)); + + end = end > i_size_read(inode) ? end : 0; + + ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, + !!end, end); + if (ret < 0) + mlog_errno(ret); + } + + di = (struct ocfs2_dinode *)di_bh; + + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); + + ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2, + &data_ac, &meta_ac); + if (ret) { + mlog_errno(ret); + goto unlock; + } + + credits = ocfs2_calc_extend_credits(inode->i_sb, &di->id2.i_list); + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto unlock; + } + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto commit; + } + + list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) { + ret = ocfs2_mark_extent_written(inode, &et, handle, + ue->ue_cpos, 1, + ue->ue_phys, + meta_ac, &dealloc); + if (ret < 0) { + mlog_errno(ret); + break; + } + } + + if (end > i_size_read(inode)) { + ret = ocfs2_set_inode_size(handle, inode, di_bh, end); + if (ret < 0) + mlog_errno(ret); + } +commit: + ocfs2_commit_trans(osb, handle); +unlock: + up_write(&oi->ip_alloc_sem); + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); +out: + if (data_ac) + ocfs2_free_alloc_context(data_ac); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + ocfs2_run_deallocs(osb, &dealloc); + if (locked) + mutex_unlock(&inode->i_mutex); + ocfs2_dio_free_write_ctx(inode, dwc); +} + +/* + * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're + * particularly interested in the aio/dio case. We use the rw_lock DLM lock + * to protect io on one node from truncation on another. + */ +static int ocfs2_dio_end_io(struct kiocb *iocb, + loff_t offset, + ssize_t bytes, + void *private) +{ + struct inode *inode = file_inode(iocb->ki_filp); + int level; + + if (bytes <= 0) + return 0; + + /* this io's submitter should not have unlocked this before we could */ + BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); + + if (private) + ocfs2_dio_end_io_write(inode, private, offset, bytes); + + ocfs2_iocb_clear_rw_locked(iocb); + + level = ocfs2_iocb_rw_locked_level(iocb); + ocfs2_rw_unlock(inode, level); + return 0; +} + +static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file)->i_mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + loff_t end = offset + iter->count; + get_block_t *get_block; + + /* + * Fallback to buffered I/O if we see an inode without + * extents. + */ + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + return 0; + + /* Fallback to buffered I/O if we do not support append dio. */ + if (end > i_size_read(inode) && !ocfs2_supports_append_dio(osb)) + return 0; + + if (iov_iter_rw(iter) == READ) + get_block = ocfs2_get_block; + else + get_block = ocfs2_dio_get_block; + + return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, + iter, offset, get_block, + ocfs2_dio_end_io, NULL, 0); +} + const struct address_space_operations ocfs2_aops = { .readpage = ocfs2_readpage, .readpages = ocfs2_readpages, diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 24e496d6b..b1c9f28a5 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -47,9 +47,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); -int ocfs2_write_begin_nolock(struct file *filp, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, +typedef enum { + OCFS2_WRITE_BUFFER = 0, + OCFS2_WRITE_DIRECT, + OCFS2_WRITE_MMAP, +} ocfs2_write_type_t; + +int ocfs2_write_begin_nolock(struct address_space *mapping, + loff_t pos, unsigned len, ocfs2_write_type_t type, struct page **pagep, void **fsdata, struct buffer_head *di_bh, struct page *mmap_page); @@ -79,7 +84,6 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level) enum ocfs2_iocb_lock_bits { OCFS2_IOCB_RW_LOCK = 0, OCFS2_IOCB_RW_LOCK_LEVEL, - OCFS2_IOCB_UNALIGNED_IO, OCFS2_IOCB_NUM_LOCKS }; @@ -88,11 +92,4 @@ enum ocfs2_iocb_lock_bits { #define ocfs2_iocb_rw_locked_level(iocb) \ test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private) -#define ocfs2_iocb_set_unaligned_aio(iocb) \ - set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) -#define ocfs2_iocb_clear_unaligned_aio(iocb) \ - clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) -#define ocfs2_iocb_is_unaligned_aio(iocb) \ - test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) - #endif /* OCFS2_FILE_H */ diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index a2370e2c7..1934abb6b 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -287,7 +287,6 @@ struct o2hb_bio_wait_ctxt { static void o2hb_write_timeout(struct work_struct *work) { int failed, quorum; - unsigned long flags; struct o2hb_region *reg = container_of(work, struct o2hb_region, hr_write_timeout_work.work); @@ -297,14 +296,14 @@ static void o2hb_write_timeout(struct work_struct *work) jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); if (o2hb_global_heartbeat_active()) { - spin_lock_irqsave(&o2hb_live_lock, flags); + spin_lock(&o2hb_live_lock); if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) set_bit(reg->hr_region_num, o2hb_failed_region_bitmap); failed = bitmap_weight(o2hb_failed_region_bitmap, O2NM_MAX_REGIONS); quorum = bitmap_weight(o2hb_quorum_region_bitmap, O2NM_MAX_REGIONS); - spin_unlock_irqrestore(&o2hb_live_lock, flags); + spin_unlock(&o2hb_live_lock); mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n", quorum, failed); @@ -418,13 +417,13 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, bio->bi_private = wc; bio->bi_end_io = o2hb_bio_end_io; - vec_start = (cs << bits) % PAGE_CACHE_SIZE; + vec_start = (cs << bits) % PAGE_SIZE; while(cs < max_slots) { current_page = cs / spp; page = reg->hr_slot_data[current_page]; - vec_len = min(PAGE_CACHE_SIZE - vec_start, - (max_slots-cs) * (PAGE_CACHE_SIZE/spp) ); + vec_len = min(PAGE_SIZE - vec_start, + (max_slots-cs) * (PAGE_SIZE/spp) ); mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n", current_page, vec_len, vec_start); @@ -432,7 +431,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, len = bio_add_page(bio, page, vec_len, vec_start); if (len != vec_len) break; - cs += vec_len / (PAGE_CACHE_SIZE/spp); + cs += vec_len / (PAGE_SIZE/spp); vec_start = 0; } @@ -1577,7 +1576,7 @@ static ssize_t o2hb_region_dev_show(struct config_item *item, char *page) static void o2hb_init_region_params(struct o2hb_region *reg) { - reg->hr_slots_per_page = PAGE_CACHE_SIZE >> reg->hr_block_bits; + reg->hr_slots_per_page = PAGE_SIZE >> reg->hr_block_bits; reg->hr_timeout_ms = O2HB_REGION_TIMEOUT_MS; mlog(ML_HEARTBEAT, "hr_start_block = %llu, hr_blocks = %u\n", @@ -2425,11 +2424,10 @@ EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating); int o2hb_check_node_heartbeating_no_sem(u8 node_num) { unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; - unsigned long flags; - spin_lock_irqsave(&o2hb_live_lock, flags); + spin_lock(&o2hb_live_lock); o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map)); - spin_unlock_irqrestore(&o2hb_live_lock, flags); + spin_unlock(&o2hb_live_lock); if (!test_bit(node_num, testing_map)) { mlog(ML_HEARTBEAT, "node (%u) does not have heartbeating enabled.\n", diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index ebe543894..b17d180bd 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -630,7 +630,6 @@ static void o2nm_cluster_release(struct config_item *item) { struct o2nm_cluster *cluster = to_o2nm_cluster(item); - kfree(cluster->cl_group.default_groups); kfree(cluster); } @@ -666,7 +665,6 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g struct o2nm_cluster *cluster = NULL; struct o2nm_node_group *ns = NULL; struct config_group *o2hb_group = NULL, *ret = NULL; - void *defs = NULL; /* this runs under the parent dir's i_mutex; there can be only * one caller in here at a time */ @@ -675,20 +673,18 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g cluster = kzalloc(sizeof(struct o2nm_cluster), GFP_KERNEL); ns = kzalloc(sizeof(struct o2nm_node_group), GFP_KERNEL); - defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL); o2hb_group = o2hb_alloc_hb_set(); - if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL) + if (cluster == NULL || ns == NULL || o2hb_group == NULL) goto out; config_group_init_type_name(&cluster->cl_group, name, &o2nm_cluster_type); + configfs_add_default_group(&ns->ns_group, &cluster->cl_group); + config_group_init_type_name(&ns->ns_group, "node", &o2nm_node_group_type); + configfs_add_default_group(o2hb_group, &cluster->cl_group); - cluster->cl_group.default_groups = defs; - cluster->cl_group.default_groups[0] = &ns->ns_group; - cluster->cl_group.default_groups[1] = o2hb_group; - cluster->cl_group.default_groups[2] = NULL; rwlock_init(&cluster->cl_nodes_lock); cluster->cl_node_ip_tree = RB_ROOT; cluster->cl_reconnect_delay_ms = O2NET_RECONNECT_DELAY_MS_DEFAULT; @@ -704,7 +700,6 @@ out: kfree(cluster); kfree(ns); o2hb_free_hb_set(o2hb_group); - kfree(defs); ret = ERR_PTR(-ENOMEM); } @@ -714,18 +709,11 @@ out: static void o2nm_cluster_group_drop_item(struct config_group *group, struct config_item *item) { struct o2nm_cluster *cluster = to_o2nm_cluster(item); - int i; - struct config_item *killme; BUG_ON(o2nm_single_cluster != cluster); o2nm_single_cluster = NULL; - for (i = 0; cluster->cl_group.default_groups[i]; i++) { - killme = &cluster->cl_group.default_groups[i]->cg_item; - cluster->cl_group.default_groups[i] = NULL; - config_item_put(killme); - } - + configfs_remove_default_groups(&cluster->cl_group); config_item_put(item); } diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 68c607e63..004f2cbe8 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -282,6 +282,7 @@ static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm, #define DLM_LOCK_RES_DROPPING_REF 0x00000040 #define DLM_LOCK_RES_BLOCK_DIRTY 0x00001000 #define DLM_LOCK_RES_SETREF_INPROG 0x00002000 +#define DLM_LOCK_RES_RECOVERY_WAITING 0x00004000 /* max milliseconds to wait to sync up a network failure with a node death */ #define DLM_NODE_DEATH_WAIT_MAX (5 * 1000) @@ -451,6 +452,7 @@ enum { DLM_QUERY_REGION = 519, DLM_QUERY_NODEINFO = 520, DLM_BEGIN_EXIT_DOMAIN_MSG = 521, + DLM_DEREF_LOCKRES_DONE = 522, }; struct dlm_reco_node_data @@ -545,7 +547,7 @@ struct dlm_master_requery * }; * * from ../cluster/tcp.h - * NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg)) + * O2NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg)) * (roughly 4080 bytes) * and sizeof(dlm_migratable_lockres) = 112 bytes * and sizeof(dlm_migratable_lock) = 16 bytes @@ -586,7 +588,7 @@ struct dlm_migratable_lockres /* from above, 128 bytes * for some undetermined future use */ -#define DLM_MIG_LOCKRES_RESERVED (NET_MAX_PAYLOAD_BYTES - \ +#define DLM_MIG_LOCKRES_RESERVED (O2NET_MAX_PAYLOAD_BYTES - \ DLM_MIG_LOCKRES_MAX_LEN) struct dlm_create_lock @@ -782,6 +784,20 @@ struct dlm_deref_lockres u8 name[O2NM_MAX_NAME_LEN]; }; +enum { + DLM_DEREF_RESPONSE_DONE = 0, + DLM_DEREF_RESPONSE_INPROG = 1, +}; + +struct dlm_deref_lockres_done { + u32 pad1; + u16 pad2; + u8 node_idx; + u8 namelen; + + u8 name[O2NM_MAX_NAME_LEN]; +}; + static inline enum dlm_status __dlm_lockres_state_to_status(struct dlm_lock_resource *res) { @@ -789,7 +805,8 @@ __dlm_lockres_state_to_status(struct dlm_lock_resource *res) assert_spin_locked(&res->spinlock); - if (res->state & DLM_LOCK_RES_RECOVERING) + if (res->state & (DLM_LOCK_RES_RECOVERING| + DLM_LOCK_RES_RECOVERY_WAITING)) status = DLM_RECOVERING; else if (res->state & DLM_LOCK_RES_MIGRATING) status = DLM_MIGRATING; @@ -968,6 +985,8 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data, void dlm_assert_master_post_handler(int status, void *data, void *ret_data); int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, void **ret_data); +int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, void **ret_data); int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, @@ -1009,6 +1028,7 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res) { __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_IN_PROGRESS| DLM_LOCK_RES_RECOVERING| + DLM_LOCK_RES_RECOVERY_WAITING| DLM_LOCK_RES_MIGRATING)); } diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index f90931335..cdeafb4e7 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -212,6 +212,12 @@ grant: if (lock->lksb->flags & DLM_LKSB_PUT_LVB) memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN); + /* + * Move the lock to the tail because it may be the only lock which has + * an invalid lvb. + */ + list_move_tail(&lock->list, &res->granted); + status = DLM_NORMAL; *call_ast = 1; goto unlock_exit; diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 2ee7fe747..12e064b8b 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -132,10 +132,13 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events); * - Message DLM_QUERY_NODEINFO added to allow online node removes * New in version 1.2: * - Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain + * New in version 1.3: + * - Message DLM_DEREF_LOCKRES_DONE added to inform non-master that the + * refmap is cleared */ static const struct dlm_protocol_version dlm_protocol = { .pv_major = 1, - .pv_minor = 2, + .pv_minor = 3, }; #define DLM_DOMAIN_BACKOFF_MS 200 @@ -1396,7 +1399,7 @@ static int dlm_send_join_cancels(struct dlm_ctxt *dlm, unsigned int map_size) { int status, tmpstat; - unsigned int node; + int node; if (map_size != (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long))) { @@ -1853,7 +1856,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm) sizeof(struct dlm_exit_domain), dlm_begin_exit_domain_handler, dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + status = o2net_register_handler(DLM_DEREF_LOCKRES_DONE, dlm->key, + sizeof(struct dlm_deref_lockres_done), + dlm_deref_lockres_done_handler, + dlm, NULL, &dlm->dlm_domain_handlers); bail: if (status) dlm_unregister_domain_handlers(dlm); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 9477d6e1d..13719d3f3 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2278,7 +2278,7 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) dlm_print_one_lock_resource(res); BUG(); } - return ret; + return ret ? ret : r; } int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, @@ -2345,7 +2345,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, res->lockname.len, res->lockname.name, node); dlm_print_one_lock_resource(res); } - ret = 0; + ret = DLM_DEREF_RESPONSE_DONE; goto done; } @@ -2365,7 +2365,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, spin_unlock(&dlm->work_lock); queue_work(dlm->dlm_worker, &dlm->dispatched_work); - return 0; + return DLM_DEREF_RESPONSE_INPROG; done: if (res) @@ -2375,6 +2375,124 @@ done: return ret; } +int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_deref_lockres_done *deref + = (struct dlm_deref_lockres_done *)msg->buf; + struct dlm_lock_resource *res = NULL; + char *name; + unsigned int namelen; + int ret = -EINVAL; + u8 node; + unsigned int hash; + + if (!dlm_grab(dlm)) + return 0; + + name = deref->name; + namelen = deref->namelen; + node = deref->node_idx; + + if (namelen > DLM_LOCKID_NAME_MAX) { + mlog(ML_ERROR, "Invalid name length!"); + goto done; + } + if (deref->node_idx >= O2NM_MAX_NODES) { + mlog(ML_ERROR, "Invalid node number: %u\n", node); + goto done; + } + + hash = dlm_lockid_hash(name, namelen); + + spin_lock(&dlm->spinlock); + res = __dlm_lookup_lockres_full(dlm, name, namelen, hash); + if (!res) { + spin_unlock(&dlm->spinlock); + mlog(ML_ERROR, "%s:%.*s: bad lockres name\n", + dlm->name, namelen, name); + goto done; + } + + spin_lock(&res->spinlock); + BUG_ON(!(res->state & DLM_LOCK_RES_DROPPING_REF)); + if (!list_empty(&res->purge)) { + mlog(0, "%s: Removing res %.*s from purgelist\n", + dlm->name, res->lockname.len, res->lockname.name); + list_del_init(&res->purge); + dlm_lockres_put(res); + dlm->purge_count--; + } + + if (!__dlm_lockres_unused(res)) { + mlog(ML_ERROR, "%s: res %.*s in use after deref\n", + dlm->name, res->lockname.len, res->lockname.name); + __dlm_print_one_lock_resource(res); + BUG(); + } + + __dlm_unhash_lockres(dlm, res); + + spin_lock(&dlm->track_lock); + if (!list_empty(&res->tracking)) + list_del_init(&res->tracking); + else { + mlog(ML_ERROR, "%s: Resource %.*s not on the Tracking list\n", + dlm->name, res->lockname.len, res->lockname.name); + __dlm_print_one_lock_resource(res); + } + spin_unlock(&dlm->track_lock); + + /* lockres is not in the hash now. drop the flag and wake up + * any processes waiting in dlm_get_lock_resource. + */ + res->state &= ~DLM_LOCK_RES_DROPPING_REF; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + + dlm_lockres_put(res); + + spin_unlock(&dlm->spinlock); + + ret = 0; + +done: + dlm_put(dlm); + return ret; +} + +static void dlm_drop_lockres_ref_done(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, u8 node) +{ + struct dlm_deref_lockres_done deref; + int ret = 0, r; + const char *lockname; + unsigned int namelen; + + lockname = res->lockname.name; + namelen = res->lockname.len; + BUG_ON(namelen > O2NM_MAX_NAME_LEN); + + memset(&deref, 0, sizeof(deref)); + deref.node_idx = dlm->node_num; + deref.namelen = namelen; + memcpy(deref.name, lockname, namelen); + + ret = o2net_send_message(DLM_DEREF_LOCKRES_DONE, dlm->key, + &deref, sizeof(deref), node, &r); + if (ret < 0) { + mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF DONE " + " to node %u\n", dlm->name, namelen, + lockname, ret, node); + } else if (r < 0) { + /* ignore the error */ + mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n", + dlm->name, namelen, lockname, node, r); + dlm_print_one_lock_resource(res); + } +} + static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) { struct dlm_ctxt *dlm; @@ -2395,6 +2513,8 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) } spin_unlock(&res->spinlock); + dlm_drop_lockres_ref_done(dlm, res, node); + if (cleared) { mlog(0, "%s:%.*s node %u ref dropped in dispatch\n", dlm->name, res->lockname.len, res->lockname.name, node); @@ -2432,7 +2552,8 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, return 0; /* delay migration when the lockres is in RECOCERING state */ - if (res->state & DLM_LOCK_RES_RECOVERING) + if (res->state & (DLM_LOCK_RES_RECOVERING| + DLM_LOCK_RES_RECOVERY_WAITING)) return 0; if (res->owner != dlm->node_num) diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 23d0ab881..f6b313898 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1403,12 +1403,24 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, * and RECOVERY flag changed when it completes. */ hash = dlm_lockid_hash(mres->lockname, mres->lockname_len); spin_lock(&dlm->spinlock); - res = __dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len, + res = __dlm_lookup_lockres_full(dlm, mres->lockname, mres->lockname_len, hash); if (res) { /* this will get a ref on res */ /* mark it as recovering/migrating and hash it */ spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_DROPPING_REF) { + mlog(0, "%s: node is attempting to migrate " + "lockres %.*s, but marked as dropping " + " ref!\n", dlm->name, + mres->lockname_len, mres->lockname); + ret = -EINVAL; + spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); + dlm_lockres_put(res); + goto leave; + } + if (mres->flags & DLM_MRES_RECOVERY) { res->state |= DLM_LOCK_RES_RECOVERING; } else { @@ -2162,6 +2174,13 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, for (i = 0; i < DLM_HASH_BUCKETS; i++) { bucket = dlm_lockres_hash(dlm, i); hlist_for_each_entry(res, bucket, hash_node) { + if (res->state & DLM_LOCK_RES_RECOVERY_WAITING) { + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_RECOVERY_WAITING; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + } + if (!(res->state & DLM_LOCK_RES_RECOVERING)) continue; @@ -2299,6 +2318,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, res->lockname.len, res->lockname.name, freed, dead_node); __dlm_print_one_lock_resource(res); } + res->state |= DLM_LOCK_RES_RECOVERY_WAITING; dlm_lockres_clear_refmap_bit(dlm, res, dead_node); } else if (test_bit(dead_node, res->refmap)) { mlog(0, "%s:%.*s: dead node %u had a ref, but had " @@ -2376,14 +2396,16 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) dlm_revalidate_lvb(dlm, res, dead_node); if (res->owner == dead_node) { if (res->state & DLM_LOCK_RES_DROPPING_REF) { - mlog(ML_NOTICE, "%s: res %.*s, Skip " - "recovery as it is being freed\n", - dlm->name, res->lockname.len, - res->lockname.name); - } else - dlm_move_lockres_to_recovery_list(dlm, - res); - + mlog(0, "%s:%.*s: owned by " + "dead node %u, this node was " + "dropping its ref when it died. " + "continue, dropping the flag.\n", + dlm->name, res->lockname.len, + res->lockname.name, dead_node); + } + res->state &= ~DLM_LOCK_RES_DROPPING_REF; + dlm_move_lockres_to_recovery_list(dlm, + res); } else if (res->owner == dlm->node_num) { dlm_free_dead_locks(dlm, res, dead_node); __dlm_lockres_calc_usage(dlm, res); diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index c5f6c241e..68d239ba0 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -106,7 +106,8 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res) if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY) return 0; - if (res->state & DLM_LOCK_RES_RECOVERING) + if (res->state & (DLM_LOCK_RES_RECOVERING| + DLM_LOCK_RES_RECOVERY_WAITING)) return 0; /* Another node has this resource with this node as the master */ @@ -202,6 +203,13 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm, dlm->purge_count--; } + if (!master && ret != 0) { + mlog(0, "%s: deref %.*s in progress or master goes down\n", + dlm->name, res->lockname.len, res->lockname.name); + spin_unlock(&res->spinlock); + return; + } + if (!__dlm_lockres_unused(res)) { mlog(ML_ERROR, "%s: res %.*s in use after deref\n", dlm->name, res->lockname.len, res->lockname.name); @@ -700,7 +708,8 @@ static int dlm_thread(void *data) * dirty for a short while. */ BUG_ON(res->state & DLM_LOCK_RES_MIGRATING); if (res->state & (DLM_LOCK_RES_IN_PROGRESS | - DLM_LOCK_RES_RECOVERING)) { + DLM_LOCK_RES_RECOVERING | + DLM_LOCK_RES_RECOVERY_WAITING)) { /* move it to the tail and keep going */ res->state &= ~DLM_LOCK_RES_DIRTY; spin_unlock(&res->spinlock); diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 03768bb3a..47b3b2d4e 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -571,8 +571,8 @@ static int dlmfs_fill_super(struct super_block * sb, int silent) { sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = DLMFS_MAGIC; sb->s_op = &dlmfs_ops; sb->s_root = d_make_root(dlmfs_get_root_inode(sb)); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 7cb38fdca..59cce53c9 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -770,14 +770,14 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, { struct address_space *mapping = inode->i_mapping; struct page *page; - unsigned long index = abs_from >> PAGE_CACHE_SHIFT; + unsigned long index = abs_from >> PAGE_SHIFT; handle_t *handle; int ret = 0; unsigned zero_from, zero_to, block_start, block_end; struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; BUG_ON(abs_from >= abs_to); - BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT)); + BUG_ON(abs_to > (((u64)index + 1) << PAGE_SHIFT)); BUG_ON(abs_from & (inode->i_blkbits - 1)); handle = ocfs2_zero_start_ordered_transaction(inode, di_bh); @@ -794,10 +794,10 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, } /* Get the offsets within the page that we want to zero */ - zero_from = abs_from & (PAGE_CACHE_SIZE - 1); - zero_to = abs_to & (PAGE_CACHE_SIZE - 1); + zero_from = abs_from & (PAGE_SIZE - 1); + zero_to = abs_to & (PAGE_SIZE - 1); if (!zero_to) - zero_to = PAGE_CACHE_SIZE; + zero_to = PAGE_SIZE; trace_ocfs2_write_zero_page( (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -851,7 +851,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, out_unlock: unlock_page(page); - page_cache_release(page); + put_page(page); out_commit_trans: if (handle) ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); @@ -959,7 +959,7 @@ static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start, BUG_ON(range_start >= range_end); while (zero_pos < range_end) { - next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE; + next_pos = (zero_pos & PAGE_MASK) + PAGE_SIZE; if (next_pos > range_end) next_pos = range_end; rc = ocfs2_write_zero_page(inode, zero_pos, next_pos, di_bh); @@ -1268,20 +1268,20 @@ bail_unlock_rw: if (size_change) ocfs2_rw_unlock(inode, 1); bail: - brelse(bh); /* Release quota pointers in case we acquired them */ for (qtype = 0; qtype < OCFS2_MAXQUOTAS; qtype++) dqput(transfer_to[qtype]); if (!status && attr->ia_valid & ATTR_MODE) { - status = posix_acl_chmod(inode, inode->i_mode); + status = ocfs2_acl_chmod(inode, bh); if (status < 0) mlog_errno(status); } if (inode_locked) ocfs2_inode_unlock(inode, 1); + brelse(bh); return status; } @@ -1381,44 +1381,6 @@ out: return ret; } -/* - * Will look for holes and unwritten extents in the range starting at - * pos for count bytes (inclusive). - */ -static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos, - size_t count) -{ - int ret = 0; - unsigned int extent_flags; - u32 cpos, clusters, extent_len, phys_cpos; - struct super_block *sb = inode->i_sb; - - cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; - clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; - - while (clusters) { - ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, - &extent_flags); - if (ret < 0) { - mlog_errno(ret); - goto out; - } - - if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) { - ret = 1; - break; - } - - if (extent_len > clusters) - extent_len = clusters; - - clusters -= extent_len; - cpos += extent_len; - } -out: - return ret; -} - static int ocfs2_write_remove_suid(struct inode *inode) { int ret; @@ -2129,18 +2091,12 @@ out: static int ocfs2_prepare_inode_for_write(struct file *file, loff_t pos, - size_t count, - int appending, - int *direct_io, - int *has_refcount) + size_t count) { int ret = 0, meta_level = 0; struct dentry *dentry = file->f_path.dentry; struct inode *inode = d_inode(dentry); loff_t end; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - int full_coherency = !(osb->s_mount_opt & - OCFS2_MOUNT_COHERENCY_BUFFERED); /* * We start with a read level meta lock and only jump to an ex @@ -2189,10 +2145,6 @@ static int ocfs2_prepare_inode_for_write(struct file *file, pos, count, &meta_level); - if (has_refcount) - *has_refcount = 1; - if (direct_io) - *direct_io = 0; } if (ret < 0) { @@ -2200,67 +2152,12 @@ static int ocfs2_prepare_inode_for_write(struct file *file, goto out_unlock; } - /* - * Skip the O_DIRECT checks if we don't need - * them. - */ - if (!direct_io || !(*direct_io)) - break; - - /* - * There's no sane way to do direct writes to an inode - * with inline data. - */ - if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { - *direct_io = 0; - break; - } - - /* - * Allowing concurrent direct writes means - * i_size changes wouldn't be synchronized, so - * one node could wind up truncating another - * nodes writes. - */ - if (end > i_size_read(inode) && !full_coherency) { - *direct_io = 0; - break; - } - - /* - * Fallback to old way if the feature bit is not set. - */ - if (end > i_size_read(inode) && - !ocfs2_supports_append_dio(osb)) { - *direct_io = 0; - break; - } - - /* - * We don't fill holes during direct io, so - * check for them here. If any are found, the - * caller will have to retake some cluster - * locks and initiate the io as buffered. - */ - ret = ocfs2_check_range_for_holes(inode, pos, count); - if (ret == 1) { - /* - * Fallback to old way if the feature bit is not set. - * Otherwise try dio first and then complete the rest - * request through buffer io. - */ - if (!ocfs2_supports_append_dio(osb)) - *direct_io = 0; - ret = 0; - } else if (ret < 0) - mlog_errno(ret); break; } out_unlock: trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno, - pos, appending, count, - direct_io, has_refcount); + pos, count); if (meta_level >= 0) ocfs2_inode_unlock(inode, meta_level); @@ -2272,18 +2169,16 @@ out: static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { - int direct_io, appending, rw_level; - int can_do_direct, has_refcount = 0; + int direct_io, rw_level; ssize_t written = 0; ssize_t ret; - size_t count = iov_iter_count(from), orig_count; + size_t count = iov_iter_count(from); struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); int full_coherency = !(osb->s_mount_opt & OCFS2_MOUNT_COHERENCY_BUFFERED); - int unaligned_dio = 0; - int dropped_dio = 0; + void *saved_ki_complete = NULL; int append_write = ((iocb->ki_pos + count) >= i_size_read(inode) ? 1 : 0); @@ -2296,12 +2191,10 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, if (count == 0) return 0; - appending = iocb->ki_flags & IOCB_APPEND ? 1 : 0; direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; inode_lock(inode); -relock: /* * Concurrent O_DIRECT writes are allowed with * mount_option "coherency=buffered". @@ -2334,7 +2227,6 @@ relock: ocfs2_inode_unlock(inode, 1); } - orig_count = iov_iter_count(from); ret = generic_write_checks(iocb, from); if (ret <= 0) { if (ret) @@ -2343,41 +2235,18 @@ relock: } count = ret; - can_do_direct = direct_io; - ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, appending, - &can_do_direct, &has_refcount); + ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count); if (ret < 0) { mlog_errno(ret); goto out; } - if (direct_io && !is_sync_kiocb(iocb)) - unaligned_dio = ocfs2_is_io_unaligned(inode, count, iocb->ki_pos); - - /* - * We can't complete the direct I/O as requested, fall back to - * buffered I/O. - */ - if (direct_io && !can_do_direct) { - ocfs2_rw_unlock(inode, rw_level); - - rw_level = -1; - - direct_io = 0; - iocb->ki_flags &= ~IOCB_DIRECT; - iov_iter_reexpand(from, orig_count); - dropped_dio = 1; - goto relock; - } - - if (unaligned_dio) { + if (direct_io && !is_sync_kiocb(iocb) && + ocfs2_is_io_unaligned(inode, count, iocb->ki_pos)) { /* - * Wait on previous unaligned aio to complete before - * proceeding. + * Make it a sync io if it's an unaligned aio. */ - mutex_lock(&OCFS2_I(inode)->ip_unaligned_aio); - /* Mark the iocb as needing an unlock in ocfs2_dio_end_io */ - ocfs2_iocb_set_unaligned_aio(iocb); + saved_ki_complete = xchg(&iocb->ki_complete, NULL); } /* communicate with ocfs2_dio_end_io */ @@ -2398,14 +2267,13 @@ relock: */ if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { rw_level = -1; - unaligned_dio = 0; } if (unlikely(written <= 0)) - goto no_sync; + goto out; if (((file->f_flags & O_DSYNC) && !direct_io) || - IS_SYNC(inode) || dropped_dio) { + IS_SYNC(inode)) { ret = filemap_fdatawrite_range(file->f_mapping, iocb->ki_pos - written, iocb->ki_pos - 1); @@ -2424,13 +2292,10 @@ relock: iocb->ki_pos - 1); } -no_sync: - if (unaligned_dio && ocfs2_iocb_is_unaligned_aio(iocb)) { - ocfs2_iocb_clear_unaligned_aio(iocb); - mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); - } - out: + if (saved_ki_complete) + xchg(&iocb->ki_complete, saved_ki_complete); + if (rw_level != -1) ocfs2_rw_unlock(inode, rw_level); diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c new file mode 100644 index 000000000..2cabbcf2f --- /dev/null +++ b/fs/ocfs2/filecheck.c @@ -0,0 +1,606 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * filecheck.c + * + * Code which implements online file check. + * + * Copyright (C) 2016 SuSE. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/kmod.h> +#include <linux/fs.h> +#include <linux/kobject.h> +#include <linux/sysfs.h> +#include <linux/sysctl.h> +#include <cluster/masklog.h> + +#include "ocfs2.h" +#include "ocfs2_fs.h" +#include "stackglue.h" +#include "inode.h" + +#include "filecheck.h" + + +/* File check error strings, + * must correspond with error number in header file. + */ +static const char * const ocfs2_filecheck_errs[] = { + "SUCCESS", + "FAILED", + "INPROGRESS", + "READONLY", + "INJBD", + "INVALIDINO", + "BLOCKECC", + "BLOCKNO", + "VALIDFLAG", + "GENERATION", + "UNSUPPORTED" +}; + +static DEFINE_SPINLOCK(ocfs2_filecheck_sysfs_lock); +static LIST_HEAD(ocfs2_filecheck_sysfs_list); + +struct ocfs2_filecheck { + struct list_head fc_head; /* File check entry list head */ + spinlock_t fc_lock; + unsigned int fc_max; /* Maximum number of entry in list */ + unsigned int fc_size; /* Current entry count in list */ + unsigned int fc_done; /* Finished entry count in list */ +}; + +struct ocfs2_filecheck_sysfs_entry { /* sysfs entry per mounting */ + struct list_head fs_list; + atomic_t fs_count; + struct super_block *fs_sb; + struct kset *fs_devicekset; + struct kset *fs_fcheckkset; + struct ocfs2_filecheck *fs_fcheck; +}; + +#define OCFS2_FILECHECK_MAXSIZE 100 +#define OCFS2_FILECHECK_MINSIZE 10 + +/* File check operation type */ +enum { + OCFS2_FILECHECK_TYPE_CHK = 0, /* Check a file(inode) */ + OCFS2_FILECHECK_TYPE_FIX, /* Fix a file(inode) */ + OCFS2_FILECHECK_TYPE_SET = 100 /* Set entry list maximum size */ +}; + +struct ocfs2_filecheck_entry { + struct list_head fe_list; + unsigned long fe_ino; + unsigned int fe_type; + unsigned int fe_done:1; + unsigned int fe_status:31; +}; + +struct ocfs2_filecheck_args { + unsigned int fa_type; + union { + unsigned long fa_ino; + unsigned int fa_len; + }; +}; + +static const char * +ocfs2_filecheck_error(int errno) +{ + if (!errno) + return ocfs2_filecheck_errs[errno]; + + BUG_ON(errno < OCFS2_FILECHECK_ERR_START || + errno > OCFS2_FILECHECK_ERR_END); + return ocfs2_filecheck_errs[errno - OCFS2_FILECHECK_ERR_START + 1]; +} + +static ssize_t ocfs2_filecheck_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf); +static ssize_t ocfs2_filecheck_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count); +static struct kobj_attribute ocfs2_attr_filecheck_chk = + __ATTR(check, S_IRUSR | S_IWUSR, + ocfs2_filecheck_show, + ocfs2_filecheck_store); +static struct kobj_attribute ocfs2_attr_filecheck_fix = + __ATTR(fix, S_IRUSR | S_IWUSR, + ocfs2_filecheck_show, + ocfs2_filecheck_store); +static struct kobj_attribute ocfs2_attr_filecheck_set = + __ATTR(set, S_IRUSR | S_IWUSR, + ocfs2_filecheck_show, + ocfs2_filecheck_store); + +static int ocfs2_filecheck_sysfs_wait(atomic_t *p) +{ + schedule(); + return 0; +} + +static void +ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry) +{ + struct ocfs2_filecheck_entry *p; + + if (!atomic_dec_and_test(&entry->fs_count)) + wait_on_atomic_t(&entry->fs_count, ocfs2_filecheck_sysfs_wait, + TASK_UNINTERRUPTIBLE); + + spin_lock(&entry->fs_fcheck->fc_lock); + while (!list_empty(&entry->fs_fcheck->fc_head)) { + p = list_first_entry(&entry->fs_fcheck->fc_head, + struct ocfs2_filecheck_entry, fe_list); + list_del(&p->fe_list); + BUG_ON(!p->fe_done); /* To free a undone file check entry */ + kfree(p); + } + spin_unlock(&entry->fs_fcheck->fc_lock); + + kset_unregister(entry->fs_fcheckkset); + kset_unregister(entry->fs_devicekset); + kfree(entry->fs_fcheck); + kfree(entry); +} + +static void +ocfs2_filecheck_sysfs_add(struct ocfs2_filecheck_sysfs_entry *entry) +{ + spin_lock(&ocfs2_filecheck_sysfs_lock); + list_add_tail(&entry->fs_list, &ocfs2_filecheck_sysfs_list); + spin_unlock(&ocfs2_filecheck_sysfs_lock); +} + +static int ocfs2_filecheck_sysfs_del(const char *devname) +{ + struct ocfs2_filecheck_sysfs_entry *p; + + spin_lock(&ocfs2_filecheck_sysfs_lock); + list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) { + if (!strcmp(p->fs_sb->s_id, devname)) { + list_del(&p->fs_list); + spin_unlock(&ocfs2_filecheck_sysfs_lock); + ocfs2_filecheck_sysfs_free(p); + return 0; + } + } + spin_unlock(&ocfs2_filecheck_sysfs_lock); + return 1; +} + +static void +ocfs2_filecheck_sysfs_put(struct ocfs2_filecheck_sysfs_entry *entry) +{ + if (atomic_dec_and_test(&entry->fs_count)) + wake_up_atomic_t(&entry->fs_count); +} + +static struct ocfs2_filecheck_sysfs_entry * +ocfs2_filecheck_sysfs_get(const char *devname) +{ + struct ocfs2_filecheck_sysfs_entry *p = NULL; + + spin_lock(&ocfs2_filecheck_sysfs_lock); + list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) { + if (!strcmp(p->fs_sb->s_id, devname)) { + atomic_inc(&p->fs_count); + spin_unlock(&ocfs2_filecheck_sysfs_lock); + return p; + } + } + spin_unlock(&ocfs2_filecheck_sysfs_lock); + return NULL; +} + +int ocfs2_filecheck_create_sysfs(struct super_block *sb) +{ + int ret = 0; + struct kset *device_kset = NULL; + struct kset *fcheck_kset = NULL; + struct ocfs2_filecheck *fcheck = NULL; + struct ocfs2_filecheck_sysfs_entry *entry = NULL; + struct attribute **attrs = NULL; + struct attribute_group attrgp; + + if (!ocfs2_kset) + return -ENOMEM; + + attrs = kmalloc(sizeof(struct attribute *) * 4, GFP_NOFS); + if (!attrs) { + ret = -ENOMEM; + goto error; + } else { + attrs[0] = &ocfs2_attr_filecheck_chk.attr; + attrs[1] = &ocfs2_attr_filecheck_fix.attr; + attrs[2] = &ocfs2_attr_filecheck_set.attr; + attrs[3] = NULL; + memset(&attrgp, 0, sizeof(attrgp)); + attrgp.attrs = attrs; + } + + fcheck = kmalloc(sizeof(struct ocfs2_filecheck), GFP_NOFS); + if (!fcheck) { + ret = -ENOMEM; + goto error; + } else { + INIT_LIST_HEAD(&fcheck->fc_head); + spin_lock_init(&fcheck->fc_lock); + fcheck->fc_max = OCFS2_FILECHECK_MINSIZE; + fcheck->fc_size = 0; + fcheck->fc_done = 0; + } + + if (strlen(sb->s_id) <= 0) { + mlog(ML_ERROR, + "Cannot get device basename when create filecheck sysfs\n"); + ret = -ENODEV; + goto error; + } + + device_kset = kset_create_and_add(sb->s_id, NULL, &ocfs2_kset->kobj); + if (!device_kset) { + ret = -ENOMEM; + goto error; + } + + fcheck_kset = kset_create_and_add("filecheck", NULL, + &device_kset->kobj); + if (!fcheck_kset) { + ret = -ENOMEM; + goto error; + } + + ret = sysfs_create_group(&fcheck_kset->kobj, &attrgp); + if (ret) + goto error; + + entry = kmalloc(sizeof(struct ocfs2_filecheck_sysfs_entry), GFP_NOFS); + if (!entry) { + ret = -ENOMEM; + goto error; + } else { + atomic_set(&entry->fs_count, 1); + entry->fs_sb = sb; + entry->fs_devicekset = device_kset; + entry->fs_fcheckkset = fcheck_kset; + entry->fs_fcheck = fcheck; + ocfs2_filecheck_sysfs_add(entry); + } + + kfree(attrs); + return 0; + +error: + kfree(attrs); + kfree(entry); + kfree(fcheck); + kset_unregister(fcheck_kset); + kset_unregister(device_kset); + return ret; +} + +int ocfs2_filecheck_remove_sysfs(struct super_block *sb) +{ + return ocfs2_filecheck_sysfs_del(sb->s_id); +} + +static int +ocfs2_filecheck_erase_entries(struct ocfs2_filecheck_sysfs_entry *ent, + unsigned int count); +static int +ocfs2_filecheck_adjust_max(struct ocfs2_filecheck_sysfs_entry *ent, + unsigned int len) +{ + int ret; + + if ((len < OCFS2_FILECHECK_MINSIZE) || (len > OCFS2_FILECHECK_MAXSIZE)) + return -EINVAL; + + spin_lock(&ent->fs_fcheck->fc_lock); + if (len < (ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done)) { + mlog(ML_ERROR, + "Cannot set online file check maximum entry number " + "to %u due to too many pending entries(%u)\n", + len, ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done); + ret = -EBUSY; + } else { + if (len < ent->fs_fcheck->fc_size) + BUG_ON(!ocfs2_filecheck_erase_entries(ent, + ent->fs_fcheck->fc_size - len)); + + ent->fs_fcheck->fc_max = len; + ret = 0; + } + spin_unlock(&ent->fs_fcheck->fc_lock); + + return ret; +} + +#define OCFS2_FILECHECK_ARGS_LEN 24 +static int +ocfs2_filecheck_args_get_long(const char *buf, size_t count, + unsigned long *val) +{ + char buffer[OCFS2_FILECHECK_ARGS_LEN]; + + memcpy(buffer, buf, count); + buffer[count] = '\0'; + + if (kstrtoul(buffer, 0, val)) + return 1; + + return 0; +} + +static int +ocfs2_filecheck_type_parse(const char *name, unsigned int *type) +{ + if (!strncmp(name, "fix", 4)) + *type = OCFS2_FILECHECK_TYPE_FIX; + else if (!strncmp(name, "check", 6)) + *type = OCFS2_FILECHECK_TYPE_CHK; + else if (!strncmp(name, "set", 4)) + *type = OCFS2_FILECHECK_TYPE_SET; + else + return 1; + + return 0; +} + +static int +ocfs2_filecheck_args_parse(const char *name, const char *buf, size_t count, + struct ocfs2_filecheck_args *args) +{ + unsigned long val = 0; + unsigned int type; + + /* too short/long args length */ + if ((count < 1) || (count >= OCFS2_FILECHECK_ARGS_LEN)) + return 1; + + if (ocfs2_filecheck_type_parse(name, &type)) + return 1; + if (ocfs2_filecheck_args_get_long(buf, count, &val)) + return 1; + + if (val <= 0) + return 1; + + args->fa_type = type; + if (type == OCFS2_FILECHECK_TYPE_SET) + args->fa_len = (unsigned int)val; + else + args->fa_ino = val; + + return 0; +} + +static ssize_t ocfs2_filecheck_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + + ssize_t ret = 0, total = 0, remain = PAGE_SIZE; + unsigned int type; + struct ocfs2_filecheck_entry *p; + struct ocfs2_filecheck_sysfs_entry *ent; + + if (ocfs2_filecheck_type_parse(attr->attr.name, &type)) + return -EINVAL; + + ent = ocfs2_filecheck_sysfs_get(kobj->parent->name); + if (!ent) { + mlog(ML_ERROR, + "Cannot get the corresponding entry via device basename %s\n", + kobj->name); + return -ENODEV; + } + + if (type == OCFS2_FILECHECK_TYPE_SET) { + spin_lock(&ent->fs_fcheck->fc_lock); + total = snprintf(buf, remain, "%u\n", ent->fs_fcheck->fc_max); + spin_unlock(&ent->fs_fcheck->fc_lock); + goto exit; + } + + ret = snprintf(buf, remain, "INO\t\tDONE\tERROR\n"); + total += ret; + remain -= ret; + spin_lock(&ent->fs_fcheck->fc_lock); + list_for_each_entry(p, &ent->fs_fcheck->fc_head, fe_list) { + if (p->fe_type != type) + continue; + + ret = snprintf(buf + total, remain, "%lu\t\t%u\t%s\n", + p->fe_ino, p->fe_done, + ocfs2_filecheck_error(p->fe_status)); + if (ret < 0) { + total = ret; + break; + } + if (ret == remain) { + /* snprintf() didn't fit */ + total = -E2BIG; + break; + } + total += ret; + remain -= ret; + } + spin_unlock(&ent->fs_fcheck->fc_lock); + +exit: + ocfs2_filecheck_sysfs_put(ent); + return total; +} + +static int +ocfs2_filecheck_erase_entry(struct ocfs2_filecheck_sysfs_entry *ent) +{ + struct ocfs2_filecheck_entry *p; + + list_for_each_entry(p, &ent->fs_fcheck->fc_head, fe_list) { + if (p->fe_done) { + list_del(&p->fe_list); + kfree(p); + ent->fs_fcheck->fc_size--; + ent->fs_fcheck->fc_done--; + return 1; + } + } + + return 0; +} + +static int +ocfs2_filecheck_erase_entries(struct ocfs2_filecheck_sysfs_entry *ent, + unsigned int count) +{ + unsigned int i = 0; + unsigned int ret = 0; + + while (i++ < count) { + if (ocfs2_filecheck_erase_entry(ent)) + ret++; + else + break; + } + + return (ret == count ? 1 : 0); +} + +static void +ocfs2_filecheck_done_entry(struct ocfs2_filecheck_sysfs_entry *ent, + struct ocfs2_filecheck_entry *entry) +{ + entry->fe_done = 1; + spin_lock(&ent->fs_fcheck->fc_lock); + ent->fs_fcheck->fc_done++; + spin_unlock(&ent->fs_fcheck->fc_lock); +} + +static unsigned int +ocfs2_filecheck_handle(struct super_block *sb, + unsigned long ino, unsigned int flags) +{ + unsigned int ret = OCFS2_FILECHECK_ERR_SUCCESS; + struct inode *inode = NULL; + int rc; + + inode = ocfs2_iget(OCFS2_SB(sb), ino, flags, 0); + if (IS_ERR(inode)) { + rc = (int)(-(long)inode); + if (rc >= OCFS2_FILECHECK_ERR_START && + rc < OCFS2_FILECHECK_ERR_END) + ret = rc; + else + ret = OCFS2_FILECHECK_ERR_FAILED; + } else + iput(inode); + + return ret; +} + +static void +ocfs2_filecheck_handle_entry(struct ocfs2_filecheck_sysfs_entry *ent, + struct ocfs2_filecheck_entry *entry) +{ + if (entry->fe_type == OCFS2_FILECHECK_TYPE_CHK) + entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb, + entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_CHK); + else if (entry->fe_type == OCFS2_FILECHECK_TYPE_FIX) + entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb, + entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_FIX); + else + entry->fe_status = OCFS2_FILECHECK_ERR_UNSUPPORTED; + + ocfs2_filecheck_done_entry(ent, entry); +} + +static ssize_t ocfs2_filecheck_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct ocfs2_filecheck_args args; + struct ocfs2_filecheck_entry *entry; + struct ocfs2_filecheck_sysfs_entry *ent; + ssize_t ret = 0; + + if (count == 0) + return count; + + if (ocfs2_filecheck_args_parse(attr->attr.name, buf, count, &args)) { + mlog(ML_ERROR, "Invalid arguments for online file check\n"); + return -EINVAL; + } + + ent = ocfs2_filecheck_sysfs_get(kobj->parent->name); + if (!ent) { + mlog(ML_ERROR, + "Cannot get the corresponding entry via device basename %s\n", + kobj->parent->name); + return -ENODEV; + } + + if (args.fa_type == OCFS2_FILECHECK_TYPE_SET) { + ret = ocfs2_filecheck_adjust_max(ent, args.fa_len); + goto exit; + } + + entry = kmalloc(sizeof(struct ocfs2_filecheck_entry), GFP_NOFS); + if (!entry) { + ret = -ENOMEM; + goto exit; + } + + spin_lock(&ent->fs_fcheck->fc_lock); + if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) && + (ent->fs_fcheck->fc_done == 0)) { + mlog(ML_ERROR, + "Cannot do more file check " + "since file check queue(%u) is full now\n", + ent->fs_fcheck->fc_max); + ret = -EBUSY; + kfree(entry); + } else { + if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) && + (ent->fs_fcheck->fc_done > 0)) { + /* Delete the oldest entry which was done, + * make sure the entry size in list does + * not exceed maximum value + */ + BUG_ON(!ocfs2_filecheck_erase_entry(ent)); + } + + entry->fe_ino = args.fa_ino; + entry->fe_type = args.fa_type; + entry->fe_done = 0; + entry->fe_status = OCFS2_FILECHECK_ERR_INPROGRESS; + list_add_tail(&entry->fe_list, &ent->fs_fcheck->fc_head); + ent->fs_fcheck->fc_size++; + } + spin_unlock(&ent->fs_fcheck->fc_lock); + + if (!ret) + ocfs2_filecheck_handle_entry(ent, entry); + +exit: + ocfs2_filecheck_sysfs_put(ent); + return (!ret ? count : ret); +} diff --git a/fs/ocfs2/filecheck.h b/fs/ocfs2/filecheck.h new file mode 100644 index 000000000..e5cd002a2 --- /dev/null +++ b/fs/ocfs2/filecheck.h @@ -0,0 +1,49 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * filecheck.h + * + * Online file check. + * + * Copyright (C) 2016 SuSE. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + + +#ifndef FILECHECK_H +#define FILECHECK_H + +#include <linux/types.h> +#include <linux/list.h> + + +/* File check errno */ +enum { + OCFS2_FILECHECK_ERR_SUCCESS = 0, /* Success */ + OCFS2_FILECHECK_ERR_FAILED = 1000, /* Other failure */ + OCFS2_FILECHECK_ERR_INPROGRESS, /* In progress */ + OCFS2_FILECHECK_ERR_READONLY, /* Read only */ + OCFS2_FILECHECK_ERR_INJBD, /* Buffer in jbd */ + OCFS2_FILECHECK_ERR_INVALIDINO, /* Invalid ino */ + OCFS2_FILECHECK_ERR_BLOCKECC, /* Block ecc */ + OCFS2_FILECHECK_ERR_BLOCKNO, /* Block number */ + OCFS2_FILECHECK_ERR_VALIDFLAG, /* Inode valid flag */ + OCFS2_FILECHECK_ERR_GENERATION, /* Inode generation */ + OCFS2_FILECHECK_ERR_UNSUPPORTED /* Unsupported */ +}; + +#define OCFS2_FILECHECK_ERR_START OCFS2_FILECHECK_ERR_FAILED +#define OCFS2_FILECHECK_ERR_END OCFS2_FILECHECK_ERR_UNSUPPORTED + +int ocfs2_filecheck_create_sysfs(struct super_block *sb); +int ocfs2_filecheck_remove_sysfs(struct super_block *sb); + +#endif /* FILECHECK_H */ diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 36294446d..12f4a9e98 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -53,6 +53,7 @@ #include "xattr.h" #include "refcounttree.h" #include "ocfs2_trace.h" +#include "filecheck.h" #include "buffer_head_io.h" @@ -74,6 +75,14 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, struct inode *inode, struct buffer_head *fe_bh); +static int ocfs2_filecheck_read_inode_block_full(struct inode *inode, + struct buffer_head **bh, + int flags, int type); +static int ocfs2_filecheck_validate_inode_block(struct super_block *sb, + struct buffer_head *bh); +static int ocfs2_filecheck_repair_inode_block(struct super_block *sb, + struct buffer_head *bh); + void ocfs2_set_inode_flags(struct inode *inode) { unsigned int flags = OCFS2_I(inode)->ip_attr; @@ -127,6 +136,7 @@ struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno) struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, int sysfile_type) { + int rc = 0; struct inode *inode = NULL; struct super_block *sb = osb->sb; struct ocfs2_find_inode_args args; @@ -161,12 +171,17 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, } trace_ocfs2_iget5_locked(inode->i_state); if (inode->i_state & I_NEW) { - ocfs2_read_locked_inode(inode, &args); + rc = ocfs2_read_locked_inode(inode, &args); unlock_new_inode(inode); } if (is_bad_inode(inode)) { iput(inode); - inode = ERR_PTR(-ESTALE); + if ((flags & OCFS2_FI_FLAG_FILECHECK_CHK) || + (flags & OCFS2_FI_FLAG_FILECHECK_FIX)) + /* Return OCFS2_FILECHECK_ERR_XXX related errno */ + inode = ERR_PTR(rc); + else + inode = ERR_PTR(-ESTALE); goto bail; } @@ -410,7 +425,7 @@ static int ocfs2_read_locked_inode(struct inode *inode, struct ocfs2_super *osb; struct ocfs2_dinode *fe; struct buffer_head *bh = NULL; - int status, can_lock; + int status, can_lock, lock_level = 0; u32 generation = 0; status = -EINVAL; @@ -478,7 +493,7 @@ static int ocfs2_read_locked_inode(struct inode *inode, mlog_errno(status); return status; } - status = ocfs2_inode_lock(inode, NULL, 0); + status = ocfs2_inode_lock(inode, NULL, lock_level); if (status) { make_bad_inode(inode); mlog_errno(status); @@ -495,16 +510,32 @@ static int ocfs2_read_locked_inode(struct inode *inode, } if (can_lock) { - status = ocfs2_read_inode_block_full(inode, &bh, - OCFS2_BH_IGNORE_CACHE); + if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK) + status = ocfs2_filecheck_read_inode_block_full(inode, + &bh, OCFS2_BH_IGNORE_CACHE, 0); + else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX) + status = ocfs2_filecheck_read_inode_block_full(inode, + &bh, OCFS2_BH_IGNORE_CACHE, 1); + else + status = ocfs2_read_inode_block_full(inode, + &bh, OCFS2_BH_IGNORE_CACHE); } else { status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); /* * If buffer is in jbd, then its checksum may not have been * computed as yet. */ - if (!status && !buffer_jbd(bh)) - status = ocfs2_validate_inode_block(osb->sb, bh); + if (!status && !buffer_jbd(bh)) { + if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK) + status = ocfs2_filecheck_validate_inode_block( + osb->sb, bh); + else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX) + status = ocfs2_filecheck_repair_inode_block( + osb->sb, bh); + else + status = ocfs2_validate_inode_block( + osb->sb, bh); + } } if (status < 0) { mlog_errno(status); @@ -532,11 +563,24 @@ static int ocfs2_read_locked_inode(struct inode *inode, BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); + if (buffer_dirty(bh) && !buffer_jbd(bh)) { + if (can_lock) { + ocfs2_inode_unlock(inode, lock_level); + lock_level = 1; + ocfs2_inode_lock(inode, NULL, lock_level); + } + status = ocfs2_write_block(osb, bh, INODE_CACHE(inode)); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + status = 0; bail: if (can_lock) - ocfs2_inode_unlock(inode, 0); + ocfs2_inode_unlock(inode, lock_level); if (status < 0) make_bad_inode(inode); @@ -1126,6 +1170,9 @@ static void ocfs2_clear_inode(struct inode *inode) mlog_bug_on_msg(!list_empty(&oi->ip_io_markers), "Clear inode of %llu, inode has io markers\n", (unsigned long long)oi->ip_blkno); + mlog_bug_on_msg(!list_empty(&oi->ip_unwritten_list), + "Clear inode of %llu, inode has unwritten extents\n", + (unsigned long long)oi->ip_blkno); ocfs2_extent_map_trunc(inode, 0); @@ -1397,6 +1444,169 @@ bail: return rc; } +static int ocfs2_filecheck_validate_inode_block(struct super_block *sb, + struct buffer_head *bh) +{ + int rc = 0; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; + + trace_ocfs2_filecheck_validate_inode_block( + (unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * Call ocfs2_validate_meta_ecc() first since it has ecc repair + * function, but we should not return error immediately when ecc + * validation fails, because the reason is quite likely the invalid + * inode number inputed. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check); + if (rc) { + mlog(ML_ERROR, + "Filecheck: checksum failed for dinode %llu\n", + (unsigned long long)bh->b_blocknr); + rc = -OCFS2_FILECHECK_ERR_BLOCKECC; + } + + if (!OCFS2_IS_VALID_DINODE(di)) { + mlog(ML_ERROR, + "Filecheck: invalid dinode #%llu: signature = %.*s\n", + (unsigned long long)bh->b_blocknr, 7, di->i_signature); + rc = -OCFS2_FILECHECK_ERR_INVALIDINO; + goto bail; + } else if (rc) + goto bail; + + if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { + mlog(ML_ERROR, + "Filecheck: invalid dinode #%llu: i_blkno is %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(di->i_blkno)); + rc = -OCFS2_FILECHECK_ERR_BLOCKNO; + goto bail; + } + + if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { + mlog(ML_ERROR, + "Filecheck: invalid dinode #%llu: OCFS2_VALID_FL " + "not set\n", + (unsigned long long)bh->b_blocknr); + rc = -OCFS2_FILECHECK_ERR_VALIDFLAG; + goto bail; + } + + if (le32_to_cpu(di->i_fs_generation) != + OCFS2_SB(sb)->fs_generation) { + mlog(ML_ERROR, + "Filecheck: invalid dinode #%llu: fs_generation is %u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(di->i_fs_generation)); + rc = -OCFS2_FILECHECK_ERR_GENERATION; + goto bail; + } + +bail: + return rc; +} + +static int ocfs2_filecheck_repair_inode_block(struct super_block *sb, + struct buffer_head *bh) +{ + int changed = 0; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; + + if (!ocfs2_filecheck_validate_inode_block(sb, bh)) + return 0; + + trace_ocfs2_filecheck_repair_inode_block( + (unsigned long long)bh->b_blocknr); + + if (ocfs2_is_hard_readonly(OCFS2_SB(sb)) || + ocfs2_is_soft_readonly(OCFS2_SB(sb))) { + mlog(ML_ERROR, + "Filecheck: cannot repair dinode #%llu " + "on readonly filesystem\n", + (unsigned long long)bh->b_blocknr); + return -OCFS2_FILECHECK_ERR_READONLY; + } + + if (buffer_jbd(bh)) { + mlog(ML_ERROR, + "Filecheck: cannot repair dinode #%llu, " + "its buffer is in jbd\n", + (unsigned long long)bh->b_blocknr); + return -OCFS2_FILECHECK_ERR_INJBD; + } + + if (!OCFS2_IS_VALID_DINODE(di)) { + /* Cannot fix invalid inode block */ + return -OCFS2_FILECHECK_ERR_INVALIDINO; + } + + if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { + /* Cannot just add VALID_FL flag back as a fix, + * need more things to check here. + */ + return -OCFS2_FILECHECK_ERR_VALIDFLAG; + } + + if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { + di->i_blkno = cpu_to_le64(bh->b_blocknr); + changed = 1; + mlog(ML_ERROR, + "Filecheck: reset dinode #%llu: i_blkno to %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(di->i_blkno)); + } + + if (le32_to_cpu(di->i_fs_generation) != + OCFS2_SB(sb)->fs_generation) { + di->i_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); + changed = 1; + mlog(ML_ERROR, + "Filecheck: reset dinode #%llu: fs_generation to %u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(di->i_fs_generation)); + } + + if (changed || ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check)) { + ocfs2_compute_meta_ecc(sb, bh->b_data, &di->i_check); + mark_buffer_dirty(bh); + mlog(ML_ERROR, + "Filecheck: reset dinode #%llu: compute meta ecc\n", + (unsigned long long)bh->b_blocknr); + } + + return 0; +} + +static int +ocfs2_filecheck_read_inode_block_full(struct inode *inode, + struct buffer_head **bh, + int flags, int type) +{ + int rc; + struct buffer_head *tmp = *bh; + + if (!type) /* Check inode block */ + rc = ocfs2_read_blocks(INODE_CACHE(inode), + OCFS2_I(inode)->ip_blkno, + 1, &tmp, flags, + ocfs2_filecheck_validate_inode_block); + else /* Repair inode block */ + rc = ocfs2_read_blocks(INODE_CACHE(inode), + OCFS2_I(inode)->ip_blkno, + 1, &tmp, flags, + ocfs2_filecheck_repair_inode_block); + + /* If ocfs2_read_blocks() got us a new bh, pass it up. */ + if (!rc && !*bh) + *bh = tmp; + + return rc; +} + int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh, int flags) { diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index aac8b86f3..d8f3fc8d2 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -43,9 +43,6 @@ struct ocfs2_inode_info /* protects extended attribute changes on this inode */ struct rw_semaphore ip_xattr_sem; - /* Number of outstanding AIO's which are not page aligned */ - struct mutex ip_unaligned_aio; - /* These fields are protected by ip_lock */ spinlock_t ip_lock; u32 ip_open_count; @@ -57,6 +54,9 @@ struct ocfs2_inode_info u32 ip_flags; /* see below */ u32 ip_attr; /* inode attributes */ + /* Record unwritten extents during direct io. */ + struct list_head ip_unwritten_list; + /* protected by recovery_lock. */ struct inode *ip_next_orphan; @@ -139,6 +139,9 @@ int ocfs2_drop_inode(struct inode *inode); /* Flags for ocfs2_iget() */ #define OCFS2_FI_FLAG_SYSFILE 0x1 #define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2 +#define OCFS2_FI_FLAG_FILECHECK_CHK 0x4 +#define OCFS2_FI_FLAG_FILECHECK_FIX 0x8 + struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff); struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags, int sysfile_type); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 61b833b72..e607419cd 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -231,7 +231,7 @@ void ocfs2_recovery_exit(struct ocfs2_super *osb) /* At this point, we know that no more recovery threads can be * launched, so wait for any recovery completion work to * complete. */ - flush_workqueue(ocfs2_wq); + flush_workqueue(osb->ocfs2_wq); /* * Now that recovery is shut down, and the osb is about to be @@ -1326,7 +1326,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, spin_lock(&journal->j_lock); list_add_tail(&item->lri_list, &journal->j_la_cleanups); - queue_work(ocfs2_wq, &journal->j_recovery_work); + queue_work(journal->j_osb->ocfs2_wq, &journal->j_recovery_work); spin_unlock(&journal->j_lock); } @@ -1968,7 +1968,7 @@ static void ocfs2_orphan_scan_work(struct work_struct *work) mutex_lock(&os->os_lock); ocfs2_queue_orphan_scan(osb); if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE) - queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work, + queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work, ocfs2_orphan_scan_timeout()); mutex_unlock(&os->os_lock); } @@ -2008,7 +2008,7 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb) atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); else { atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE); - queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work, + queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work, ocfs2_orphan_scan_timeout()); } } diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 7d62c43a2..fe0d1f957 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -386,7 +386,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) struct ocfs2_dinode *alloc = NULL; cancel_delayed_work(&osb->la_enable_wq); - flush_workqueue(ocfs2_wq); + flush_workqueue(osb->ocfs2_wq); if (osb->local_alloc_state == OCFS2_LA_UNUSED) goto out; @@ -1085,7 +1085,7 @@ static int ocfs2_recalc_la_window(struct ocfs2_super *osb, } else { osb->local_alloc_state = OCFS2_LA_DISABLED; } - queue_delayed_work(ocfs2_wq, &osb->la_enable_wq, + queue_delayed_work(osb->ocfs2_wq, &osb->la_enable_wq, OCFS2_LA_ENABLE_INTERVAL); goto out_unlock; } diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 77ebc2bc1..71545ad46 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -65,13 +65,13 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; loff_t pos = page_offset(page); - unsigned int len = PAGE_CACHE_SIZE; + unsigned int len = PAGE_SIZE; pgoff_t last_index; struct page *locked_page = NULL; void *fsdata; loff_t size = i_size_read(inode); - last_index = (size - 1) >> PAGE_CACHE_SHIFT; + last_index = (size - 1) >> PAGE_SHIFT; /* * There are cases that lead to the page no longer bebongs to the @@ -102,10 +102,10 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, * because the "write" would invalidate their data. */ if (page->index == last_index) - len = ((size - 1) & ~PAGE_CACHE_MASK) + 1; + len = ((size - 1) & ~PAGE_MASK) + 1; - ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page, - &fsdata, di_bh, page); + ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP, + &locked_page, &fsdata, di_bh, page); if (ret) { if (ret != -ENOSPC) mlog_errno(ret); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 6b3e87189..a8f1225e6 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -259,7 +259,6 @@ static int ocfs2_mknod(struct inode *dir, struct ocfs2_dir_lookup_result lookup = { NULL, }; sigset_t oldset; int did_block_signals = 0; - struct posix_acl *default_acl = NULL, *acl = NULL; struct ocfs2_dentry_lock *dl = NULL; trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name, @@ -367,12 +366,6 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } - status = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); - if (status) { - mlog_errno(status); - goto leave; - } - handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb, S_ISDIR(mode), xattr_credits)); @@ -421,16 +414,8 @@ static int ocfs2_mknod(struct inode *dir, inc_nlink(dir); } - if (default_acl) { - status = ocfs2_set_acl(handle, inode, new_fe_bh, - ACL_TYPE_DEFAULT, default_acl, - meta_ac, data_ac); - } - if (!status && acl) { - status = ocfs2_set_acl(handle, inode, new_fe_bh, - ACL_TYPE_ACCESS, acl, - meta_ac, data_ac); - } + status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh, + meta_ac, data_ac); if (status < 0) { mlog_errno(status); @@ -472,10 +457,6 @@ static int ocfs2_mknod(struct inode *dir, d_instantiate(dentry, inode); status = 0; leave: - if (default_acl) - posix_acl_release(default_acl); - if (acl) - posix_acl_release(acl); if (status < 0 && did_quota_inode) dquot_free_inode(inode); if (handle) diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 7a0126267..e63af7ddf 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -464,6 +464,14 @@ struct ocfs2_super struct ocfs2_refcount_tree *osb_ref_tree_lru; struct mutex system_file_mutex; + + /* + * OCFS2 needs to schedule several different types of work which + * require cluster locking, disk I/O, recovery waits, etc. Since these + * types of work tend to be heavy we avoid using the kernel events + * workqueue and schedule on our own. + */ + struct workqueue_struct *ocfs2_wq; }; #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) @@ -814,10 +822,10 @@ static inline unsigned int ocfs2_page_index_to_clusters(struct super_block *sb, u32 clusters = pg_index; unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; - if (unlikely(PAGE_CACHE_SHIFT > cbits)) - clusters = pg_index << (PAGE_CACHE_SHIFT - cbits); - else if (PAGE_CACHE_SHIFT < cbits) - clusters = pg_index >> (cbits - PAGE_CACHE_SHIFT); + if (unlikely(PAGE_SHIFT > cbits)) + clusters = pg_index << (PAGE_SHIFT - cbits); + else if (PAGE_SHIFT < cbits) + clusters = pg_index >> (cbits - PAGE_SHIFT); return clusters; } @@ -831,10 +839,10 @@ static inline pgoff_t ocfs2_align_clusters_to_page_index(struct super_block *sb, unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; pgoff_t index = clusters; - if (PAGE_CACHE_SHIFT > cbits) { - index = (pgoff_t)clusters >> (PAGE_CACHE_SHIFT - cbits); - } else if (PAGE_CACHE_SHIFT < cbits) { - index = (pgoff_t)clusters << (cbits - PAGE_CACHE_SHIFT); + if (PAGE_SHIFT > cbits) { + index = (pgoff_t)clusters >> (PAGE_SHIFT - cbits); + } else if (PAGE_SHIFT < cbits) { + index = (pgoff_t)clusters << (cbits - PAGE_SHIFT); } return index; @@ -845,8 +853,8 @@ static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb) unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; unsigned int pages_per_cluster = 1; - if (PAGE_CACHE_SHIFT < cbits) - pages_per_cluster = 1 << (cbits - PAGE_CACHE_SHIFT); + if (PAGE_SHIFT < cbits) + pages_per_cluster = 1 << (cbits - PAGE_SHIFT); return pages_per_cluster; } diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index 6cb019b7c..f8f5fc5e6 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -1450,28 +1450,20 @@ DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range); TRACE_EVENT(ocfs2_prepare_inode_for_write, TP_PROTO(unsigned long long ino, unsigned long long saved_pos, - int appending, unsigned long count, - int *direct_io, int *has_refcount), - TP_ARGS(ino, saved_pos, appending, count, direct_io, has_refcount), + unsigned long count), + TP_ARGS(ino, saved_pos, count), TP_STRUCT__entry( __field(unsigned long long, ino) __field(unsigned long long, saved_pos) - __field(int, appending) __field(unsigned long, count) - __field(int, direct_io) - __field(int, has_refcount) ), TP_fast_assign( __entry->ino = ino; __entry->saved_pos = saved_pos; - __entry->appending = appending; __entry->count = count; - __entry->direct_io = direct_io ? *direct_io : -1; - __entry->has_refcount = has_refcount ? *has_refcount : -1; ), - TP_printk("%llu %llu %d %lu %d %d", __entry->ino, - __entry->saved_pos, __entry->appending, __entry->count, - __entry->direct_io, __entry->has_refcount) + TP_printk("%llu %llu %lu", __entry->ino, + __entry->saved_pos, __entry->count) ); DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret); @@ -1540,6 +1532,8 @@ DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_read_locked_inode); DEFINE_OCFS2_INT_INT_EVENT(ocfs2_check_orphan_recovery_state); DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_inode_block); +DEFINE_OCFS2_ULL_EVENT(ocfs2_filecheck_validate_inode_block); +DEFINE_OCFS2_ULL_EVENT(ocfs2_filecheck_repair_inode_block); TRACE_EVENT(ocfs2_inode_is_valid_to_delete, TP_PROTO(void *task, void *dc_task, unsigned long long ino, @@ -2035,6 +2029,8 @@ DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_release_dquot); DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_acquire_dquot); +DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_get_next_id); + DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_mark_dquot_dirty); /* End of trace events for fs/ocfs2/quota_global.c. */ diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 9c9dd30bc..ab6a6cdcf 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -726,7 +726,7 @@ static int ocfs2_release_dquot(struct dquot *dquot) dqgrab(dquot); /* First entry on list -> queue work */ if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list)) - queue_work(ocfs2_wq, &osb->dquot_drop_work); + queue_work(osb->ocfs2_wq, &osb->dquot_drop_work); goto out; } status = ocfs2_lock_global_qf(oinfo, 1); @@ -860,6 +860,37 @@ out: return status; } +static int ocfs2_get_next_id(struct super_block *sb, struct kqid *qid) +{ + int type = qid->type; + struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv; + int status = 0; + + trace_ocfs2_get_next_id(from_kqid(&init_user_ns, *qid), type); + if (!sb_has_quota_loaded(sb, type)) { + status = -ESRCH; + goto out; + } + status = ocfs2_lock_global_qf(info, 0); + if (status < 0) + goto out; + status = ocfs2_qinfo_lock(info, 0); + if (status < 0) + goto out_global; + status = qtree_get_next_id(&info->dqi_gi, qid); + ocfs2_qinfo_unlock(info, 0); +out_global: + ocfs2_unlock_global_qf(info, 0); +out: + /* + * Avoid logging ENOENT since it just means there isn't next ID and + * ESRCH which means quota isn't enabled for the filesystem. + */ + if (status && status != -ENOENT && status != -ESRCH) + mlog_errno(status); + return status; +} + static int ocfs2_mark_dquot_dirty(struct dquot *dquot) { unsigned long mask = (1 << (DQ_LASTSET_B + QIF_ILIMITS_B)) | @@ -968,4 +999,5 @@ const struct dquot_operations ocfs2_quota_operations = { .write_info = ocfs2_write_info, .alloc_dquot = ocfs2_alloc_dquot, .destroy_dquot = ocfs2_destroy_dquot, + .get_next_id = ocfs2_get_next_id, }; diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 3eff031aa..92bbe93bf 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2937,16 +2937,16 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, end = i_size_read(inode); while (offset < end) { - page_index = offset >> PAGE_CACHE_SHIFT; - map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT; + page_index = offset >> PAGE_SHIFT; + map_end = ((loff_t)page_index + 1) << PAGE_SHIFT; if (map_end > end) map_end = end; /* from, to is the offset within the page. */ - from = offset & (PAGE_CACHE_SIZE - 1); - to = PAGE_CACHE_SIZE; - if (map_end & (PAGE_CACHE_SIZE - 1)) - to = map_end & (PAGE_CACHE_SIZE - 1); + from = offset & (PAGE_SIZE - 1); + to = PAGE_SIZE; + if (map_end & (PAGE_SIZE - 1)) + to = map_end & (PAGE_SIZE - 1); page = find_or_create_page(mapping, page_index, GFP_NOFS); if (!page) { @@ -2956,10 +2956,10 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, } /* - * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page + * In case PAGE_SIZE <= CLUSTER_SIZE, This page * can't be dirtied before we CoW it out. */ - if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) + if (PAGE_SIZE <= OCFS2_SB(sb)->s_clustersize) BUG_ON(PageDirty(page)); if (!PageUptodate(page)) { @@ -2987,7 +2987,7 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, mark_page_accessed(page); unlock: unlock_page(page); - page_cache_release(page); + put_page(page); page = NULL; offset = map_end; if (ret) @@ -3165,8 +3165,8 @@ int ocfs2_cow_sync_writeback(struct super_block *sb, } while (offset < end) { - page_index = offset >> PAGE_CACHE_SHIFT; - map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT; + page_index = offset >> PAGE_SHIFT; + map_end = ((loff_t)page_index + 1) << PAGE_SHIFT; if (map_end > end) map_end = end; @@ -3182,7 +3182,7 @@ int ocfs2_cow_sync_writeback(struct super_block *sb, mark_page_accessed(page); unlock_page(page); - page_cache_release(page); + put_page(page); page = NULL; offset = map_end; if (ret) @@ -4248,20 +4248,12 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, struct inode *inode = d_inode(old_dentry); struct buffer_head *old_bh = NULL; struct inode *new_orphan_inode = NULL; - struct posix_acl *default_acl, *acl; - umode_t mode; if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) return -EOPNOTSUPP; - mode = inode->i_mode; - error = posix_acl_create(dir, &mode, &default_acl, &acl); - if (error) { - mlog_errno(error); - return error; - } - error = ocfs2_create_inode_in_orphan(dir, mode, + error = ocfs2_create_inode_in_orphan(dir, inode->i_mode, &new_orphan_inode); if (error) { mlog_errno(error); @@ -4300,16 +4292,11 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, /* If the security isn't preserved, we need to re-initialize them. */ if (!preserve) { error = ocfs2_init_security_and_acl(dir, new_orphan_inode, - &new_dentry->d_name, - default_acl, acl); + &new_dentry->d_name); if (error) mlog_errno(error); } out: - if (default_acl) - posix_acl_release(default_acl); - if (acl) - posix_acl_release(acl); if (!error) { error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode, new_dentry); diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index 576b9a048..18451e0fa 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -196,7 +196,7 @@ static int update_backups(struct inode * inode, u32 clusters, char *data) for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) { blkno = ocfs2_backup_super_blkno(inode->i_sb, i); cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno); - if (cluster > clusters) + if (cluster >= clusters) break; ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup); diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 5d965e83b..13219ed73 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -629,7 +629,8 @@ static struct attribute_group ocfs2_attr_group = { .attrs = ocfs2_attrs, }; -static struct kset *ocfs2_kset; +struct kset *ocfs2_kset; +EXPORT_SYMBOL_GPL(ocfs2_kset); static void ocfs2_sysfs_exit(void) { diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index 66334a30c..f2dce10fa 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -298,4 +298,6 @@ void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_p int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin); void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin); +extern struct kset *ocfs2_kset; + #endif /* STACKGLUE_H */ diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index faa136509..d7cae3327 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -74,17 +74,12 @@ #include "suballoc.h" #include "buffer_head_io.h" +#include "filecheck.h" static struct kmem_cache *ocfs2_inode_cachep; struct kmem_cache *ocfs2_dquot_cachep; struct kmem_cache *ocfs2_qf_chunk_cachep; -/* OCFS2 needs to schedule several different types of work which - * require cluster locking, disk I/O, recovery waits, etc. Since these - * types of work tend to be heavy we avoid using the kernel events - * workqueue and schedule on our own. */ -struct workqueue_struct *ocfs2_wq = NULL; - static struct dentry *ocfs2_debugfs_root; MODULE_AUTHOR("Oracle"); @@ -236,6 +231,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) struct ocfs2_recovery_map *rm = osb->recovery_map; struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan; int i, out = 0; + unsigned long flags; out += snprintf(buf + out, len - out, "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n", @@ -271,14 +267,14 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) cconn->cc_version.pv_minor); } - spin_lock(&osb->dc_task_lock); + spin_lock_irqsave(&osb->dc_task_lock, flags); out += snprintf(buf + out, len - out, "%10s => Pid: %d Count: %lu WakeSeq: %lu " "WorkSeq: %lu\n", "DownCnvt", (osb->dc_task ? task_pid_nr(osb->dc_task) : -1), osb->blocked_lock_count, osb->dc_wake_sequence, osb->dc_work_sequence); - spin_unlock(&osb->dc_task_lock); + spin_unlock_irqrestore(&osb->dc_task_lock, flags); spin_lock(&osb->osb_lock); out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:", @@ -609,8 +605,8 @@ static unsigned long long ocfs2_max_file_offset(unsigned int bbits, /* * We might be limited by page cache size. */ - if (bytes > PAGE_CACHE_SIZE) { - bytes = PAGE_CACHE_SIZE; + if (bytes > PAGE_SIZE) { + bytes = PAGE_SIZE; trim = 1; /* * Shift by 31 here so that we don't get larger than @@ -1204,6 +1200,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) /* Start this when the mount is almost sure of being successful */ ocfs2_orphan_scan_start(osb); + /* Create filecheck sysfile /sys/fs/ocfs2/<devname>/filecheck */ + ocfs2_filecheck_create_sysfs(sb); + return status; read_super_error: @@ -1608,33 +1607,25 @@ static int __init ocfs2_init(void) if (status < 0) goto out2; - ocfs2_wq = create_singlethread_workqueue("ocfs2_wq"); - if (!ocfs2_wq) { - status = -ENOMEM; - goto out3; - } - ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); if (!ocfs2_debugfs_root) { status = -ENOMEM; mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); - goto out4; + goto out3; } ocfs2_set_locking_protocol(); status = register_quota_format(&ocfs2_quota_format); if (status < 0) - goto out4; + goto out3; status = register_filesystem(&ocfs2_fs_type); if (!status) return 0; unregister_quota_format(&ocfs2_quota_format); -out4: - destroy_workqueue(ocfs2_wq); - debugfs_remove(ocfs2_debugfs_root); out3: + debugfs_remove(ocfs2_debugfs_root); ocfs2_free_mem_caches(); out2: exit_ocfs2_uptodate_cache(); @@ -1645,11 +1636,6 @@ out1: static void __exit ocfs2_exit(void) { - if (ocfs2_wq) { - flush_workqueue(ocfs2_wq); - destroy_workqueue(ocfs2_wq); - } - unregister_quota_format(&ocfs2_quota_format); debugfs_remove(ocfs2_debugfs_root); @@ -1667,6 +1653,7 @@ static void ocfs2_put_super(struct super_block *sb) ocfs2_sync_blockdev(sb); ocfs2_dismount_volume(sb, 0); + ocfs2_filecheck_remove_sysfs(sb); } static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -1739,8 +1726,8 @@ static void ocfs2_inode_init_once(void *data) spin_lock_init(&oi->ip_lock); ocfs2_extent_map_init(&oi->vfs_inode); INIT_LIST_HEAD(&oi->ip_io_markers); + INIT_LIST_HEAD(&oi->ip_unwritten_list); oi->ip_dir_start_lookup = 0; - mutex_init(&oi->ip_unaligned_aio); init_rwsem(&oi->ip_alloc_sem); init_rwsem(&oi->ip_xattr_sem); mutex_init(&oi->ip_io_mutex); @@ -2343,6 +2330,12 @@ static int ocfs2_initialize_super(struct super_block *sb, } cleancache_init_shared_fs(sb); + osb->ocfs2_wq = create_singlethread_workqueue("ocfs2_wq"); + if (!osb->ocfs2_wq) { + status = -ENOMEM; + mlog_errno(status); + } + bail: return status; } @@ -2530,6 +2523,12 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) { /* This function assumes that the caller has the main osb resource */ + /* ocfs2_initializer_super have already created this workqueue */ + if (osb->ocfs2_wq) { + flush_workqueue(osb->ocfs2_wq); + destroy_workqueue(osb->ocfs2_wq); + } + ocfs2_free_slot_info(osb); kfree(osb->osb_orphan_wipes); diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h index b477d0b1c..b023e4f3d 100644 --- a/fs/ocfs2/super.h +++ b/fs/ocfs2/super.h @@ -26,8 +26,6 @@ #ifndef OCFS2_SUPER_H #define OCFS2_SUPER_H -extern struct workqueue_struct *ocfs2_wq; - int ocfs2_publish_get_mount_state(struct ocfs2_super *osb, int node_num); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 7d3d979f5..f19b7381a 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7216,12 +7216,10 @@ out: */ int ocfs2_init_security_and_acl(struct inode *dir, struct inode *inode, - const struct qstr *qstr, - struct posix_acl *default_acl, - struct posix_acl *acl) + const struct qstr *qstr) { - struct buffer_head *dir_bh = NULL; int ret = 0; + struct buffer_head *dir_bh = NULL; ret = ocfs2_init_security_get(inode, dir, qstr, NULL); if (ret) { @@ -7234,11 +7232,9 @@ int ocfs2_init_security_and_acl(struct inode *dir, mlog_errno(ret); goto leave; } - - if (!ret && default_acl) - ret = ocfs2_iop_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); - if (!ret && acl) - ret = ocfs2_iop_set_acl(inode, acl, ACL_TYPE_ACCESS); + ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL); + if (ret) + mlog_errno(ret); ocfs2_inode_unlock(dir, 0); brelse(dir_bh); diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index f10d5b93c..1633cc15e 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -94,7 +94,5 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, bool preserve_security); int ocfs2_init_security_and_acl(struct inode *dir, struct inode *inode, - const struct qstr *qstr, - struct posix_acl *default_acl, - struct posix_acl *acl); + const struct qstr *qstr); #endif /* OCFS2_XATTR_H */ |