diff options
70 files changed, 1344 insertions, 696 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index 8a170546a..69604e41c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12964,11 +12964,10 @@ F: arch/x86/xen/*swiotlb* F: drivers/xen/*swiotlb* XFS FILESYSTEM -P: Silicon Graphics Inc M: Dave Chinner <david@fromorbit.com> -M: xfs@oss.sgi.com -L: xfs@oss.sgi.com -W: http://oss.sgi.com/projects/xfs +M: linux-xfs@vger.kernel.org +L: linux-xfs@vger.kernel.org +W: http://xfs.org/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs.git S: Supported F: Documentation/filesystems/xfs.txt @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 8 -SUBLEVEL = 3 +SUBLEVEL = 4 EXTRAVERSION = -gnu NAME = Psychotic Stoned Sheep @@ -612,6 +612,12 @@ endif # $(dot-config) # Defaults to vmlinux, but the arch makefile usually adds further targets all: vmlinux +# force no-pie for distro compilers that enable pie by default +KBUILD_CFLAGS += $(call cc-option, -fno-pie) +KBUILD_CFLAGS += $(call cc-option, -no-pie) +KBUILD_AFLAGS += $(call cc-option, -fno-pie) +KBUILD_CPPFLAGS += $(call cc-option, -fno-pie) + # The arch Makefile can set ARCH_{CPP,A,C}FLAGS to override the default # values of the respective KBUILD_* variables ARCH_CPPFLAGS := diff --git a/arch/arc/include/asm/irqflags-arcv2.h b/arch/arc/include/asm/irqflags-arcv2.h index d1ec7f6b3..e880dfa3f 100644 --- a/arch/arc/include/asm/irqflags-arcv2.h +++ b/arch/arc/include/asm/irqflags-arcv2.h @@ -112,7 +112,7 @@ static inline long arch_local_save_flags(void) */ temp = (1 << 5) | ((!!(temp & STATUS_IE_MASK)) << CLRI_STATUS_IE_BIT) | - (temp & CLRI_STATUS_E_MASK); + ((temp >> 1) & CLRI_STATUS_E_MASK); return temp; } diff --git a/arch/arc/kernel/intc-arcv2.c b/arch/arc/kernel/intc-arcv2.c index 6c24faf48..62b59409a 100644 --- a/arch/arc/kernel/intc-arcv2.c +++ b/arch/arc/kernel/intc-arcv2.c @@ -74,7 +74,7 @@ void arc_init_IRQ(void) tmp = read_aux_reg(0xa); tmp |= STATUS_AD_MASK | (irq_prio << 1); tmp &= ~STATUS_IE_MASK; - asm volatile("flag %0 \n"::"r"(tmp)); + asm volatile("kflag %0 \n"::"r"(tmp)); } static void arcv2_irq_mask(struct irq_data *data) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 9190a5554..eef6ff49c 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -83,7 +83,7 @@ static const int bfq_back_max = 16 * 1024; static const int bfq_back_penalty = 2; /* Idling period duration, in ns. */ -static u64 bfq_slice_idle = NSEC_PER_SEC / 125; +static u32 bfq_slice_idle = NSEC_PER_SEC / 125; /* Minimum number of assigned budgets for which stats are safe to compute. */ static const int bfq_stats_min_budgets = 194; @@ -103,7 +103,7 @@ static const int bfq_timeout = HZ / 8; struct kmem_cache *bfq_pool; -/* Below this threshold (in ms), we consider thinktime immediate. */ +/* Below this threshold (in ns), we consider thinktime immediate. */ #define BFQ_MIN_TT (2 * NSEC_PER_MSEC) /* hw_tag detection: parallel requests threshold and min samples needed. */ @@ -114,8 +114,12 @@ struct kmem_cache *bfq_pool; #define BFQQ_CLOSE_THR (sector_t)(8 * 1024) #define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) -/* Min samples used for peak rate estimation (for autotuning). */ -#define BFQ_PEAK_RATE_SAMPLES 32 +/* Min number of samples required to perform peak-rate update */ +#define BFQ_RATE_MIN_SAMPLES 32 +/* Min observation time interval required to perform a peak-rate update (ns) */ +#define BFQ_RATE_MIN_INTERVAL 300*NSEC_PER_MSEC +/* Target observation time interval for a peak-rate update (ns) */ +#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC /* Shift used for peak rate fixed precision calculations. */ #define BFQ_RATE_SHIFT 16 @@ -634,6 +638,23 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) bfq_mark_bfqq_IO_bound(bfqq); else bfq_clear_bfqq_IO_bound(bfqq); + + bfqq->wr_coeff = bic->saved_wr_coeff; + bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; + BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt)); + bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; + BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); + + if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || + time_is_before_jiffies(bfqq->last_wr_start_finish + + bfqq->wr_cur_max_time))) { + bfq_log_bfqq(bfqq->bfqd, bfqq, + "resume state: switching off wr"); + + bfqq->wr_coeff = 1; + } + /* make sure weight will be updated, however we got here */ + bfqq->entity.prio_changed = 1; } static int bfqq_process_refs(struct bfq_queue *bfqq) @@ -1052,6 +1073,7 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, * operation, is reset only when bfqq is selected for * service (see bfq_get_next_queue). */ + BUG_ON(bfqq->max_budget < 0); entity->budget = min_t(unsigned long, bfq_bfqq_budget_left(bfqq), bfqq->max_budget); @@ -1060,6 +1082,7 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, return true; } + BUG_ON(bfqq->max_budget < 0); entity->budget = max_t(unsigned long, bfqq->max_budget, bfq_serv_to_charge(bfqq->next_rq, bfqq)); BUG_ON(entity->budget < 0); @@ -1082,6 +1105,7 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, bfqq->wr_coeff = bfqd->bfq_wr_coeff; bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); } else { + bfqq->wr_start_at_switch_to_srt = jiffies; bfqq->wr_coeff = bfqd->bfq_wr_coeff * BFQ_SOFTRT_WEIGHT_FACTOR; bfqq->wr_cur_max_time = @@ -1115,32 +1139,13 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, jiffies, jiffies_to_msecs(bfqq-> wr_cur_max_time)); - } else if (time_before( - bfqq->last_wr_start_finish + - bfqq->wr_cur_max_time, - jiffies + - bfqd->bfq_wr_rt_max_time) && - soft_rt) { + } else if (soft_rt) { /* - * The remaining weight-raising time is lower - * than bfqd->bfq_wr_rt_max_time, which means - * that the application is enjoying weight - * raising either because deemed soft-rt in - * the near past, or because deemed interactive - * a long ago. - * In both cases, resetting now the current - * remaining weight-raising time for the - * application to the weight-raising duration - * for soft rt applications would not cause any - * latency increase for the application (as the - * new duration would be higher than the - * remaining time). - * - * In addition, the application is now meeting - * the requirements for being deemed soft rt. - * In the end we can correctly and safely - * (re)charge the weight-raising duration for - * the application with the weight-raising + * The application is now or still meeting the + * requirements for being deemed soft rt. We + * can then correctly and safely (re)charge + * the weight-raising duration for the + * application with the weight-raising * duration for soft rt applications. * * In particular, doing this recharge now, i.e., @@ -1164,14 +1169,22 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, * latency because the application is not * weight-raised while they are pending. */ + if (bfqq->wr_cur_max_time != + bfqd->bfq_wr_rt_max_time) { + bfqq->wr_start_at_switch_to_srt = + bfqq->last_wr_start_finish; + BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); + + bfqq->wr_cur_max_time = + bfqd->bfq_wr_rt_max_time; + bfqq->wr_coeff = bfqd->bfq_wr_coeff * + BFQ_SOFTRT_WEIGHT_FACTOR; + bfq_log_bfqq(bfqd, bfqq, + "switching to soft_rt wr"); + } else + bfq_log_bfqq(bfqd, bfqq, + "moving forward soft_rt wr duration"); bfqq->last_wr_start_finish = jiffies; - bfqq->wr_cur_max_time = - bfqd->bfq_wr_rt_max_time; - bfqq->wr_coeff = bfqd->bfq_wr_coeff * - BFQ_SOFTRT_WEIGHT_FACTOR; - bfq_log_bfqq(bfqd, bfqq, - "switching to soft_rt wr, or " - " just moving forward duration"); } } } @@ -1450,14 +1463,24 @@ static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, return NULL; } +static sector_t get_sdist(sector_t last_pos, struct request *rq) +{ + sector_t sdist = 0; + + if (last_pos) { + if (last_pos < blk_rq_pos(rq)) + sdist = blk_rq_pos(rq) - last_pos; + else + sdist = last_pos - blk_rq_pos(rq); + } + + return sdist; +} + static void bfq_activate_request(struct request_queue *q, struct request *rq) { struct bfq_data *bfqd = q->elevator->elevator_data; - bfqd->rq_in_driver++; - bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); - bfq_log(bfqd, "activate_request: new bfqd->last_position %llu", - (unsigned long long) bfqd->last_position); } static void bfq_deactivate_request(struct request_queue *q, struct request *rq) @@ -1622,11 +1645,16 @@ static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) bfqq->bfqd->wr_busy_queues--; bfqq->wr_coeff = 1; bfqq->wr_cur_max_time = 0; + bfqq->last_wr_start_finish = jiffies; /* * Trigger a weight change on the next invocation of * __bfq_entity_update_weight_prio. */ bfqq->entity.prio_changed = 1; + bfq_log_bfqq(bfqq->bfqd, bfqq, + "end_wr: wrais ending at %lu, rais_max_time %u", + bfqq->last_wr_start_finish, + jiffies_to_msecs(bfqq->wr_cur_max_time)); bfq_log_bfqq(bfqq->bfqd, bfqq, "end_wr: wr_busy %d", bfqq->bfqd->wr_busy_queues); } @@ -1934,18 +1962,24 @@ check_scheduled: static void bfq_bfqq_save_state(struct bfq_queue *bfqq) { + struct bfq_io_cq *bic = bfqq->bic; + /* * If !bfqq->bic, the queue is already shared or its requests * have already been redirected to a shared queue; both idle window * and weight raising state have already been saved. Do nothing. */ - if (!bfqq->bic) + if (!bic) return; - bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); - bfqq->bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); - bfqq->bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); - bfqq->bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); + bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); + bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); + bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); + bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); + bic->saved_wr_coeff = bfqq->wr_coeff; + bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; + bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; + BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); } static void bfq_get_bic_reference(struct bfq_queue *bfqq) @@ -1984,6 +2018,7 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, new_bfqq->wr_coeff = bfqq->wr_coeff; new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time; new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish; + new_bfqq->wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; if (bfq_bfqq_busy(new_bfqq)) bfqd->wr_busy_queues++; new_bfqq->entity.prio_changed = 1; @@ -2116,9 +2151,10 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd, BUG_ON(bfqq == bfqd->in_service_queue); BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); - if (bfqq->wr_coeff > 1 && + if (time_is_before_jiffies(bfqq->last_wr_start_finish) && + bfqq->wr_coeff > 1 && bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && - time_is_before_jiffies(bfqq->budget_timeout)) { + time_is_before_jiffies(bfqq->budget_timeout)) { /* * For soft real-time queues, move the start * of the weight-raising period forward by the @@ -2144,7 +2180,20 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd, * request. */ bfqq->last_wr_start_finish += jiffies - - bfqq->budget_timeout; + max_t(unsigned long, bfqq->last_wr_start_finish, + bfqq->budget_timeout); + if (time_is_after_jiffies(bfqq->last_wr_start_finish)) { + pr_crit( + "BFQ WARNING:last %lu budget %lu jiffies %lu", + bfqq->last_wr_start_finish, + bfqq->budget_timeout, + jiffies); + pr_crit("diff %lu", jiffies - + max_t(unsigned long, + bfqq->last_wr_start_finish, + bfqq->budget_timeout)); + bfqq->last_wr_start_finish = jiffies; + } } bfq_set_budget_timeout(bfqd, bfqq); @@ -2172,7 +2221,7 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd) { struct bfq_queue *bfqq = bfqd->in_service_queue; struct bfq_io_cq *bic; - unsigned long sl; + u32 sl; BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); @@ -2206,15 +2255,326 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd) */ if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && bfq_symmetric_scenario(bfqd)) - sl = min_t(u64, sl, BFQ_MIN_TT); + sl = min_t(u32, sl, BFQ_MIN_TT); bfqd->last_idling_start = ktime_get(); hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), HRTIMER_MODE_REL); bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); - bfq_log(bfqd, "arm idle: %llu/%llu ms", - div_u64(sl, NSEC_PER_MSEC), - div_u64(bfqd->bfq_slice_idle, NSEC_PER_MSEC)); + bfq_log(bfqd, "arm idle: %ld/%ld ms", + sl / NSEC_PER_MSEC, bfqd->bfq_slice_idle / NSEC_PER_MSEC); +} + +/* + * In autotuning mode, max_budget is dynamically recomputed as the + * amount of sectors transferred in timeout at the estimated peak + * rate. This enables BFQ to utilize a full timeslice with a full + * budget, even if the in-service queue is served at peak rate. And + * this maximises throughput with sequential workloads. + */ +static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd) +{ + return (u64)bfqd->peak_rate * USEC_PER_MSEC * + jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT; +} + +/* + * Update parameters related to throughput and responsiveness, as a + * function of the estimated peak rate. See comments on + * bfq_calc_max_budget(), and on T_slow and T_fast arrays. + */ +void update_thr_responsiveness_params(struct bfq_data *bfqd) +{ + int dev_type = blk_queue_nonrot(bfqd->queue); + + if (bfqd->bfq_user_max_budget == 0) { + bfqd->bfq_max_budget = + bfq_calc_max_budget(bfqd); + BUG_ON(bfqd->bfq_max_budget < 0); + bfq_log(bfqd, "new max_budget = %d", + bfqd->bfq_max_budget); + } + + if (bfqd->device_speed == BFQ_BFQD_FAST && + bfqd->peak_rate < device_speed_thresh[dev_type]) { + bfqd->device_speed = BFQ_BFQD_SLOW; + bfqd->RT_prod = R_slow[dev_type] * + T_slow[dev_type]; + } else if (bfqd->device_speed == BFQ_BFQD_SLOW && + bfqd->peak_rate > device_speed_thresh[dev_type]) { + bfqd->device_speed = BFQ_BFQD_FAST; + bfqd->RT_prod = R_fast[dev_type] * + T_fast[dev_type]; + } + + bfq_log(bfqd, +"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu setcs/sec", + dev_type == 0 ? "ROT" : "NONROT", + bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW", + bfqd->device_speed == BFQ_BFQD_FAST ? + (USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT : + (USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT, + (USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>> + BFQ_RATE_SHIFT); +} + +void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq) +{ + if (rq != NULL) { /* new rq dispatch now, reset accordingly */ + bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns() ; + bfqd->peak_rate_samples = 1; + bfqd->sequential_samples = 0; + bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size = + blk_rq_sectors(rq); + } else /* no new rq dispatched, just reset the number of samples */ + bfqd->peak_rate_samples = 0; /* full re-init on next disp. */ + + bfq_log(bfqd, + "reset_rate_computation at end, sample %u/%u tot_sects %llu", + bfqd->peak_rate_samples, bfqd->sequential_samples, + bfqd->tot_sectors_dispatched); +} + +void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) +{ + u32 rate, weight, divisor; + + /* + * For the convergence property to hold (see comments on + * bfq_update_peak_rate()) and for the assessment to be + * reliable, a minimum number of samples must be present, and + * a minimum amount of time must have elapsed. If not so, do + * not compute new rate. Just reset parameters, to get ready + * for a new evaluation attempt. + */ + if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES || + bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) { + bfq_log(bfqd, + "update_rate_reset: only resetting, delta_first %lluus samples %d", + bfqd->delta_from_first>>10, bfqd->peak_rate_samples); + goto reset_computation; + } + + /* + * If a new request completion has occurred after last + * dispatch, then, to approximate the rate at which requests + * have been served by the device, it is more precise to + * extend the observation interval to the last completion. + */ + bfqd->delta_from_first = + max_t(u64, bfqd->delta_from_first, + bfqd->last_completion - bfqd->first_dispatch); + + BUG_ON(bfqd->delta_from_first == 0); + /* + * Rate computed in sects/usec, and not sects/nsec, for + * precision issues. + */ + rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT, + div_u64(bfqd->delta_from_first, NSEC_PER_USEC)); + + bfq_log(bfqd, +"update_rate_reset: tot_sects %llu delta_first %lluus rate %llu sects/s (%d)", + bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10, + ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), + rate > 20<<BFQ_RATE_SHIFT); + + /* + * Peak rate not updated if: + * - the percentage of sequential dispatches is below 3/4 of the + * total, and rate is below the current estimated peak rate + * - rate is unreasonably high (> 20M sectors/sec) + */ + if ((bfqd->peak_rate_samples > (3 * bfqd->sequential_samples)>>2 && + rate <= bfqd->peak_rate) || + rate > 20<<BFQ_RATE_SHIFT) { + bfq_log(bfqd, + "update_rate_reset: goto reset, samples %u/%u rate/peak %llu/%llu", + bfqd->peak_rate_samples, bfqd->sequential_samples, + ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), + ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); + goto reset_computation; + } else { + bfq_log(bfqd, + "update_rate_reset: do update, samples %u/%u rate/peak %llu/%llu", + bfqd->peak_rate_samples, bfqd->sequential_samples, + ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), + ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); + } + + /* + * We have to update the peak rate, at last! To this purpose, + * we use a low-pass filter. We compute the smoothing constant + * of the filter as a function of the 'weight' of the new + * measured rate. + * + * As can be seen in next formulas, we define this weight as a + * quantity proportional to how sequential the workload is, + * and to how long the observation time interval is. + * + * The weight runs from 0 to 8. The maximum value of the + * weight, 8, yields the minimum value for the smoothing + * constant. At this minimum value for the smoothing constant, + * the measured rate contributes for half of the next value of + * the estimated peak rate. + * + * So, the first step is to compute the weight as a function + * of how sequential the workload is. Note that the weight + * cannot reach 9, because bfqd->sequential_samples cannot + * become equal to bfqd->peak_rate_samples, which, in its + * turn, holds true because bfqd->sequential_samples is not + * incremented for the first sample. + */ + weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples; + + /* + * Second step: further refine the weight as a function of the + * duration of the observation interval. + */ + weight = min_t(u32, 8, + div_u64(weight * bfqd->delta_from_first, + BFQ_RATE_REF_INTERVAL)); + + /* + * Divisor ranging from 10, for minimum weight, to 2, for + * maximum weight. + */ + divisor = 10 - weight; + BUG_ON(divisor == 0); + + /* + * Finally, update peak rate: + * + * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor + */ + bfqd->peak_rate *= divisor-1; + bfqd->peak_rate /= divisor; + rate /= divisor; /* smoothing constant alpha = 1/divisor */ + + bfq_log(bfqd, + "update_rate_reset: divisor %d tmp_peak_rate %llu tmp_rate %u", + divisor, + ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT), + (u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT)); + + BUG_ON(bfqd->peak_rate == 0); + BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); + + bfqd->peak_rate += rate; + update_thr_responsiveness_params(bfqd); + BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); + +reset_computation: + bfq_reset_rate_computation(bfqd, rq); +} + +/* + * Update the read/write peak rate (the main quantity used for + * auto-tuning, see update_thr_responsiveness_params()). + * + * It is not trivial to estimate the peak rate (correctly): because of + * the presence of sw and hw queues between the scheduler and the + * device components that finally serve I/O requests, it is hard to + * say exactly when a given dispatched request is served inside the + * device, and for how long. As a consequence, it is hard to know + * precisely at what rate a given set of requests is actually served + * by the device. + * + * On the opposite end, the dispatch time of any request is trivially + * available, and, from this piece of information, the "dispatch rate" + * of requests can be immediately computed. So, the idea in the next + * function is to use what is known, namely request dispatch times + * (plus, when useful, request completion times), to estimate what is + * unknown, namely in-device request service rate. + * + * The main issue is that, because of the above facts, the rate at + * which a certain set of requests is dispatched over a certain time + * interval can vary greatly with respect to the rate at which the + * same requests are then served. But, since the size of any + * intermediate queue is limited, and the service scheme is lossless + * (no request is silently dropped), the following obvious convergence + * property holds: the number of requests dispatched MUST become + * closer and closer to the number of requests completed as the + * observation interval grows. This is the key property used in + * the next function to estimate the peak service rate as a function + * of the observed dispatch rate. The function assumes to be invoked + * on every request dispatch. + */ +void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) +{ + u64 now_ns = ktime_get_ns(); + + if (bfqd->peak_rate_samples == 0) { /* first dispatch */ + bfq_log(bfqd, + "update_peak_rate: goto reset, samples %d", + bfqd->peak_rate_samples) ; + bfq_reset_rate_computation(bfqd, rq); + goto update_last_values; /* will add one sample */ + } + + /* + * Device idle for very long: the observation interval lasting + * up to this dispatch cannot be a valid observation interval + * for computing a new peak rate (similarly to the late- + * completion event in bfq_completed_request()). Go to + * update_rate_and_reset to have the following three steps + * taken: + * - close the observation interval at the last (previous) + * request dispatch or completion + * - compute rate, if possible, for that observation interval + * - start a new observation interval with this dispatch + */ + if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && + bfqd->rq_in_driver == 0) { + bfq_log(bfqd, +"update_peak_rate: jumping to updating&resetting delta_last %lluus samples %d", + (now_ns - bfqd->last_dispatch)>>10, + bfqd->peak_rate_samples) ; + goto update_rate_and_reset; + } + + /* Update sampling information */ + bfqd->peak_rate_samples++; + + if ((bfqd->rq_in_driver > 0 || + now_ns - bfqd->last_completion < BFQ_MIN_TT) + && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR) + bfqd->sequential_samples++; + + bfqd->tot_sectors_dispatched += blk_rq_sectors(rq); + + /* Reset max observed rq size every 32 dispatches */ + if (likely(bfqd->peak_rate_samples % 32)) + bfqd->last_rq_max_size = + max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size); + else + bfqd->last_rq_max_size = blk_rq_sectors(rq); + + bfqd->delta_from_first = now_ns - bfqd->first_dispatch; + + bfq_log(bfqd, + "update_peak_rate: added samples %u/%u tot_sects %llu delta_first %lluus", + bfqd->peak_rate_samples, bfqd->sequential_samples, + bfqd->tot_sectors_dispatched, + bfqd->delta_from_first>>10); + + /* Target observation interval not yet reached, go on sampling */ + if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL) + goto update_last_values; + +update_rate_and_reset: + bfq_update_rate_reset(bfqd, rq); +update_last_values: + bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); + bfqd->last_dispatch = now_ns; + + bfq_log(bfqd, + "update_peak_rate: delta_first %lluus last_pos %llu peak_rate %llu", + (now_ns - bfqd->first_dispatch)>>10, + (unsigned long long) bfqd->last_position, + ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); + bfq_log(bfqd, + "update_peak_rate: samples at end %d", bfqd->peak_rate_samples); } /* @@ -2235,6 +2595,7 @@ static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) * incrementing bfqq->dispatched. */ bfqq->dispatched++; + bfq_update_peak_rate(q->elevator->elevator_data, rq); bfq_remove_request(rq); elv_dispatch_sort(q, rq); @@ -2474,27 +2835,15 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, bfqq->entity.budget); } -static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd) -{ - /* - * The max_budget calculated when autotuning is equal to the - * amount of sectors transferred in timeout at the - * estimated peak rate. - */ - return bfqd->peak_rate * 1000 * jiffies_to_msecs(bfqd->bfq_timeout) >> - BFQ_RATE_SHIFT; -} - /* - * Update the read peak rate (quantity used for auto-tuning) as a - * function of the rate at which bfqq has been served, and check - * whether the process associated with bfqq is "slow". Return true if - * the process is slow. The slow flag is used, in addition to the - * budget timeout, to reduce the amount of service provided to seeky - * processes, and hence reduce their chances to lower the - * throughput. More details in the body of the function. + * Return true if the process associated with bfqq is "slow". The slow + * flag is used, in addition to the budget timeout, to reduce the + * amount of service provided to seeky processes, and thus reduce + * their chances to lower the throughput. More details in the comments + * on the function bfq_bfqq_expire(). * - * An important observation is in order: with devices with internal + * An important observation is in order: as discussed in the comments + * on the function bfq_update_peak_rate(), with devices with internal * queues, it is hard if ever possible to know when and for how long * an I/O request is processed by the device (apart from the trivial * I/O pattern where a new request is dispatched only after the @@ -2502,26 +2851,27 @@ static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd) * the real rate at which the I/O requests of each bfq_queue are * served. In fact, for an I/O scheduler like BFQ, serving a * bfq_queue means just dispatching its requests during its service - * slot, i.e., until the budget of the queue is exhausted, or the - * queue remains idle, or, finally, a timeout fires. But, during the - * service slot of a bfq_queue, the device may be still processing - * requests of bfq_queues served in previous service slots. On the - * opposite end, the requests of the in-service bfq_queue may be - * completed after the service slot of the queue finishes. Anyway, - * unless more sophisticated solutions are used (where possible), the - * sum of the sizes of the requests dispatched during the service slot - * of a bfq_queue is probably the only approximation available for - * the service received by the bfq_queue during its service slot. And, - * as written above, this sum is the quantity used in this function to - * evaluate the peak rate. + * slot (i.e., until the budget of the queue is exhausted, or the + * queue remains idle, or, finally, a timeout fires). But, during the + * service slot of a bfq_queue, around 100 ms at most, the device may + * be even still processing requests of bfq_queues served in previous + * service slots. On the opposite end, the requests of the in-service + * bfq_queue may be completed after the service slot of the queue + * finishes. + * + * Anyway, unless more sophisticated solutions are used + * (where possible), the sum of the sizes of the requests dispatched + * during the service slot of a bfq_queue is probably the only + * approximation available for the service received by the bfq_queue + * during its service slot. And this sum is the quantity used in this + * function to evaluate the I/O speed of a process. */ -static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, +static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, bool compensate, enum bfqq_expiration reason, unsigned long *delta_ms) { - u64 bw, bwdiv10, delta_usecs, delta_ms_tmp; ktime_t delta_ktime; - int update = 0; + u32 delta_usecs; bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */ if (!bfq_bfqq_sync(bfqq)) @@ -2534,129 +2884,45 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); delta_usecs = ktime_to_us(delta_ktime); - /* Don't trust short/unrealistic values. */ + /* don't trust short/unrealistic values. */ if (delta_usecs < 1000 || delta_usecs >= LONG_MAX) { if (blk_queue_nonrot(bfqd->queue)) - *delta_ms = BFQ_MIN_TT; /* - * give same worst-case - * guarantees as - * idling for seeky - */ - else /* Charge at least one seek */ - *delta_ms = jiffies_to_msecs(bfq_slice_idle); + /* + * give same worst-case guarantees as idling + * for seeky + */ + *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC; + else /* charge at least one seek */ + *delta_ms = bfq_slice_idle / NSEC_PER_MSEC; + + bfq_log(bfqd, "bfq_bfqq_is_slow: unrealistic %u", delta_usecs); + return slow; } - delta_ms_tmp = delta_usecs; - do_div(delta_ms_tmp, NSEC_PER_MSEC); - *delta_ms = delta_ms_tmp; + *delta_ms = delta_usecs / USEC_PER_MSEC; /* - * Calculate the bandwidth for the last slice. We use a 64 bit - * value to store the peak rate, in sectors per usec in fixed - * point math. We do so to have enough precision in the estimate - * and to avoid overflows. - */ - bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT; - do_div(bw, (unsigned long)delta_usecs); - - bfq_log(bfqd, "measured bw = %llu sects/sec", - (1000000*bw)>>BFQ_RATE_SHIFT); - /* - * Use only long (> 20ms) intervals to filter out spikes for - * the peak rate estimation. + * Use only long (> 20ms) intervals to filter out excessive + * spikes in service rate estimation. */ if (delta_usecs > 20000) { - bool fully_sequential = bfqq->seek_history == 0; - /* - * Soft real-time queues are not good candidates for - * evaluating bw, as they are likely to be slow even - * if sequential. - */ - bool non_soft_rt = bfqq->wr_coeff == 1 || - bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time; - bool consumed_large_budget = - reason == BFQ_BFQQ_BUDGET_EXHAUSTED && - bfqq->entity.budget >= bfqd->bfq_max_budget * 2 / 3; - bool served_for_long_time = - reason == BFQ_BFQQ_BUDGET_TIMEOUT || - consumed_large_budget; - - BUG_ON(bfqq->seek_history == 0 && - hweight32(bfqq->seek_history) != 0); - - if (bw > bfqd->peak_rate || - (bfq_bfqq_sync(bfqq) && fully_sequential && non_soft_rt && - served_for_long_time)) { - /* - * To smooth oscillations use a low-pass filter with - * alpha=9/10, i.e., - * new_rate = (9/10) * old_rate + (1/10) * bw - */ - bwdiv10 = bw; - do_div(bwdiv10, 10); - if (bwdiv10 == 0) - return false; /* bw too low to be used */ - bfqd->peak_rate *= 9; - do_div(bfqd->peak_rate, 10); - bfqd->peak_rate += bwdiv10; - update = 1; - bfq_log(bfqd, "new peak_rate = %llu sects/sec", - (1000000*bfqd->peak_rate)>>BFQ_RATE_SHIFT); - } - - update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1; - - if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES) - bfqd->peak_rate_samples++; - - if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES && - update) { - int dev_type = blk_queue_nonrot(bfqd->queue); - - if (bfqd->bfq_user_max_budget == 0) { - bfqd->bfq_max_budget = - bfq_calc_max_budget(bfqd); - bfq_log(bfqd, "new max_budget = %d", - bfqd->bfq_max_budget); - } - if (bfqd->device_speed == BFQ_BFQD_FAST && - bfqd->peak_rate < device_speed_thresh[dev_type]) { - bfqd->device_speed = BFQ_BFQD_SLOW; - bfqd->RT_prod = R_slow[dev_type] * - T_slow[dev_type]; - } else if (bfqd->device_speed == BFQ_BFQD_SLOW && - bfqd->peak_rate > device_speed_thresh[dev_type]) { - bfqd->device_speed = BFQ_BFQD_FAST; - bfqd->RT_prod = R_fast[dev_type] * - T_fast[dev_type]; - } - bfq_log(bfqd, - "dev_type %d dev_speed_class = %d (%d sects/sec), thresh %d setcs/sec", - dev_type, bfqd->device_speed, - bfqd->device_speed == BFQ_BFQD_FAST ? - (1000000*R_fast[dev_type])>>BFQ_RATE_SHIFT : - (1000000*R_slow[dev_type])>>BFQ_RATE_SHIFT, - (1000000*device_speed_thresh[dev_type])>> - BFQ_RATE_SHIFT); - } /* - * Caveat: processes doing IO in the slower disk zones - * tend to be slow(er) even if not seeky. In this - * respect, the estimated peak rate is likely to be an - * average over the disk surface. Accordingly, to not - * be too harsh with unlucky processes, a process is - * deemed slow only if its bw has been lower than half - * of the estimated peak rate. + * Caveat for rotational devices: processes doing I/O + * in the slower disk zones tend to be slow(er) even + * if not seeky. In this respect, the estimated peak + * rate is likely to be an average over the disk + * surface. Accordingly, to not be too harsh with + * unlucky processes, a process is deemed slow only if + * its rate has been lower than half of the estimated + * peak rate. */ - slow = bw < bfqd->peak_rate / 2; + slow = bfqq->entity.service < bfqd->bfq_max_budget / 2; + bfq_log(bfqd, "bfq_bfqq_is_slow: relative rate %d/%d", + bfqq->entity.service, bfqd->bfq_max_budget); } - bfq_log_bfqq(bfqd, bfqq, - "update_peak_rate: bw %llu sect/s, peak rate %llu, slow %d", - (1000000*bw)>>BFQ_RATE_SHIFT, - (1000000*bfqd->peak_rate)>>BFQ_RATE_SHIFT, - bw < bfqd->peak_rate / 2); + bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow); return slow; } @@ -2785,10 +3051,9 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, BUG_ON(bfqq != bfqd->in_service_queue); /* - * Update device peak rate for autotuning and check whether the - * process is slow (see bfq_update_peak_rate). + * Check whether the process is slow (see bfq_bfqq_is_slow). */ - slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason, &delta); + slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); /* * Increase service_from_backlogged before next statement, @@ -3285,6 +3550,9 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) struct bfq_entity *entity = &bfqq->entity; if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ + BUG_ON(bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && + time_is_after_jiffies(bfqq->last_wr_start_finish)); + bfq_log_bfqq(bfqd, bfqq, "raising period dur %u/%u msec, old coeff %u, w %d(%d)", jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), @@ -3302,15 +3570,26 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) * time has elapsed from the beginning of this * weight-raising period, then end weight raising. */ - if (bfq_bfqq_in_large_burst(bfqq) || - time_is_before_jiffies(bfqq->last_wr_start_finish + - bfqq->wr_cur_max_time)) { - bfqq->last_wr_start_finish = jiffies; - bfq_log_bfqq(bfqd, bfqq, - "wrais ending at %lu, rais_max_time %u", - bfqq->last_wr_start_finish, - jiffies_to_msecs(bfqq->wr_cur_max_time)); + if (bfq_bfqq_in_large_burst(bfqq)) bfq_bfqq_end_wr(bfqq); + else if (time_is_before_jiffies(bfqq->last_wr_start_finish + + bfqq->wr_cur_max_time)) { + if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time || + time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + + bfq_wr_duration(bfqd))) + bfq_bfqq_end_wr(bfqq); + else { + /* switch back to interactive wr */ + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + bfqq->last_wr_start_finish = + bfqq->wr_start_at_switch_to_srt; + BUG_ON(time_is_after_jiffies( + bfqq->last_wr_start_finish)); + bfqq->entity.prio_changed = 1; + bfq_log_bfqq(bfqd, bfqq, + "back to interactive wr"); + } } } /* Update weight both if it must be raised and if it must be lowered */ @@ -3468,6 +3747,21 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) if (unlikely(force)) return bfq_forced_dispatch(bfqd); + /* + * Force device to serve one request at a time if + * strict_guarantees is true. Forcing this service scheme is + * currently the ONLY way to guarantee that the request + * service order enforced by the scheduler is respected by a + * queueing device. Otherwise the device is free even to make + * some unlucky request wait for as long as the device + * wishes. + * + * Of course, serving one request at at time may cause loss of + * throughput. + */ + if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) + return 0; + bfqq = bfq_select_queue(bfqd); if (!bfqq) return 0; @@ -3701,9 +3995,11 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqq->pid = pid; bfqq->wr_coeff = 1; - bfqq->last_wr_start_finish = bfq_smallest_from_now(); + bfqq->last_wr_start_finish = jiffies; + bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now(); bfqq->budget_timeout = bfq_smallest_from_now(); bfqq->split_time = bfq_smallest_from_now(); + /* * Set to the value for which bfqq will not be deemed as * soft rt when it becomes backlogged. @@ -3797,7 +4093,7 @@ static void bfq_update_io_thinktime(struct bfq_data *bfqd, struct bfq_ttime *ttime = &bic->ttime; u64 elapsed = ktime_get_ns() - bic->ttime.last_end_request; - elapsed = min(elapsed, 2UL * bfqd->bfq_slice_idle); + elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle); ttime->ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); @@ -3805,22 +4101,13 @@ static void bfq_update_io_thinktime(struct bfq_data *bfqd, ttime->ttime_samples); } - static void bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct request *rq) { - sector_t sdist = 0; - - if (bfqq->last_request_pos) { - if (bfqq->last_request_pos < blk_rq_pos(rq)) - sdist = blk_rq_pos(rq) - bfqq->last_request_pos; - else - sdist = bfqq->last_request_pos - blk_rq_pos(rq); - } - bfqq->seek_history <<= 1; - bfqq->seek_history |= (sdist > BFQQ_SEEK_THR); + bfqq->seek_history |= + get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR; } /* @@ -4013,6 +4300,8 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq); struct bfq_data *bfqd = bfqq->bfqd; + u64 now_ns; + u32 delta_us; bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left", blk_rq_sectors(rq)); @@ -4043,7 +4332,44 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) &bfqd->queue_weights_tree); } - RQ_BIC(rq)->ttime.last_end_request = ktime_get_ns(); + now_ns = ktime_get_ns(); + + RQ_BIC(rq)->ttime.last_end_request = now_ns; + + /* + * Using us instead of ns, to get a reasonable precision in + * computing rate in next check. + */ + delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC); + + bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", + delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, + (USEC_PER_SEC* + (u64)((bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us)) + >>BFQ_RATE_SHIFT, + (USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT); + + /* + * If the request took rather long to complete, and, according + * to the maximum request size recorded, this completion latency + * implies that the request was certainly served at a very low + * rate (less than 1M sectors/sec), then the whole observation + * interval that lasts up to this time instant cannot be a + * valid time interval for computing a new peak rate. Invoke + * bfq_update_rate_reset to have the following three steps + * taken: + * - close the observation interval at the last (previous) + * request dispatch or completion + * - compute rate, if possible, for that observation interval + * - reset to zero samples, which will trigger a proper + * re-initialization of the observation interval on next + * dispatch + */ + if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC && + (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us < + 1UL<<(BFQ_RATE_SHIFT - 10)) + bfq_update_rate_reset(bfqd, NULL); + bfqd->last_completion = now_ns; /* * If we are waiting to discover whether the request pattern @@ -4729,14 +5055,6 @@ static ssize_t bfq_weights_store(struct elevator_queue *e, return count; } -static unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) -{ - if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES) - return bfq_calc_max_budget(bfqd); - else - return bfq_default_max_budget; -} - static ssize_t bfq_max_budget_store(struct elevator_queue *e, const char *page, size_t count) { @@ -4745,7 +5063,7 @@ static ssize_t bfq_max_budget_store(struct elevator_queue *e, int ret = bfq_var_store(&__data, (page), count); if (__data == 0) - bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); + bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); else { if (__data > INT_MAX) __data = INT_MAX; @@ -4775,7 +5093,7 @@ static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, bfqd->bfq_timeout = msecs_to_jiffies(__data); if (bfqd->bfq_user_max_budget == 0) - bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); + bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); return ret; } @@ -4790,8 +5108,8 @@ static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, if (__data > 1) __data = 1; if (!bfqd->strict_guarantees && __data == 1 - && bfqd->bfq_slice_idle < msecs_to_jiffies(8)) - bfqd->bfq_slice_idle = msecs_to_jiffies(8); + && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC) + bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC; bfqd->strict_guarantees = __data; @@ -4891,7 +5209,7 @@ static struct blkcg_policy blkcg_policy_bfq = { static int __init bfq_init(void) { int ret; - char msg[50] = "BFQ I/O-scheduler: v8r3"; + char msg[50] = "BFQ I/O-scheduler: v8r4"; #ifdef CONFIG_BFQ_GROUP_IOSCHED ret = blkcg_policy_register(&blkcg_policy_bfq); @@ -4904,14 +5222,22 @@ static int __init bfq_init(void) goto err_pol_unreg; /* - * Times to load large popular applications for the typical systems - * installed on the reference devices (see the comments before the - * definitions of the two arrays). + * Times to load large popular applications for the typical + * systems installed on the reference devices (see the + * comments before the definitions of the next two + * arrays). Actually, we use slightly slower values, as the + * estimated peak rate tends to be smaller than the actual + * peak rate. The reason for this last fact is that estimates + * are computed over much shorter time intervals than the long + * intervals typically used for benchmarking. Why? First, to + * adapt more quickly to variations. Second, because an I/O + * scheduler cannot rely on a peak-rate-evaluation workload to + * be run for a long time. */ - T_slow[0] = msecs_to_jiffies(3500); - T_slow[1] = msecs_to_jiffies(1500); - T_fast[0] = msecs_to_jiffies(8000); - T_fast[1] = msecs_to_jiffies(3000); + T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */ + T_slow[1] = msecs_to_jiffies(1000); /* actually 1.5 sec */ + T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */ + T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */ /* * Thresholds that determine the switch between speed classes diff --git a/block/bfq-sched.c b/block/bfq-sched.c index f8960a4e9..45d63d3ff 100644 --- a/block/bfq-sched.c +++ b/block/bfq-sched.c @@ -327,10 +327,26 @@ static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node) static void bfq_update_active_node(struct rb_node *node) { struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); entity->min_start = entity->start; bfq_update_min(entity, node->rb_right); bfq_update_min(entity, node->rb_left); + + if (bfqq) { + bfq_log_bfqq(bfqq->bfqd, bfqq, + "update_active_node: new min_start %llu", + ((entity->min_start>>10)*1000)>>12); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + } else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "update_active_node: new min_start %llu", + ((entity->min_start>>10)*1000)>>12); +#endif + } } /** @@ -1127,7 +1143,23 @@ static void bfq_update_vtime(struct bfq_service_tree *st) entry = rb_entry(node, struct bfq_entity, rb_node); if (bfq_gt(entry->min_start, st->vtime)) { + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entry); st->vtime = entry->min_start; + + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, + "update_vtime: new vtime %llu %p", + ((st->vtime>>10)*1000)>>12, st); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + else { + struct bfq_group *bfqg = + container_of(entry, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "update_vtime: new vtime %llu %p", + ((st->vtime>>10)*1000)>>12, st); + } +#endif bfq_forget_idle(st); } } diff --git a/block/bfq.h b/block/bfq.h index 00142fcce..ea1e7d852 100644 --- a/block/bfq.h +++ b/block/bfq.h @@ -1,5 +1,5 @@ /* - * BFQ-v8r3 for 4.8.0: data structures and common functions prototypes. + * BFQ-v8r4 for 4.8.0: data structures and common functions prototypes. * * Based on ideas and code from CFQ: * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> @@ -298,6 +298,10 @@ struct bfq_queue { * last transition from idle to backlogged. */ unsigned long service_from_backlogged; + /* + * Value of wr start time when switching to soft rt + */ + unsigned long wr_start_at_switch_to_srt; unsigned long split_time; /* time of last split */ }; @@ -352,6 +356,13 @@ struct bfq_io_cq { * with another cooperating queue. */ bool was_in_burst_list; + + /* + * Similar to previous fields: save wr information. + */ + unsigned long saved_wr_coeff; + unsigned long saved_last_wr_start_finish; + unsigned long saved_wr_start_at_switch_to_srt; }; enum bfq_device_speed { @@ -431,14 +442,32 @@ struct bfq_data { /* on-disk position of the last served request */ sector_t last_position; + /* time of last request completion (ns) */ + u64 last_completion; + + /* time of first rq dispatch in current observation interval (ns) */ + u64 first_dispatch; + /* time of last rq dispatch in current observation interval (ns) */ + u64 last_dispatch; + /* beginning of the last budget */ ktime_t last_budget_start; /* beginning of the last idle slice */ ktime_t last_idling_start; - /* number of samples used to calculate @peak_rate */ + + /* number of samples in current observation interval */ int peak_rate_samples; - /* peak transfer rate observed for a budget */ - u64 peak_rate; + /* num of samples of seq dispatches in current observation interval */ + u32 sequential_samples; + /* total num of sectors transferred in current observation interval */ + u64 tot_sectors_dispatched; + /* max rq size seen during current observation interval (sectors) */ + u32 last_rq_max_size; + /* time elapsed from first dispatch in current observ. interval (us) */ + u64 delta_from_first; + /* current estimate of device peak rate */ + u32 peak_rate; + /* maximum budget allotted to a bfq_queue before rescheduling */ int bfq_max_budget; @@ -457,7 +486,7 @@ struct bfq_data { /* maximum allowed backward seek */ unsigned int bfq_back_max; /* maximum idling time */ - u64 bfq_slice_idle; + u32 bfq_slice_idle; /* last time CLASS_IDLE was served */ u64 bfq_class_idle_last_service; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index fdcd5999d..f336dcbed 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3042,7 +3042,6 @@ static struct request *cfq_check_fifo(struct cfq_queue *cfqq) if (ktime_get_ns() < rq->fifo_time) rq = NULL; - cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq); return rq; } @@ -3420,6 +3419,9 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) { unsigned int max_dispatch; + if (cfq_cfqq_must_dispatch(cfqq)) + return true; + /* * Drain async requests before we start sync IO */ @@ -3511,15 +3513,20 @@ static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list)); + rq = cfq_check_fifo(cfqq); + if (rq) + cfq_mark_cfqq_must_dispatch(cfqq); + if (!cfq_may_dispatch(cfqd, cfqq)) return false; /* * follow expired path, else get first next available */ - rq = cfq_check_fifo(cfqq); if (!rq) rq = cfqq->next_rq; + else + cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq); /* * insert request into driver dispatch list @@ -4002,7 +4009,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, * if the new request is sync, but the currently running queue is * not, let the sync request have priority. */ - if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq)) + if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) return true; /* diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c index 08b3ac689..f83de99d7 100644 --- a/crypto/async_tx/async_pq.c +++ b/crypto/async_tx/async_pq.c @@ -368,8 +368,6 @@ async_syndrome_val(struct page **blocks, unsigned int offset, int disks, dma_set_unmap(tx, unmap); async_tx_submit(chan, tx, submit); - - return tx; } else { struct page *p_src = P(blocks, disks); struct page *q_src = Q(blocks, disks); @@ -424,9 +422,11 @@ async_syndrome_val(struct page **blocks, unsigned int offset, int disks, submit->cb_param = cb_param_orig; submit->flags = flags_orig; async_tx_sync_epilog(submit); - - return NULL; + tx = NULL; } + dmaengine_unmap_put(unmap); + + return tx; } EXPORT_SYMBOL_GPL(async_syndrome_val); diff --git a/crypto/ghash-generic.c b/crypto/ghash-generic.c index bac70995e..12ad3e3a8 100644 --- a/crypto/ghash-generic.c +++ b/crypto/ghash-generic.c @@ -14,24 +14,13 @@ #include <crypto/algapi.h> #include <crypto/gf128mul.h> +#include <crypto/ghash.h> #include <crypto/internal/hash.h> #include <linux/crypto.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> -#define GHASH_BLOCK_SIZE 16 -#define GHASH_DIGEST_SIZE 16 - -struct ghash_ctx { - struct gf128mul_4k *gf128; -}; - -struct ghash_desc_ctx { - u8 buffer[GHASH_BLOCK_SIZE]; - u32 bytes; -}; - static int ghash_init(struct shash_desc *desc) { struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index e1d5ea6d5..2accf7845 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -2689,6 +2689,9 @@ static void acpi_nfit_notify(struct acpi_device *adev, u32 event) dev_dbg(dev, "%s: event: %d\n", __func__, event); + if (event != NFIT_NOTIFY_UPDATE) + return; + device_lock(dev); if (!dev->driver) { /* dev->driver may be null if we're being removed */ diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h index e894ded24..51d23f130 100644 --- a/drivers/acpi/nfit/nfit.h +++ b/drivers/acpi/nfit/nfit.h @@ -78,6 +78,10 @@ enum { NFIT_ARS_TIMEOUT = 90, }; +enum nfit_root_notifiers { + NFIT_NOTIFY_UPDATE = 0x80, +}; + struct nfit_spa { struct list_head list; struct nd_region *nd_region; diff --git a/drivers/base/dma-mapping.c b/drivers/base/dma-mapping.c index d799662f1..261420ddf 100644 --- a/drivers/base/dma-mapping.c +++ b/drivers/base/dma-mapping.c @@ -334,7 +334,7 @@ void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags) return; } - unmap_kernel_range((unsigned long)cpu_addr, size); + unmap_kernel_range((unsigned long)cpu_addr, PAGE_ALIGN(size)); vunmap(cpu_addr); } #endif diff --git a/drivers/clk/mvebu/cp110-system-controller.c b/drivers/clk/mvebu/cp110-system-controller.c index 7fa42d6b2..f2303da7f 100644 --- a/drivers/clk/mvebu/cp110-system-controller.c +++ b/drivers/clk/mvebu/cp110-system-controller.c @@ -81,13 +81,6 @@ enum { #define CP110_GATE_EIP150 25 #define CP110_GATE_EIP197 26 -static struct clk *cp110_clks[CP110_CLK_NUM]; - -static struct clk_onecell_data cp110_clk_data = { - .clks = cp110_clks, - .clk_num = CP110_CLK_NUM, -}; - struct cp110_gate_clk { struct clk_hw hw; struct regmap *regmap; @@ -142,6 +135,8 @@ static struct clk *cp110_register_gate(const char *name, if (!gate) return ERR_PTR(-ENOMEM); + memset(&init, 0, sizeof(init)); + init.name = name; init.ops = &cp110_gate_ops; init.parent_names = &parent_name; @@ -194,7 +189,8 @@ static int cp110_syscon_clk_probe(struct platform_device *pdev) struct regmap *regmap; struct device_node *np = pdev->dev.of_node; const char *ppv2_name, *apll_name, *core_name, *eip_name, *nand_name; - struct clk *clk; + struct clk_onecell_data *cp110_clk_data; + struct clk *clk, **cp110_clks; u32 nand_clk_ctrl; int i, ret; @@ -207,6 +203,20 @@ static int cp110_syscon_clk_probe(struct platform_device *pdev) if (ret) return ret; + cp110_clks = devm_kcalloc(&pdev->dev, sizeof(struct clk *), + CP110_CLK_NUM, GFP_KERNEL); + if (!cp110_clks) + return -ENOMEM; + + cp110_clk_data = devm_kzalloc(&pdev->dev, + sizeof(*cp110_clk_data), + GFP_KERNEL); + if (!cp110_clk_data) + return -ENOMEM; + + cp110_clk_data->clks = cp110_clks; + cp110_clk_data->clk_num = CP110_CLK_NUM; + /* Register the APLL which is the root of the clk tree */ of_property_read_string_index(np, "core-clock-output-names", CP110_CORE_APLL, &apll_name); @@ -334,10 +344,12 @@ static int cp110_syscon_clk_probe(struct platform_device *pdev) cp110_clks[CP110_MAX_CORE_CLOCKS + i] = clk; } - ret = of_clk_add_provider(np, cp110_of_clk_get, &cp110_clk_data); + ret = of_clk_add_provider(np, cp110_of_clk_get, cp110_clk_data); if (ret) goto fail_clk_add; + platform_set_drvdata(pdev, cp110_clks); + return 0; fail_clk_add: @@ -364,6 +376,7 @@ fail0: static int cp110_syscon_clk_remove(struct platform_device *pdev) { + struct clk **cp110_clks = platform_get_drvdata(pdev); int i; of_clk_del_provider(pdev->dev.of_node); diff --git a/drivers/crypto/vmx/ghash.c b/drivers/crypto/vmx/ghash.c index 6c999cb01..27a94a119 100644 --- a/drivers/crypto/vmx/ghash.c +++ b/drivers/crypto/vmx/ghash.c @@ -26,16 +26,13 @@ #include <linux/hardirq.h> #include <asm/switch_to.h> #include <crypto/aes.h> +#include <crypto/ghash.h> #include <crypto/scatterwalk.h> #include <crypto/internal/hash.h> #include <crypto/b128ops.h> #define IN_INTERRUPT in_interrupt() -#define GHASH_BLOCK_SIZE (16) -#define GHASH_DIGEST_SIZE (16) -#define GHASH_KEY_LEN (16) - void gcm_init_p8(u128 htable[16], const u64 Xi[2]); void gcm_gmult_p8(u64 Xi[2], const u128 htable[16]); void gcm_ghash_p8(u64 Xi[2], const u128 htable[16], @@ -55,16 +52,11 @@ struct p8_ghash_desc_ctx { static int p8_ghash_init_tfm(struct crypto_tfm *tfm) { - const char *alg; + const char *alg = "ghash-generic"; struct crypto_shash *fallback; struct crypto_shash *shash_tfm = __crypto_shash_cast(tfm); struct p8_ghash_ctx *ctx = crypto_tfm_ctx(tfm); - if (!(alg = crypto_tfm_alg_name(tfm))) { - printk(KERN_ERR "Failed to get algorithm name.\n"); - return -ENOENT; - } - fallback = crypto_alloc_shash(alg, 0, CRYPTO_ALG_NEED_FALLBACK); if (IS_ERR(fallback)) { printk(KERN_ERR @@ -78,10 +70,18 @@ static int p8_ghash_init_tfm(struct crypto_tfm *tfm) crypto_shash_set_flags(fallback, crypto_shash_get_flags((struct crypto_shash *) tfm)); - ctx->fallback = fallback; - shash_tfm->descsize = sizeof(struct p8_ghash_desc_ctx) - + crypto_shash_descsize(fallback); + /* Check if the descsize defined in the algorithm is still enough. */ + if (shash_tfm->descsize < sizeof(struct p8_ghash_desc_ctx) + + crypto_shash_descsize(fallback)) { + printk(KERN_ERR + "Desc size of the fallback implementation (%s) does not match the expected value: %lu vs %u\n", + alg, + shash_tfm->descsize - sizeof(struct p8_ghash_desc_ctx), + crypto_shash_descsize(fallback)); + return -EINVAL; + } + ctx->fallback = fallback; return 0; } @@ -113,7 +113,7 @@ static int p8_ghash_setkey(struct crypto_shash *tfm, const u8 *key, { struct p8_ghash_ctx *ctx = crypto_tfm_ctx(crypto_shash_tfm(tfm)); - if (keylen != GHASH_KEY_LEN) + if (keylen != GHASH_BLOCK_SIZE) return -EINVAL; preempt_disable(); @@ -211,7 +211,8 @@ struct shash_alg p8_ghash_alg = { .update = p8_ghash_update, .final = p8_ghash_final, .setkey = p8_ghash_setkey, - .descsize = sizeof(struct p8_ghash_desc_ctx), + .descsize = sizeof(struct p8_ghash_desc_ctx) + + sizeof(struct ghash_desc_ctx), .base = { .cra_name = "ghash", .cra_driver_name = "p8_ghash", diff --git a/drivers/gpu/drm/virtio/virtgpu_drm_bus.c b/drivers/gpu/drm/virtio/virtgpu_drm_bus.c index 7f0e93f87..88a39165e 100644 --- a/drivers/gpu/drm/virtio/virtgpu_drm_bus.c +++ b/drivers/gpu/drm/virtio/virtgpu_drm_bus.c @@ -27,6 +27,16 @@ #include "virtgpu_drv.h" +int drm_virtio_set_busid(struct drm_device *dev, struct drm_master *master) +{ + struct pci_dev *pdev = dev->pdev; + + if (pdev) { + return drm_pci_set_busid(dev, master); + } + return 0; +} + static void virtio_pci_kick_out_firmware_fb(struct pci_dev *pci_dev) { struct apertures_struct *ap; diff --git a/drivers/gpu/drm/virtio/virtgpu_drv.c b/drivers/gpu/drm/virtio/virtgpu_drv.c index c13f70cfc..5820b7020 100644 --- a/drivers/gpu/drm/virtio/virtgpu_drv.c +++ b/drivers/gpu/drm/virtio/virtgpu_drv.c @@ -117,6 +117,7 @@ static const struct file_operations virtio_gpu_driver_fops = { static struct drm_driver driver = { .driver_features = DRIVER_MODESET | DRIVER_GEM | DRIVER_PRIME | DRIVER_RENDER | DRIVER_ATOMIC, + .set_busid = drm_virtio_set_busid, .load = virtio_gpu_driver_load, .unload = virtio_gpu_driver_unload, .open = virtio_gpu_driver_open, diff --git a/drivers/gpu/drm/virtio/virtgpu_drv.h b/drivers/gpu/drm/virtio/virtgpu_drv.h index b18ef3111..acf556a35 100644 --- a/drivers/gpu/drm/virtio/virtgpu_drv.h +++ b/drivers/gpu/drm/virtio/virtgpu_drv.h @@ -49,6 +49,7 @@ #define DRIVER_PATCHLEVEL 1 /* virtgpu_drm_bus.c */ +int drm_virtio_set_busid(struct drm_device *dev, struct drm_master *master); int drm_virtio_init(struct drm_driver *driver, struct virtio_device *vdev); struct virtio_gpu_object { diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 5da190e60..bcf76c337 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -932,8 +932,10 @@ void hfi1_send_rc_ack(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp, return; queue_ack: - this_cpu_inc(*ibp->rvp.rc_qacks); spin_lock_irqsave(&qp->s_lock, flags); + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) + goto unlock; + this_cpu_inc(*ibp->rvp.rc_qacks); qp->s_flags |= RVT_S_ACK_PENDING | RVT_S_RESP_PENDING; qp->s_nak_state = qp->r_nak_state; qp->s_ack_psn = qp->r_ack_psn; @@ -942,6 +944,7 @@ queue_ack: /* Schedule the send tasklet. */ hfi1_schedule_send(qp); +unlock: spin_unlock_irqrestore(&qp->s_lock, flags); } diff --git a/drivers/misc/mei/amthif.c b/drivers/misc/mei/amthif.c index a039a5df6..fd9271bc1 100644 --- a/drivers/misc/mei/amthif.c +++ b/drivers/misc/mei/amthif.c @@ -67,8 +67,12 @@ int mei_amthif_host_init(struct mei_device *dev, struct mei_me_client *me_cl) struct mei_cl *cl = &dev->iamthif_cl; int ret; - if (mei_cl_is_connected(cl)) - return 0; + mutex_lock(&dev->device_lock); + + if (mei_cl_is_connected(cl)) { + ret = 0; + goto out; + } dev->iamthif_state = MEI_IAMTHIF_IDLE; @@ -77,11 +81,13 @@ int mei_amthif_host_init(struct mei_device *dev, struct mei_me_client *me_cl) ret = mei_cl_link(cl); if (ret < 0) { dev_err(dev->dev, "amthif: failed cl_link %d\n", ret); - return ret; + goto out; } ret = mei_cl_connect(cl, me_cl, NULL); +out: + mutex_unlock(&dev->device_lock); return ret; } diff --git a/drivers/misc/mei/bus.c b/drivers/misc/mei/bus.c index 1f33fea92..e094df3cf 100644 --- a/drivers/misc/mei/bus.c +++ b/drivers/misc/mei/bus.c @@ -983,12 +983,10 @@ void mei_cl_bus_rescan_work(struct work_struct *work) container_of(work, struct mei_device, bus_rescan_work); struct mei_me_client *me_cl; - mutex_lock(&bus->device_lock); me_cl = mei_me_cl_by_uuid(bus, &mei_amthif_guid); if (me_cl) mei_amthif_host_init(bus, me_cl); mei_me_cl_put(me_cl); - mutex_unlock(&bus->device_lock); mei_cl_bus_rescan(bus); } diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index d0b3a1bb8..dad15b6c6 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -11360,6 +11360,12 @@ static pci_ers_result_t i40e_pci_error_detected(struct pci_dev *pdev, dev_info(&pdev->dev, "%s: error %d\n", __func__, error); + if (!pf) { + dev_info(&pdev->dev, + "Cannot recover - error happened during device probe\n"); + return PCI_ERS_RESULT_DISCONNECT; + } + /* shutdown all operations */ if (!test_bit(__I40E_SUSPENDED, &pf->state)) { rtnl_lock(); diff --git a/drivers/net/wireless/ath/carl9170/debug.c b/drivers/net/wireless/ath/carl9170/debug.c index 6808db433..ec3a64e5d 100644 --- a/drivers/net/wireless/ath/carl9170/debug.c +++ b/drivers/net/wireless/ath/carl9170/debug.c @@ -75,7 +75,8 @@ static ssize_t carl9170_debugfs_read(struct file *file, char __user *userbuf, if (!ar) return -ENODEV; - dfops = container_of(file->f_op, struct carl9170_debugfs_fops, fops); + dfops = container_of(debugfs_real_fops(file), + struct carl9170_debugfs_fops, fops); if (!dfops->read) return -ENOSYS; @@ -127,7 +128,8 @@ static ssize_t carl9170_debugfs_write(struct file *file, if (!ar) return -ENODEV; - dfops = container_of(file->f_op, struct carl9170_debugfs_fops, fops); + dfops = container_of(debugfs_real_fops(file), + struct carl9170_debugfs_fops, fops); if (!dfops->write) return -ENOSYS; diff --git a/drivers/net/wireless/broadcom/b43/debugfs.c b/drivers/net/wireless/broadcom/b43/debugfs.c index b4bcd94af..77046384d 100644 --- a/drivers/net/wireless/broadcom/b43/debugfs.c +++ b/drivers/net/wireless/broadcom/b43/debugfs.c @@ -524,7 +524,8 @@ static ssize_t b43_debugfs_read(struct file *file, char __user *userbuf, goto out_unlock; } - dfops = container_of(file->f_op, struct b43_debugfs_fops, fops); + dfops = container_of(debugfs_real_fops(file), + struct b43_debugfs_fops, fops); if (!dfops->read) { err = -ENOSYS; goto out_unlock; @@ -585,7 +586,8 @@ static ssize_t b43_debugfs_write(struct file *file, goto out_unlock; } - dfops = container_of(file->f_op, struct b43_debugfs_fops, fops); + dfops = container_of(debugfs_real_fops(file), + struct b43_debugfs_fops, fops); if (!dfops->write) { err = -ENOSYS; goto out_unlock; diff --git a/drivers/net/wireless/broadcom/b43legacy/debugfs.c b/drivers/net/wireless/broadcom/b43legacy/debugfs.c index 090910ea2..82ef56ed7 100644 --- a/drivers/net/wireless/broadcom/b43legacy/debugfs.c +++ b/drivers/net/wireless/broadcom/b43legacy/debugfs.c @@ -221,7 +221,8 @@ static ssize_t b43legacy_debugfs_read(struct file *file, char __user *userbuf, goto out_unlock; } - dfops = container_of(file->f_op, struct b43legacy_debugfs_fops, fops); + dfops = container_of(debugfs_real_fops(file), + struct b43legacy_debugfs_fops, fops); if (!dfops->read) { err = -ENOSYS; goto out_unlock; @@ -287,7 +288,8 @@ static ssize_t b43legacy_debugfs_write(struct file *file, goto out_unlock; } - dfops = container_of(file->f_op, struct b43legacy_debugfs_fops, fops); + dfops = container_of(debugfs_real_fops(file), + struct b43legacy_debugfs_fops, fops); if (!dfops->write) { err = -ENOSYS; goto out_unlock; diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index b8aec5e5e..abaf003a5 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -2533,7 +2533,7 @@ static void brcmf_fill_bss_param(struct brcmf_if *ifp, struct station_info *si) WL_BSS_INFO_MAX); if (err) { brcmf_err("Failed to get bss info (%d)\n", err); - return; + goto out_kfree; } si->filled |= BIT(NL80211_STA_INFO_BSS_PARAM); si->bss_param.beacon_interval = le16_to_cpu(buf->bss_le.beacon_period); @@ -2545,6 +2545,9 @@ static void brcmf_fill_bss_param(struct brcmf_if *ifp, struct station_info *si) si->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_PREAMBLE; if (capability & WLAN_CAPABILITY_SHORT_SLOT_TIME) si->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_SLOT_TIME; + +out_kfree: + kfree(buf); } static s32 @@ -3884,11 +3887,11 @@ brcmf_cfg80211_del_pmksa(struct wiphy *wiphy, struct net_device *ndev, if (!check_vif_up(ifp->vif)) return -EIO; - brcmf_dbg(CONN, "del_pmksa - PMK bssid = %pM\n", &pmksa->bssid); + brcmf_dbg(CONN, "del_pmksa - PMK bssid = %pM\n", pmksa->bssid); npmk = le32_to_cpu(cfg->pmk_list.npmk); for (i = 0; i < npmk; i++) - if (!memcmp(&pmksa->bssid, &pmk[i].bssid, ETH_ALEN)) + if (!memcmp(pmksa->bssid, pmk[i].bssid, ETH_ALEN)) break; if ((npmk > 0) && (i < npmk)) { diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/flowring.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/flowring.c index 7e269f9aa..63664442e 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/flowring.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/flowring.c @@ -234,13 +234,20 @@ static void brcmf_flowring_block(struct brcmf_flowring *flow, u16 flowid, void brcmf_flowring_delete(struct brcmf_flowring *flow, u16 flowid) { + struct brcmf_bus *bus_if = dev_get_drvdata(flow->dev); struct brcmf_flowring_ring *ring; + struct brcmf_if *ifp; u16 hash_idx; + u8 ifidx; struct sk_buff *skb; ring = flow->rings[flowid]; if (!ring) return; + + ifidx = brcmf_flowring_ifidx_get(flow, flowid); + ifp = brcmf_get_ifp(bus_if->drvr, ifidx); + brcmf_flowring_block(flow, flowid, false); hash_idx = ring->hash_id; flow->hash[hash_idx].ifidx = BRCMF_FLOWRING_INVALID_IFIDX; @@ -249,7 +256,7 @@ void brcmf_flowring_delete(struct brcmf_flowring *flow, u16 flowid) skb = skb_dequeue(&ring->skblist); while (skb) { - brcmu_pkt_buf_free_skb(skb); + brcmf_txfinalize(ifp, skb, false); skb = skb_dequeue(&ring->skblist); } diff --git a/drivers/scsi/arcmsr/arcmsr_hba.c b/drivers/scsi/arcmsr/arcmsr_hba.c index 764049896..3d53d636b 100644 --- a/drivers/scsi/arcmsr/arcmsr_hba.c +++ b/drivers/scsi/arcmsr/arcmsr_hba.c @@ -2388,15 +2388,23 @@ static int arcmsr_iop_message_xfer(struct AdapterControlBlock *acb, } case ARCMSR_MESSAGE_WRITE_WQBUFFER: { unsigned char *ver_addr; - int32_t user_len, cnt2end; + uint32_t user_len; + int32_t cnt2end; uint8_t *pQbuffer, *ptmpuserbuffer; + + user_len = pcmdmessagefld->cmdmessage.Length; + if (user_len > ARCMSR_API_DATA_BUFLEN) { + retvalue = ARCMSR_MESSAGE_FAIL; + goto message_out; + } + ver_addr = kmalloc(ARCMSR_API_DATA_BUFLEN, GFP_ATOMIC); if (!ver_addr) { retvalue = ARCMSR_MESSAGE_FAIL; goto message_out; } ptmpuserbuffer = ver_addr; - user_len = pcmdmessagefld->cmdmessage.Length; + memcpy(ptmpuserbuffer, pcmdmessagefld->messagedatabuffer, user_len); spin_lock_irqsave(&acb->wqbuffer_lock, flags); diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c index ab67ec4b6..79c9860a1 100644 --- a/drivers/scsi/ibmvscsi/ibmvfc.c +++ b/drivers/scsi/ibmvscsi/ibmvfc.c @@ -717,7 +717,6 @@ static int ibmvfc_reset_crq(struct ibmvfc_host *vhost) spin_lock_irqsave(vhost->host->host_lock, flags); vhost->state = IBMVFC_NO_CRQ; vhost->logged_in = 0; - ibmvfc_set_host_action(vhost, IBMVFC_HOST_ACTION_NONE); /* Clean out the queue */ memset(crq->msgs, 0, PAGE_SIZE); diff --git a/drivers/tty/serial/8250/8250_dw.c b/drivers/tty/serial/8250/8250_dw.c index e19969614..b022f5a01 100644 --- a/drivers/tty/serial/8250/8250_dw.c +++ b/drivers/tty/serial/8250/8250_dw.c @@ -462,7 +462,7 @@ static int dw8250_probe(struct platform_device *pdev) } data->pclk = devm_clk_get(&pdev->dev, "apb_pclk"); - if (IS_ERR(data->clk) && PTR_ERR(data->clk) == -EPROBE_DEFER) { + if (IS_ERR(data->pclk) && PTR_ERR(data->pclk) == -EPROBE_DEFER) { err = -EPROBE_DEFER; goto err_clk; } diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index bdfa659b9..858a54633 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -1414,12 +1414,8 @@ static void __do_stop_tx_rs485(struct uart_8250_port *p) if (!(p->port.rs485.flags & SER_RS485_RX_DURING_TX)) { serial8250_clear_fifos(p); - serial8250_rpm_get(p); - p->ier |= UART_IER_RLSI | UART_IER_RDI; serial_port_out(&p->port, UART_IER, p->ier); - - serial8250_rpm_put(p); } } @@ -1429,6 +1425,7 @@ static void serial8250_em485_handle_stop_tx(unsigned long arg) struct uart_8250_em485 *em485 = p->em485; unsigned long flags; + serial8250_rpm_get(p); spin_lock_irqsave(&p->port.lock, flags); if (em485 && em485->active_timer == &em485->stop_tx_timer) { @@ -1436,6 +1433,7 @@ static void serial8250_em485_handle_stop_tx(unsigned long arg) em485->active_timer = NULL; } spin_unlock_irqrestore(&p->port.lock, flags); + serial8250_rpm_put(p); } static void __stop_tx_rs485(struct uart_8250_port *p) @@ -1475,7 +1473,7 @@ static inline void __stop_tx(struct uart_8250_port *p) unsigned char lsr = serial_in(p, UART_LSR); /* * To provide required timeing and allow FIFO transfer, - * __stop_tx_rs485 must be called only when both FIFO and + * __stop_tx_rs485() must be called only when both FIFO and * shift register are empty. It is for device driver to enable * interrupt on TEMT. */ @@ -1484,9 +1482,10 @@ static inline void __stop_tx(struct uart_8250_port *p) del_timer(&em485->start_tx_timer); em485->active_timer = NULL; + + __stop_tx_rs485(p); } __do_stop_tx(p); - __stop_tx_rs485(p); } static void serial8250_stop_tx(struct uart_port *port) diff --git a/drivers/tty/serial/atmel_serial.c b/drivers/tty/serial/atmel_serial.c index 2eaa18dde..8bbde52db 100644 --- a/drivers/tty/serial/atmel_serial.c +++ b/drivers/tty/serial/atmel_serial.c @@ -1929,6 +1929,9 @@ static void atmel_shutdown(struct uart_port *port) { struct atmel_uart_port *atmel_port = to_atmel_uart_port(port); + /* Disable modem control lines interrupts */ + atmel_disable_ms(port); + /* Disable interrupts at device level */ atmel_uart_writel(port, ATMEL_US_IDR, -1); @@ -1979,8 +1982,6 @@ static void atmel_shutdown(struct uart_port *port) */ free_irq(port->irq, port); - atmel_port->ms_irq_enabled = false; - atmel_flush_buffer(port); } diff --git a/drivers/tty/serial/imx.c b/drivers/tty/serial/imx.c index 0df2b1c09..615c0279a 100644 --- a/drivers/tty/serial/imx.c +++ b/drivers/tty/serial/imx.c @@ -740,12 +740,13 @@ static unsigned int imx_get_hwmctrl(struct imx_port *sport) { unsigned int tmp = TIOCM_DSR; unsigned usr1 = readl(sport->port.membase + USR1); + unsigned usr2 = readl(sport->port.membase + USR2); if (usr1 & USR1_RTSS) tmp |= TIOCM_CTS; /* in DCE mode DCDIN is always 0 */ - if (!(usr1 & USR2_DCDIN)) + if (!(usr2 & USR2_DCDIN)) tmp |= TIOCM_CAR; if (sport->dte_mode) @@ -202,6 +202,21 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de return -EPERM; } + /* + * If utimes(2) and friends are called with times == NULL (or both + * times are UTIME_NOW), then we need to check for write permission + */ + if (ia_valid & ATTR_TOUCH) { + if (IS_IMMUTABLE(inode)) + return -EPERM; + + if (!inode_owner_or_capable(inode)) { + error = inode_permission(inode, MAY_WRITE); + if (error) + return error; + } + } + if ((ia_valid & ATTR_MODE)) { umode_t amode = attr->ia_mode; /* Flag setting protected by i_mutex */ diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 431fd7ee3..e44271dfc 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -431,8 +431,8 @@ int autofs4_wait(struct autofs_sb_info *sbi, memcpy(&wq->name, &qstr, sizeof(struct qstr)); wq->dev = autofs4_get_dev(sbi); wq->ino = autofs4_get_ino(sbi); - wq->uid = current_uid(); - wq->gid = current_gid(); + wq->uid = current_real_cred()->uid; + wq->gid = current_real_cred()->gid; wq->pid = pid; wq->tgid = tgid; wq->status = -EINTR; /* Status return if interrupted */ diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 029db6e11..60a850ee8 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -698,7 +698,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_map_bio(root, comp_bio, mirror_num, 0); if (ret) { - bio->bi_error = ret; + comp_bio->bi_error = ret; bio_endio(comp_bio); } @@ -728,7 +728,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_map_bio(root, comp_bio, mirror_num, 0); if (ret) { - bio->bi_error = ret; + comp_bio->bi_error = ret; bio_endio(comp_bio); } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 33fe03551..791e47ce9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -251,7 +251,8 @@ struct btrfs_super_block { #define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL #define BTRFS_FEATURE_COMPAT_RO_SUPP \ - (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE) + (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \ + BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID) #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 54bc8c7c6..3dede6d53 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2566,6 +2566,7 @@ int open_ctree(struct super_block *sb, int num_backups_tried = 0; int backup_index = 0; int max_active; + int clear_free_space_tree = 0; tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL); chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL); @@ -3129,6 +3130,26 @@ retry_root_backup: if (sb->s_flags & MS_RDONLY) return 0; + if (btrfs_test_opt(fs_info, CLEAR_CACHE) && + btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { + clear_free_space_tree = 1; + } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && + !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) { + btrfs_warn(fs_info, "free space tree is invalid"); + clear_free_space_tree = 1; + } + + if (clear_free_space_tree) { + btrfs_info(fs_info, "clearing free space tree"); + ret = btrfs_clear_free_space_tree(fs_info); + if (ret) { + btrfs_warn(fs_info, + "failed to clear free space tree: %d", ret); + close_ctree(tree_root); + return ret; + } + } + if (btrfs_test_opt(tree_root->fs_info, FREE_SPACE_TREE) && !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { btrfs_info(fs_info, "creating free space tree"); @@ -3166,18 +3187,6 @@ retry_root_backup: btrfs_qgroup_rescan_resume(fs_info); - if (btrfs_test_opt(tree_root->fs_info, CLEAR_CACHE) && - btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { - btrfs_info(fs_info, "clearing free space tree"); - ret = btrfs_clear_free_space_tree(fs_info); - if (ret) { - btrfs_warn(fs_info, - "failed to clear free space tree: %d", ret); - close_ctree(tree_root); - return ret; - } - } - if (!fs_info->uuid_root) { btrfs_info(fs_info, "creating UUID tree"); ret = btrfs_create_uuid_tree(fs_info); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 44fe66b53..c3ec30dea 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5524,17 +5524,45 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, } } -/* - * The extent buffer bitmap operations are done with byte granularity because - * bitmap items are not guaranteed to be aligned to a word and therefore a - * single word in a bitmap may straddle two pages in the extent buffer. - */ -#define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE) -#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1) -#define BITMAP_FIRST_BYTE_MASK(start) \ - ((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK) -#define BITMAP_LAST_BYTE_MASK(nbits) \ - (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))) +void le_bitmap_set(u8 *map, unsigned int start, int len) +{ + u8 *p = map + BIT_BYTE(start); + const unsigned int size = start + len; + int bits_to_set = BITS_PER_BYTE - (start % BITS_PER_BYTE); + u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(start); + + while (len - bits_to_set >= 0) { + *p |= mask_to_set; + len -= bits_to_set; + bits_to_set = BITS_PER_BYTE; + mask_to_set = ~(u8)0; + p++; + } + if (len) { + mask_to_set &= BITMAP_LAST_BYTE_MASK(size); + *p |= mask_to_set; + } +} + +void le_bitmap_clear(u8 *map, unsigned int start, int len) +{ + u8 *p = map + BIT_BYTE(start); + const unsigned int size = start + len; + int bits_to_clear = BITS_PER_BYTE - (start % BITS_PER_BYTE); + u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(start); + + while (len - bits_to_clear >= 0) { + *p &= ~mask_to_clear; + len -= bits_to_clear; + bits_to_clear = BITS_PER_BYTE; + mask_to_clear = ~(u8)0; + p++; + } + if (len) { + mask_to_clear &= BITMAP_LAST_BYTE_MASK(size); + *p &= ~mask_to_clear; + } +} /* * eb_bitmap_offset() - calculate the page and offset of the byte containing the @@ -5578,7 +5606,7 @@ static inline void eb_bitmap_offset(struct extent_buffer *eb, int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start, unsigned long nr) { - char *kaddr; + u8 *kaddr; struct page *page; unsigned long i; size_t offset; @@ -5600,13 +5628,13 @@ int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start, void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len) { - char *kaddr; + u8 *kaddr; struct page *page; unsigned long i; size_t offset; const unsigned int size = pos + len; int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE); - unsigned int mask_to_set = BITMAP_FIRST_BYTE_MASK(pos); + u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos); eb_bitmap_offset(eb, start, pos, &i, &offset); page = eb->pages[i]; @@ -5617,7 +5645,7 @@ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, kaddr[offset] |= mask_to_set; len -= bits_to_set; bits_to_set = BITS_PER_BYTE; - mask_to_set = ~0U; + mask_to_set = ~(u8)0; if (++offset >= PAGE_SIZE && len > 0) { offset = 0; page = eb->pages[++i]; @@ -5642,13 +5670,13 @@ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len) { - char *kaddr; + u8 *kaddr; struct page *page; unsigned long i; size_t offset; const unsigned int size = pos + len; int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE); - unsigned int mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos); + u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos); eb_bitmap_offset(eb, start, pos, &i, &offset); page = eb->pages[i]; @@ -5659,7 +5687,7 @@ void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start, kaddr[offset] &= ~mask_to_clear; len -= bits_to_clear; bits_to_clear = BITS_PER_BYTE; - mask_to_clear = ~0U; + mask_to_clear = ~(u8)0; if (++offset >= PAGE_SIZE && len > 0) { offset = 0; page = eb->pages[++i]; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 28cd88fcc..1cf4e4226 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -59,6 +59,28 @@ */ #define EXTENT_PAGE_PRIVATE 1 +/* + * The extent buffer bitmap operations are done with byte granularity instead of + * word granularity for two reasons: + * 1. The bitmaps must be little-endian on disk. + * 2. Bitmap items are not guaranteed to be aligned to a word and therefore a + * single word in a bitmap may straddle two pages in the extent buffer. + */ +#define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE) +#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1) +#define BITMAP_FIRST_BYTE_MASK(start) \ + ((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK) +#define BITMAP_LAST_BYTE_MASK(nbits) \ + (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))) + +static inline int le_test_bit(int nr, const u8 *addr) +{ + return 1U & (addr[BIT_BYTE(nr)] >> (nr & (BITS_PER_BYTE-1))); +} + +extern void le_bitmap_set(u8 *map, unsigned int start, int len); +extern void le_bitmap_clear(u8 *map, unsigned int start, int len); + struct extent_state; struct btrfs_root; struct btrfs_io_bio; diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 87e7e3d3e..ea605ffd0 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -151,7 +151,7 @@ static inline u32 free_space_bitmap_size(u64 size, u32 sectorsize) return DIV_ROUND_UP((u32)div_u64(size, sectorsize), BITS_PER_BYTE); } -static unsigned long *alloc_bitmap(u32 bitmap_size) +static u8 *alloc_bitmap(u32 bitmap_size) { void *mem; @@ -180,8 +180,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, struct btrfs_free_space_info *info; struct btrfs_key key, found_key; struct extent_buffer *leaf; - unsigned long *bitmap; - char *bitmap_cursor; + u8 *bitmap, *bitmap_cursor; u64 start, end; u64 bitmap_range, i; u32 bitmap_size, flags, expected_extent_count; @@ -231,7 +230,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, block_group->sectorsize); last = div_u64(found_key.objectid + found_key.offset - start, block_group->sectorsize); - bitmap_set(bitmap, first, last - first); + le_bitmap_set(bitmap, first, last - first); extent_count++; nr++; @@ -269,7 +268,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, goto out; } - bitmap_cursor = (char *)bitmap; + bitmap_cursor = bitmap; bitmap_range = block_group->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS; i = start; while (i < end) { @@ -318,7 +317,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, struct btrfs_free_space_info *info; struct btrfs_key key, found_key; struct extent_buffer *leaf; - unsigned long *bitmap; + u8 *bitmap; u64 start, end; /* Initialize to silence GCC. */ u64 extent_start = 0; @@ -362,7 +361,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, break; } else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) { unsigned long ptr; - char *bitmap_cursor; + u8 *bitmap_cursor; u32 bitmap_pos, data_size; ASSERT(found_key.objectid >= start); @@ -372,7 +371,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, bitmap_pos = div_u64(found_key.objectid - start, block_group->sectorsize * BITS_PER_BYTE); - bitmap_cursor = ((char *)bitmap) + bitmap_pos; + bitmap_cursor = bitmap + bitmap_pos; data_size = free_space_bitmap_size(found_key.offset, block_group->sectorsize); @@ -409,7 +408,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, offset = start; bitnr = 0; while (offset < end) { - bit = !!test_bit(bitnr, bitmap); + bit = !!le_test_bit(bitnr, bitmap); if (prev_bit == 0 && bit == 1) { extent_start = offset; } else if (prev_bit == 1 && bit == 0) { @@ -1183,6 +1182,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) } btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE); + btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); fs_info->creating_free_space_tree = 0; ret = btrfs_commit_transaction(trans, tree_root); @@ -1251,6 +1251,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) return PTR_ERR(trans); btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE); + btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); fs_info->free_space_root = NULL; ret = clear_free_space_tree(trans, free_space_root); diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index ce5f345d7..e7f16a77a 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -253,6 +253,8 @@ static void cachefiles_drop_object(struct fscache_object *_object) struct cachefiles_object *object; struct cachefiles_cache *cache; const struct cred *saved_cred; + struct inode *inode; + blkcnt_t i_blocks = 0; ASSERT(_object); @@ -279,6 +281,10 @@ static void cachefiles_drop_object(struct fscache_object *_object) _object != cache->cache.fsdef ) { _debug("- retire object OBJ%x", object->fscache.debug_id); + inode = d_backing_inode(object->dentry); + if (inode) + i_blocks = inode->i_blocks; + cachefiles_begin_secure(cache, &saved_cred); cachefiles_delete_object(cache, object); cachefiles_end_secure(cache, saved_cred); @@ -292,7 +298,7 @@ static void cachefiles_drop_object(struct fscache_object *_object) /* note that the object is now inactive */ if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) - cachefiles_mark_object_inactive(cache, object); + cachefiles_mark_object_inactive(cache, object, i_blocks); dput(object->dentry); object->dentry = NULL; diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 2fcde1a34..cd1effee8 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -160,7 +160,8 @@ extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type); * namei.c */ extern void cachefiles_mark_object_inactive(struct cachefiles_cache *cache, - struct cachefiles_object *object); + struct cachefiles_object *object, + blkcnt_t i_blocks); extern int cachefiles_delete_object(struct cachefiles_cache *cache, struct cachefiles_object *object); extern int cachefiles_walk_to_object(struct cachefiles_object *parent, diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 3f7c2cd41..c6ee4b5fb 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -261,10 +261,9 @@ requeue: * Mark an object as being inactive. */ void cachefiles_mark_object_inactive(struct cachefiles_cache *cache, - struct cachefiles_object *object) + struct cachefiles_object *object, + blkcnt_t i_blocks) { - blkcnt_t i_blocks = d_backing_inode(object->dentry)->i_blocks; - write_lock(&cache->active_lock); rb_erase(&object->active_node, &cache->active_nodes); clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); @@ -707,7 +706,8 @@ mark_active_timed_out: check_error: _debug("check error %d", ret); - cachefiles_mark_object_inactive(cache, object); + cachefiles_mark_object_inactive( + cache, object, d_backing_inode(object->dentry)->i_blocks); release_dentry: dput(object->dentry); object->dentry = NULL; diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 592059f88..309f4e9b2 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -97,9 +97,6 @@ EXPORT_SYMBOL_GPL(debugfs_use_file_finish); #define F_DENTRY(filp) ((filp)->f_path.dentry) -#define REAL_FOPS_DEREF(dentry) \ - ((const struct file_operations *)(dentry)->d_fsdata) - static int open_proxy_open(struct inode *inode, struct file *filp) { const struct dentry *dentry = F_DENTRY(filp); @@ -112,7 +109,7 @@ static int open_proxy_open(struct inode *inode, struct file *filp) goto out; } - real_fops = REAL_FOPS_DEREF(dentry); + real_fops = debugfs_real_fops(filp); real_fops = fops_get(real_fops); if (!real_fops) { /* Huh? Module did not clean up after itself at exit? */ @@ -143,7 +140,7 @@ static ret_type full_proxy_ ## name(proto) \ { \ const struct dentry *dentry = F_DENTRY(filp); \ const struct file_operations *real_fops = \ - REAL_FOPS_DEREF(dentry); \ + debugfs_real_fops(filp); \ int srcu_idx; \ ret_type r; \ \ @@ -176,7 +173,7 @@ static unsigned int full_proxy_poll(struct file *filp, struct poll_table_struct *wait) { const struct dentry *dentry = F_DENTRY(filp); - const struct file_operations *real_fops = REAL_FOPS_DEREF(dentry); + const struct file_operations *real_fops = debugfs_real_fops(filp); int srcu_idx; unsigned int r = 0; @@ -193,7 +190,7 @@ static unsigned int full_proxy_poll(struct file *filp, static int full_proxy_release(struct inode *inode, struct file *filp) { const struct dentry *dentry = F_DENTRY(filp); - const struct file_operations *real_fops = REAL_FOPS_DEREF(dentry); + const struct file_operations *real_fops = debugfs_real_fops(filp); const struct file_operations *proxy_fops = filp->f_op; int r = 0; @@ -241,7 +238,7 @@ static int full_proxy_open(struct inode *inode, struct file *filp) goto out; } - real_fops = REAL_FOPS_DEREF(dentry); + real_fops = debugfs_real_fops(filp); real_fops = fops_get(real_fops); if (!real_fops) { /* Huh? Module did not cleanup after itself at exit? */ diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 963016c8f..609998de5 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1656,16 +1656,12 @@ void dlm_lowcomms_stop(void) mutex_lock(&connections_lock); dlm_allow_conn = 0; foreach_conn(stop_conn); + clean_writequeues(); + foreach_conn(free_conn); mutex_unlock(&connections_lock); work_stop(); - mutex_lock(&connections_lock); - clean_writequeues(); - - foreach_conn(free_conn); - - mutex_unlock(&connections_lock); kmem_cache_destroy(con_cache); } diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index d7ccb7f51..7f69347bd 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5734,6 +5734,9 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } + } else { + ext4_ext_drop_refs(path); + kfree(path); } ret = ext4_es_remove_extent(inode, offset_lblk, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c6ea25a19..f4cdc647e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -647,11 +647,19 @@ found: /* * We have to zeroout blocks before inserting them into extent * status tree. Otherwise someone could look them up there and - * use them before they are really zeroed. + * use them before they are really zeroed. We also have to + * unmap metadata before zeroing as otherwise writeback can + * overwrite zeros with stale data from block device. */ if (flags & EXT4_GET_BLOCKS_ZERO && map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) { + ext4_lblk_t i; + + for (i = 0; i < map->m_len; i++) { + unmap_underlying_metadata(inode->i_sb->s_bdev, + map->m_pblk + i); + } ret = ext4_issue_zeroout(inode, map->m_lblk, map->m_pblk, map->m_len); if (ret) { @@ -1649,6 +1657,8 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); if (invalidate) { + if (page_mapped(page)) + clear_page_dirty_for_io(page); block_invalidatepage(page, 0, PAGE_SIZE); ClearPageUptodate(page); } @@ -3890,7 +3900,7 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, } /* - * ext4_punch_hole: punches a hole in a file by releaseing the blocks + * ext4_punch_hole: punches a hole in a file by releasing the blocks * associated with the given offset and length * * @inode: File inode @@ -3919,7 +3929,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) * Write out all dirty pages to avoid race conditions * Then release them. */ - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { ret = filemap_write_and_wait_range(mapping, offset, offset + length - 1); if (ret) @@ -4814,14 +4824,14 @@ static int ext4_do_update_inode(handle_t *handle, * Fix up interoperability with old kernels. Otherwise, old inodes get * re-used with the upper 16 bits of the uid/gid intact */ - if (!ei->i_dtime) { + if (ei->i_dtime && list_empty(&ei->i_orphan)) { + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } else { raw_inode->i_uid_high = cpu_to_le16(high_16_bits(i_uid)); raw_inode->i_gid_high = cpu_to_le16(high_16_bits(i_gid)); - } else { - raw_inode->i_uid_high = 0; - raw_inode->i_gid_high = 0; } } else { raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid)); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index a920c5d29..6fc14def0 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -598,6 +598,13 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, return -EOPNOTSUPP; } + if (ext4_encrypted_inode(orig_inode) || + ext4_encrypted_inode(donor_inode)) { + ext4_msg(orig_inode->i_sb, KERN_ERR, + "Online defrag not supported for encrypted files"); + return -EOPNOTSUPP; + } + /* Protect orig and donor inodes against a truncate */ lock_two_nondirectories(orig_inode, donor_inode); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 34c0142ca..7e2f8c3c1 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2044,33 +2044,31 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, frame->entries = entries; frame->at = entries; frame->bh = bh; - bh = bh2; retval = ext4_handle_dirty_dx_node(handle, dir, frame->bh); if (retval) goto out_frames; - retval = ext4_handle_dirty_dirent_node(handle, dir, bh); + retval = ext4_handle_dirty_dirent_node(handle, dir, bh2); if (retval) goto out_frames; - de = do_split(handle,dir, &bh, frame, &fname->hinfo); + de = do_split(handle,dir, &bh2, frame, &fname->hinfo); if (IS_ERR(de)) { retval = PTR_ERR(de); goto out_frames; } - dx_release(frames); - retval = add_dirent_to_buf(handle, fname, dir, inode, de, bh); - brelse(bh); - return retval; + retval = add_dirent_to_buf(handle, fname, dir, inode, de, bh2); out_frames: /* * Even if the block split failed, we have to properly write * out all the changes we did so far. Otherwise we can end up * with corrupted filesystem. */ - ext4_mark_inode_dirty(handle, dir); + if (retval) + ext4_mark_inode_dirty(handle, dir); dx_release(frames); + brelse(bh2); return retval; } diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 4d83d9e05..04a7850a0 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -65,13 +65,12 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry, res = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr); if (res) goto errout; + paddr = pstr.name; res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); if (res < 0) goto errout; - paddr = pstr.name; - /* Null-terminate the name */ if (res <= pstr.len) paddr[res] = '\0'; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index c47b7780c..4ff9251e9 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1702,14 +1702,46 @@ error: static int fuse_setattr(struct dentry *entry, struct iattr *attr) { struct inode *inode = d_inode(entry); + struct file *file = (attr->ia_valid & ATTR_FILE) ? attr->ia_file : NULL; + int ret; if (!fuse_allow_current_process(get_fuse_conn(inode))) return -EACCES; - if (attr->ia_valid & ATTR_FILE) - return fuse_do_setattr(inode, attr, attr->ia_file); - else - return fuse_do_setattr(inode, attr, NULL); + if (attr->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) { + int kill; + + attr->ia_valid &= ~(ATTR_KILL_SUID | ATTR_KILL_SGID | + ATTR_MODE); + /* + * ia_mode calculation may have used stale i_mode. Refresh and + * recalculate. + */ + ret = fuse_do_getattr(inode, NULL, file); + if (ret) + return ret; + + attr->ia_mode = inode->i_mode; + kill = should_remove_suid(entry); + if (kill & ATTR_KILL_SUID) { + attr->ia_valid |= ATTR_MODE; + attr->ia_mode &= ~S_ISUID; + } + if (kill & ATTR_KILL_SGID) { + attr->ia_valid |= ATTR_MODE; + attr->ia_mode &= ~S_ISGID; + } + } + if (!attr->ia_valid) + return 0; + + ret = fuse_do_setattr(inode, attr, file); + if (!ret) { + /* Directory mode changed, may need to revalidate access */ + if (d_is_dir(entry) && (attr->ia_valid & ATTR_MODE)) + fuse_invalidate_entry_cache(entry); + } + return ret; } static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry, @@ -1801,6 +1833,23 @@ static ssize_t fuse_getxattr(struct dentry *entry, struct inode *inode, return ret; } +static int fuse_verify_xattr_list(char *list, size_t size) +{ + size_t origsize = size; + + while (size) { + size_t thislen = strnlen(list, size); + + if (!thislen || thislen == size) + return -EIO; + + size -= thislen + 1; + list += thislen + 1; + } + + return origsize; +} + static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) { struct inode *inode = d_inode(entry); @@ -1836,6 +1885,8 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) ret = fuse_simple_request(fc, &args); if (!ret && !size) ret = outarg.size; + if (ret > 0 && size) + ret = fuse_verify_xattr_list(list, ret); if (ret == -ENOSYS) { fc->no_listxattr = 1; ret = -EOPNOTSUPP; diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index b5bc3e249..3d8246a9f 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -159,6 +159,7 @@ static void wait_transaction_locked(journal_t *journal) read_unlock(&journal->j_state_lock); if (need_to_start) jbd2_log_start_commit(journal, tid); + jbd2_might_wait_for_commit(journal); schedule(); finish_wait(&journal->j_wait_transaction_locked, &wait); } @@ -182,8 +183,6 @@ static int add_transaction_credits(journal_t *journal, int blocks, int needed; int total = blocks + rsv_blocks; - jbd2_might_wait_for_commit(journal); - /* * If the current transaction is locked down for commit, wait * for the lock to be released. @@ -214,6 +213,7 @@ static int add_transaction_credits(journal_t *journal, int blocks, if (atomic_read(&journal->j_reserved_credits) + total > journal->j_max_transaction_buffers) { read_unlock(&journal->j_state_lock); + jbd2_might_wait_for_commit(journal); wait_event(journal->j_wait_reserved, atomic_read(&journal->j_reserved_credits) + total <= journal->j_max_transaction_buffers); @@ -238,6 +238,7 @@ static int add_transaction_credits(journal_t *journal, int blocks, if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) { atomic_sub(total, &t->t_outstanding_credits); read_unlock(&journal->j_state_lock); + jbd2_might_wait_for_commit(journal); write_lock(&journal->j_state_lock); if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) __jbd2_log_wait_for_space(journal); @@ -255,6 +256,7 @@ static int add_transaction_credits(journal_t *journal, int blocks, sub_reserved_credits(journal, rsv_blocks); atomic_sub(total, &t->t_outstanding_credits); read_unlock(&journal->j_state_lock); + jbd2_might_wait_for_commit(journal); wait_event(journal->j_wait_reserved, atomic_read(&journal->j_reserved_credits) + rsv_blocks <= journal->j_max_transaction_buffers / 2); diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 7a4a85a68..74d5ddd26 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -190,7 +190,15 @@ static int remove_save_link_only(struct super_block *s, static int reiserfs_quota_on_mount(struct super_block *, int); #endif -/* look for uncompleted unlinks and truncates and complete them */ +/* + * Look for uncompleted unlinks and truncates and complete them + * + * Called with superblock write locked. If quotas are enabled, we have to + * release/retake lest we call dquot_quota_on_mount(), proceed to + * schedule_on_each_cpu() in invalidate_bdev() and deadlock waiting for the per + * cpu worklets to complete flush_async_commits() that in turn wait for the + * superblock write lock. + */ static int finish_unfinished(struct super_block *s) { INITIALIZE_PATH(path); @@ -237,7 +245,9 @@ static int finish_unfinished(struct super_block *s) quota_enabled[i] = 0; continue; } + reiserfs_write_unlock(s); ret = reiserfs_quota_on_mount(s, i); + reiserfs_write_lock(s); if (ret < 0) reiserfs_warning(s, "reiserfs-2500", "cannot turn on journaled " diff --git a/fs/utimes.c b/fs/utimes.c index 794f5f5b1..ba54b9e64 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -87,21 +87,7 @@ static int utimes_common(struct path *path, struct timespec *times) */ newattrs.ia_valid |= ATTR_TIMES_SET; } else { - /* - * If times is NULL (or both times are UTIME_NOW), - * then we need to check permissions, because - * inode_change_ok() won't do it. - */ - error = -EPERM; - if (IS_IMMUTABLE(inode)) - goto mnt_drop_write_and_out; - - error = -EACCES; - if (!inode_owner_or_capable(inode)) { - error = inode_permission(inode, MAY_WRITE); - if (error) - goto mnt_drop_write_and_out; - } + newattrs.ia_valid |= ATTR_TOUCH; } retry_deleg: inode_lock(inode); @@ -113,7 +99,6 @@ retry_deleg: goto retry_deleg; } -mnt_drop_write_and_out: mnt_drop_write(path->mnt); out: return error; diff --git a/include/crypto/ghash.h b/include/crypto/ghash.h new file mode 100644 index 000000000..2a61c9bba --- /dev/null +++ b/include/crypto/ghash.h @@ -0,0 +1,23 @@ +/* + * Common values for GHASH algorithms + */ + +#ifndef __CRYPTO_GHASH_H__ +#define __CRYPTO_GHASH_H__ + +#include <linux/types.h> +#include <crypto/gf128mul.h> + +#define GHASH_BLOCK_SIZE 16 +#define GHASH_DIGEST_SIZE 16 + +struct ghash_ctx { + struct gf128mul_4k *gf128; +}; + +struct ghash_desc_ctx { + u8 buffer[GHASH_BLOCK_SIZE]; + u32 bytes; +}; + +#endif diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index 1438e2322..4d3f0d1ae 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -45,6 +45,23 @@ extern struct dentry *arch_debugfs_dir; extern struct srcu_struct debugfs_srcu; +/** + * debugfs_real_fops - getter for the real file operation + * @filp: a pointer to a struct file + * + * Must only be called under the protection established by + * debugfs_use_file_start(). + */ +static inline const struct file_operations *debugfs_real_fops(struct file *filp) + __must_hold(&debugfs_srcu) +{ + /* + * Neither the pointer to the struct file_operations, nor its + * contents ever change -- srcu_dereference() is not needed here. + */ + return filp->f_path.dentry->d_fsdata; +} + #if defined(CONFIG_DEBUG_FS) struct dentry *debugfs_create_file(const char *name, umode_t mode, diff --git a/include/linux/fs.h b/include/linux/fs.h index 3b3703885..6f180afdd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -227,6 +227,7 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define ATTR_KILL_PRIV (1 << 14) #define ATTR_OPEN (1 << 15) /* Truncating from open(O_TRUNC) */ #define ATTR_TIMES_SET (1 << 16) +#define ATTR_TOUCH (1 << 17) /* * Whiteout is represented by a char device. The following constants define the diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 4c45105de..52b97db93 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -280,9 +280,9 @@ bool __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *node); void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); void *radix_tree_delete(struct radix_tree_root *, unsigned long); -struct radix_tree_node *radix_tree_replace_clear_tags( - struct radix_tree_root *root, - unsigned long index, void *entry); +void radix_tree_clear_tags(struct radix_tree_root *root, + struct radix_tree_node *node, + void **slot); unsigned int radix_tree_gang_lookup(struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items); diff --git a/include/linux/sem.h b/include/linux/sem.h index 976ce3a19..d0efd6e6c 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -21,6 +21,7 @@ struct sem_array { struct list_head list_id; /* undo requests on this array */ int sem_nsems; /* no. of semaphores in array */ int complex_count; /* pending complex operations */ + bool complex_mode; /* no parallel simple ops */ }; #ifdef CONFIG_SYSVIPC diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index ac5eacd30..db4c253f8 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -239,7 +239,17 @@ struct btrfs_ioctl_fs_info_args { * Used by: * struct btrfs_ioctl_feature_flags */ -#define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE (1ULL << 0) +#define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE (1ULL << 0) +/* + * Older kernels (< 4.9) on big-endian systems produced broken free space tree + * bitmaps, and btrfs-progs also used to corrupt the free space tree (versions + * < 4.7.3). If this bit is clear, then the free space tree cannot be trusted. + * btrfs-progs can also intentionally clear this bit to ask the kernel to + * rebuild the free space tree, however this might not work on older kernels + * that do not know about this bit. If not sure, clear the cache manually on + * first mount when booting older kernel versions. + */ +#define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID (1ULL << 1) #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) @@ -162,14 +162,21 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it); /* * Locking: + * a) global sem_lock() for read/write * sem_undo.id_next, * sem_array.complex_count, - * sem_array.pending{_alter,_cont}, - * sem_array.sem_undo: global sem_lock() for read/write - * sem_undo.proc_next: only "current" is allowed to read/write that field. + * sem_array.complex_mode + * sem_array.pending{_alter,_const}, + * sem_array.sem_undo * + * b) global or semaphore sem_lock() for read/write: * sem_array.sem_base[i].pending_{const,alter}: - * global or semaphore sem_lock() for read/write + * sem_array.complex_mode (for read) + * + * c) special: + * sem_undo_list.list_proc: + * * undo_list->lock for write + * * rcu for read */ #define sc_semmsl sem_ctls[0] @@ -260,30 +267,61 @@ static void sem_rcu_free(struct rcu_head *head) } /* - * Wait until all currently ongoing simple ops have completed. + * Enter the mode suitable for non-simple operations: * Caller must own sem_perm.lock. - * New simple ops cannot start, because simple ops first check - * that sem_perm.lock is free. - * that a) sem_perm.lock is free and b) complex_count is 0. */ -static void sem_wait_array(struct sem_array *sma) +static void complexmode_enter(struct sem_array *sma) { int i; struct sem *sem; - if (sma->complex_count) { - /* The thread that increased sma->complex_count waited on - * all sem->lock locks. Thus we don't need to wait again. - */ + if (sma->complex_mode) { + /* We are already in complex_mode. Nothing to do */ return; } + /* We need a full barrier after seting complex_mode: + * The write to complex_mode must be visible + * before we read the first sem->lock spinlock state. + */ + smp_store_mb(sma->complex_mode, true); + for (i = 0; i < sma->sem_nsems; i++) { sem = sma->sem_base + i; spin_unlock_wait(&sem->lock); } + /* + * spin_unlock_wait() is not a memory barriers, it is only a + * control barrier. The code must pair with spin_unlock(&sem->lock), + * thus just the control barrier is insufficient. + * + * smp_rmb() is sufficient, as writes cannot pass the control barrier. + */ + smp_rmb(); +} + +/* + * Try to leave the mode that disallows simple operations: + * Caller must own sem_perm.lock. + */ +static void complexmode_tryleave(struct sem_array *sma) +{ + if (sma->complex_count) { + /* Complex ops are sleeping. + * We must stay in complex mode + */ + return; + } + /* + * Immediately after setting complex_mode to false, + * a simple op can start. Thus: all memory writes + * performed by the current operation must be visible + * before we set complex_mode to false. + */ + smp_store_release(&sma->complex_mode, false); } +#define SEM_GLOBAL_LOCK (-1) /* * If the request contains only one semaphore operation, and there are * no complex transactions pending, lock only the semaphore involved. @@ -300,56 +338,42 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops, /* Complex operation - acquire a full lock */ ipc_lock_object(&sma->sem_perm); - /* And wait until all simple ops that are processed - * right now have dropped their locks. - */ - sem_wait_array(sma); - return -1; + /* Prevent parallel simple ops */ + complexmode_enter(sma); + return SEM_GLOBAL_LOCK; } /* * Only one semaphore affected - try to optimize locking. - * The rules are: - * - optimized locking is possible if no complex operation - * is either enqueued or processed right now. - * - The test for enqueued complex ops is simple: - * sma->complex_count != 0 - * - Testing for complex ops that are processed right now is - * a bit more difficult. Complex ops acquire the full lock - * and first wait that the running simple ops have completed. - * (see above) - * Thus: If we own a simple lock and the global lock is free - * and complex_count is now 0, then it will stay 0 and - * thus just locking sem->lock is sufficient. + * Optimized locking is possible if no complex operation + * is either enqueued or processed right now. + * + * Both facts are tracked by complex_mode. */ sem = sma->sem_base + sops->sem_num; - if (sma->complex_count == 0) { + /* + * Initial check for complex_mode. Just an optimization, + * no locking, no memory barrier. + */ + if (!sma->complex_mode) { /* * It appears that no complex operation is around. * Acquire the per-semaphore lock. */ spin_lock(&sem->lock); - /* Then check that the global lock is free */ - if (!spin_is_locked(&sma->sem_perm.lock)) { - /* - * We need a memory barrier with acquire semantics, - * otherwise we can race with another thread that does: - * complex_count++; - * spin_unlock(sem_perm.lock); - */ - smp_acquire__after_ctrl_dep(); + /* + * See 51d7d5205d33 + * ("powerpc: Add smp_mb() to arch_spin_is_locked()"): + * A full barrier is required: the write of sem->lock + * must be visible before the read is executed + */ + smp_mb(); - /* - * Now repeat the test of complex_count: - * It can't change anymore until we drop sem->lock. - * Thus: if is now 0, then it will stay 0. - */ - if (sma->complex_count == 0) { - /* fast path successful! */ - return sops->sem_num; - } + if (!smp_load_acquire(&sma->complex_mode)) { + /* fast path successful! */ + return sops->sem_num; } spin_unlock(&sem->lock); } @@ -369,15 +393,16 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops, /* Not a false alarm, thus complete the sequence for a * full lock. */ - sem_wait_array(sma); - return -1; + complexmode_enter(sma); + return SEM_GLOBAL_LOCK; } } static inline void sem_unlock(struct sem_array *sma, int locknum) { - if (locknum == -1) { + if (locknum == SEM_GLOBAL_LOCK) { unmerge_queues(sma); + complexmode_tryleave(sma); ipc_unlock_object(&sma->sem_perm); } else { struct sem *sem = sma->sem_base + locknum; @@ -529,6 +554,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params) } sma->complex_count = 0; + sma->complex_mode = true; /* dropped by sem_unlock below */ INIT_LIST_HEAD(&sma->pending_alter); INIT_LIST_HEAD(&sma->pending_const); INIT_LIST_HEAD(&sma->list_id); @@ -2184,10 +2210,10 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it) /* * The proc interface isn't aware of sem_lock(), it calls * ipc_lock_object() directly (in sysvipc_find_ipc). - * In order to stay compatible with sem_lock(), we must wait until - * all simple semop() calls have left their critical regions. + * In order to stay compatible with sem_lock(), we must + * enter / leave complex_mode. */ - sem_wait_array(sma); + complexmode_enter(sma); sem_otime = get_semotime(sma); @@ -2204,6 +2230,8 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it) sem_otime, sma->sem_ctime); + complexmode_tryleave(sma); + return 0; } #endif diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c index 6a656ad4b..bd9b1f13f 100644 --- a/kernel/sched/MuQSS.c +++ b/kernel/sched/MuQSS.c @@ -135,7 +135,7 @@ void print_scheduler_version(void) { - printk(KERN_INFO "MuQSS CPU scheduler v0.114 by Con Kolivas.\n"); + printk(KERN_INFO "MuQSS CPU scheduler v0.115 by Con Kolivas.\n"); } /* @@ -179,28 +179,9 @@ static inline int timeslice(void) return MS_TO_US(rr_interval); } -/* - * The global runqueue data that all CPUs work off. Contains either atomic - * variables and a cpu bitmap set atomically. - */ -struct global_rq { #ifdef CONFIG_SMP - atomic_t nr_running ____cacheline_aligned_in_smp; - atomic_t nr_uninterruptible ____cacheline_aligned_in_smp; - atomic64_t nr_switches ____cacheline_aligned_in_smp; - atomic_t qnr ____cacheline_aligned_in_smp; /* queued not running */ -#else - atomic_t nr_running ____cacheline_aligned; - atomic_t nr_uninterruptible ____cacheline_aligned; - atomic64_t nr_switches ____cacheline_aligned; - atomic_t qnr ____cacheline_aligned; /* queued not running */ -#endif -#ifdef CONFIG_SMP - cpumask_t cpu_idle_map; -#endif -}; +static cpumask_t cpu_idle_map ____cacheline_aligned_in_smp; -#ifdef CONFIG_SMP /* * We add the notion of a root-domain which will be used to define per-domain * variables. Each exclusive cpuset essentially defines an island domain by @@ -232,13 +213,6 @@ static struct root_domain def_root_domain; #endif /* CONFIG_SMP */ -/* There can be only one */ -#ifdef CONFIG_SMP -static struct global_rq grq ____cacheline_aligned_in_smp; -#else -static struct global_rq grq ____cacheline_aligned; -#endif - static DEFINE_MUTEX(sched_hotcpu_mutex); /* cpus with isolated domains */ @@ -710,6 +684,12 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) next->on_cpu = 1; } +static inline void smp_sched_reschedule(int cpu) +{ + if (likely(cpu_online(cpu))) + smp_send_reschedule(cpu); +} + /* * resched_task - mark a task 'to be rescheduled now'. * @@ -736,7 +716,7 @@ void resched_task(struct task_struct *p) } if (set_nr_and_not_polling(p)) - smp_send_reschedule(cpu); + smp_sched_reschedule(cpu); else trace_sched_wake_idle_without_ipi(cpu); } @@ -788,6 +768,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) */ if (unlikely(task_on_rq_migrating(prev))) { sched_info_dequeued(rq, prev); + rq->nr_running--; /* * We move the ownership of prev to the new cpu now. ttwu can't * activate prev to the wrong cpu since it has to grab this @@ -798,6 +779,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) raw_spin_lock(&prev->pi_lock); rq = __task_rq_lock(prev); + rq->nr_running++; /* Check that someone else hasn't already queued prev */ if (likely(!task_queued(prev))) { enqueue_task(rq, prev, 0); @@ -852,7 +834,7 @@ static inline int ms_longest_deadline_diff(void) static inline int rq_load(struct rq *rq) { - return rq->sl->entries + !rq_idle(rq); + return rq->nr_running; } static inline bool rq_local(struct rq *rq); @@ -999,26 +981,6 @@ static inline int task_timeslice(struct task_struct *p) return (rr_interval * task_prio_ratio(p) / 128); } -/* - * qnr is the "queued but not running" count which is the total number of - * tasks on the global runqueue list waiting for cpu time but not actually - * currently running on a cpu. - */ -static inline void inc_qnr(void) -{ - atomic_inc(&grq.qnr); -} - -static inline void dec_qnr(void) -{ - atomic_dec(&grq.qnr); -} - -static inline int queued_notrunning(void) -{ - return atomic_read(&grq.qnr); -} - #ifdef CONFIG_SMP /* Entered with rq locked */ static inline void resched_if_idle(struct rq *rq) @@ -1123,7 +1085,7 @@ static inline void atomic_set_cpu(int cpu, cpumask_t *cpumask) static inline void set_cpuidle_map(int cpu) { if (likely(cpu_online(cpu))) - atomic_set_cpu(cpu, &grq.cpu_idle_map); + atomic_set_cpu(cpu, &cpu_idle_map); } static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask) @@ -1133,12 +1095,12 @@ static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask) static inline void clear_cpuidle_map(int cpu) { - atomic_clear_cpu(cpu, &grq.cpu_idle_map); + atomic_clear_cpu(cpu, &cpu_idle_map); } static bool suitable_idle_cpus(struct task_struct *p) { - return (cpumask_intersects(&p->cpus_allowed, &grq.cpu_idle_map)); + return (cpumask_intersects(&p->cpus_allowed, &cpu_idle_map)); } /* @@ -1165,7 +1127,7 @@ static void resched_curr(struct rq *rq) } if (set_nr_and_not_polling(rq->curr)) - smp_send_reschedule(cpu); + smp_sched_reschedule(cpu); else trace_sched_wake_idle_without_ipi(cpu); } @@ -1260,7 +1222,7 @@ static inline void resched_idle(struct rq *rq) return; } - smp_send_reschedule(rq->cpu); + smp_sched_reschedule(rq->cpu); } static struct rq *resched_best_idle(struct task_struct *p, int cpu) @@ -1269,7 +1231,7 @@ static struct rq *resched_best_idle(struct task_struct *p, int cpu) struct rq *rq; int best_cpu; - cpumask_and(&tmpmask, &p->cpus_allowed, &grq.cpu_idle_map); + cpumask_and(&tmpmask, &p->cpus_allowed, &cpu_idle_map); best_cpu = best_mask_cpu(cpu, task_rq(p), &tmpmask); rq = cpu_rq(best_cpu); if (!smt_schedule(p, rq)) @@ -1381,12 +1343,11 @@ static void activate_task(struct task_struct *p, struct rq *rq) p->prio = effective_prio(p); if (task_contributes_to_load(p)) - atomic_dec(&grq.nr_uninterruptible); + rq->nr_uninterruptible--; enqueue_task(rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; - atomic_inc(&grq.nr_running); - inc_qnr(); + rq->nr_running++; } /* @@ -1396,10 +1357,10 @@ static void activate_task(struct task_struct *p, struct rq *rq) static inline void deactivate_task(struct task_struct *p, struct rq *rq) { if (task_contributes_to_load(p)) - atomic_inc(&grq.nr_uninterruptible); + rq->nr_uninterruptible++; p->on_rq = 0; - atomic_dec(&grq.nr_running); + rq->nr_running--; sched_info_dequeued(rq, p); } @@ -1467,11 +1428,12 @@ static inline void take_task(struct rq *rq, int cpu, struct task_struct *p) dequeue_task(p_rq, p, DEQUEUE_SAVE); if (p_rq != rq) { + p_rq->nr_running--; sched_info_dequeued(p_rq, p); + rq->nr_running++; sched_info_queued(rq, p); } set_task_cpu(p, cpu); - dec_qnr(); } /* @@ -1484,7 +1446,6 @@ static inline void return_task(struct task_struct *p, struct rq *rq, if (deactivate) deactivate_task(p, rq); else { - inc_qnr(); #ifdef CONFIG_SMP /* * set_task_cpu was called on the running task that doesn't @@ -1641,7 +1602,7 @@ void kick_process(struct task_struct *p) preempt_disable(); cpu = task_cpu(p); if ((cpu != smp_processor_id()) && task_curr(p)) - smp_send_reschedule(cpu); + smp_sched_reschedule(cpu); preempt_enable(); } EXPORT_SYMBOL_GPL(kick_process); @@ -1806,7 +1767,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) #ifdef CONFIG_SMP if (p->sched_contributes_to_load) - atomic_dec(&grq.nr_uninterruptible); + rq->nr_uninterruptible--; #endif ttwu_activate(rq, p); @@ -1897,7 +1858,7 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { if (!set_nr_if_polling(rq->idle)) - smp_send_reschedule(cpu); + smp_sched_reschedule(cpu); else trace_sched_wake_idle_without_ipi(cpu); } @@ -1918,7 +1879,7 @@ void wake_up_if_idle(int cpu) } else { rq_lock_irqsave(rq, &flags); if (likely(is_idle_task(rq->curr))) - smp_send_reschedule(cpu); + smp_sched_reschedule(cpu); /* Else cpu is not in idle, do nothing here */ rq_unlock_irqrestore(rq, &flags); } @@ -1980,7 +1941,7 @@ static inline int select_best_cpu(struct task_struct *p) rq = other_rq; } if (unlikely(!rq)) - return smp_processor_id(); + return task_cpu(p); return rq->cpu; } #else /* CONFIG_SMP */ @@ -2690,22 +2651,6 @@ context_switch(struct rq *rq, struct task_struct *prev, } /* - * nr_running, nr_uninterruptible and nr_context_switches: - * - * externally visible scheduler statistics: current number of runnable - * threads, total number of context switches performed since bootup. - */ -unsigned long nr_running(void) -{ - return atomic_read(&grq.nr_running); -} - -static unsigned long nr_uninterruptible(void) -{ - return atomic_read(&grq.nr_uninterruptible); -} - -/* * Check if only the current task is running on the cpu. * * Caution: this function does not check that the caller has disabled @@ -2729,9 +2674,31 @@ bool single_task_running(void) } EXPORT_SYMBOL(single_task_running); +/* + * nr_running, nr_uninterruptible and nr_context_switches: + * + * externally visible scheduler statistics: current number of runnable + * threads, total number of context switches performed since bootup. + */ unsigned long long nr_context_switches(void) { - return (unsigned long long)atomic64_read(&grq.nr_switches); + long long sum = 0; + int i; + + for_each_possible_cpu(i) + sum += cpu_rq(i)->nr_switches; + + return sum; +} + +unsigned long nr_running(void) +{ + long i, sum = 0; + + for_each_online_cpu(i) + sum += cpu_rq(i)->nr_running; + + return sum; } unsigned long nr_iowait(void) @@ -2752,7 +2719,14 @@ unsigned long nr_iowait_cpu(int cpu) unsigned long nr_active(void) { - return nr_running() + nr_uninterruptible(); + long i, sum = 0; + + for_each_online_cpu(i) { + sum += cpu_rq(i)->nr_running; + sum += cpu_rq(i)->nr_uninterruptible; + } + + return sum; } /* @@ -3662,8 +3636,6 @@ static inline void check_deadline(struct task_struct *p, struct rq *rq) time_slice_expired(p, rq); } -#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) - /* * Task selection with skiplists is a simple matter of picking off the first * task in the sorted list, an O(1) operation. The lookup is amortised O(1) @@ -3680,8 +3652,9 @@ static inline void check_deadline(struct task_struct *p, struct rq *rq) * runqueue or a runqueue with more tasks than the current one with a better * key/deadline. */ -static inline struct -task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) +#ifdef CONFIG_SMP +static inline struct task_struct +*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) { struct task_struct *edt = idle; struct rq *locked = NULL; @@ -3759,6 +3732,19 @@ task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct * return edt; } +#else /* CONFIG_SMP */ +static inline struct task_struct +*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) +{ + struct task_struct *edt; + + if (unlikely(!rq->sl->entries)) + return idle; + edt = rq->node.next[0]->value; + take_task(rq, cpu, edt); + return edt; +} +#endif /* CONFIG_SMP */ /* * Print scheduling while atomic bug: @@ -3840,13 +3826,9 @@ static void check_smt_siblings(struct rq *this_rq) rq = cpu_rq(other_cpu); if (rq_idle(rq)) continue; - if (unlikely(!rq->online)) - continue; p = rq->curr; - if (!smt_schedule(p, this_rq)) { - set_tsk_need_resched(p); - smp_send_reschedule(other_cpu); - } + if (!smt_schedule(p, this_rq)) + resched_curr(rq); } } @@ -3854,21 +3836,12 @@ static void wake_smt_siblings(struct rq *this_rq) { int other_cpu; - if (!queued_notrunning()) - return; - for_each_cpu(other_cpu, &this_rq->thread_mask) { struct rq *rq; rq = cpu_rq(other_cpu); - if (unlikely(!rq->online)) - continue; - if (rq_idle(rq)) { - struct task_struct *p = rq->curr; - - set_tsk_need_resched(p); - smp_send_reschedule(other_cpu); - } + if (rq_idle(rq)) + resched_idle(rq); } } #else @@ -4020,23 +3993,16 @@ static void __sched notrace __schedule(bool preempt) return_task(prev, rq, cpu, deactivate); } - if (unlikely(!queued_notrunning())) { - next = idle; - schedstat_inc(rq, sched_goidle); + next = earliest_deadline_task(rq, cpu, idle); + if (likely(next->prio != PRIO_LIMIT)) { + clear_cpuidle_map(cpu); + next->last_ran = niffies; + } else { set_cpuidle_map(cpu); update_load_avg(rq); - } else { - next = earliest_deadline_task(rq, cpu, idle); - if (likely(next->prio != PRIO_LIMIT)) - clear_cpuidle_map(cpu); - else { - set_cpuidle_map(cpu); - update_load_avg(rq); - } } set_rq_task(rq, next); - next->last_ran = niffies; if (likely(prev != next)) { /* @@ -4048,16 +4014,14 @@ static void __sched notrace __schedule(bool preempt) check_siblings(rq); else wake_siblings(rq); - atomic64_inc(&grq.nr_switches); + rq->nr_switches++; rq->curr = next; ++*switch_count; trace_sched_switch(preempt, prev, next); rq = context_switch(rq, prev, next); /* unlocks the rq */ - } else { - check_siblings(rq); + } else rq_unlock_irq(rq); - } } static inline void sched_submit_work(struct task_struct *tsk) @@ -5618,7 +5582,7 @@ void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_ma p->nr_cpus_allowed = cpumask_weight(new_mask); } -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +static void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { struct rq *rq = task_rq(p); @@ -5633,6 +5597,29 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) */ lockdep_assert_held(&rq->lock); } +} + +/* + * Calling do_set_cpus_allowed from outside the scheduler code may make the + * task not be able to run on its current CPU so we resched it here. + */ +void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +{ + __do_set_cpus_allowed(p, new_mask); + if (needs_other_cpu(p, task_cpu(p))) { + set_task_cpu(p, valid_task_cpu(p)); + resched_task(p); + } +} + +/* + * For internal scheduler calls to do_set_cpus_allowed which will resched + * themselves if needed. + */ +static void _do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +{ + __do_set_cpus_allowed(p, new_mask); + /* __set_cpus_allowed_ptr will handle the reschedule in this variant */ if (needs_other_cpu(p, task_cpu(p))) set_task_cpu(p, valid_task_cpu(p)); } @@ -5830,7 +5817,7 @@ void wake_up_idle_cpu(int cpu) return; if (set_nr_and_not_polling(cpu_rq(cpu)->idle)) - smp_send_reschedule(cpu); + smp_sched_reschedule(cpu); else trace_sched_wake_idle_without_ipi(cpu); } @@ -5890,7 +5877,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, queued = task_queued(p); - do_set_cpus_allowed(p, new_mask); + _do_set_cpus_allowed(p, new_mask); if (p->flags & PF_KTHREAD) { /* @@ -7476,7 +7463,7 @@ static const cpumask_t *thread_cpumask(int cpu) /* All this CPU's SMT siblings are idle */ static bool siblings_cpu_idle(struct rq *rq) { - return cpumask_subset(&rq->thread_mask, &grq.cpu_idle_map); + return cpumask_subset(&rq->thread_mask, &cpu_idle_map); } #endif #ifdef CONFIG_SCHED_MC @@ -7487,7 +7474,7 @@ static const cpumask_t *core_cpumask(int cpu) /* All this CPU's shared cache siblings are idle */ static bool cache_cpu_idle(struct rq *rq) { - return cpumask_subset(&rq->core_mask, &grq.cpu_idle_map); + return cpumask_subset(&rq->core_mask, &cpu_idle_map); } #endif @@ -7668,15 +7655,11 @@ void __init sched_init(void) for (i = 1 ; i < NICE_WIDTH ; i++) prio_ratios[i] = prio_ratios[i - 1] * 11 / 10; - atomic_set(&grq.nr_running, 0); - atomic_set(&grq.nr_uninterruptible, 0); - atomic64_set(&grq.nr_switches, 0); skiplist_node_init(&init_task.node); #ifdef CONFIG_SMP init_defrootdomain(); - atomic_set(&grq.qnr, 0); - cpumask_clear(&grq.cpu_idle_map); + cpumask_clear(&cpu_idle_map); #else uprq = &per_cpu(runqueues, 0); #endif @@ -7690,6 +7673,7 @@ void __init sched_init(void) #endif /* CONFIG_CGROUP_SCHED */ for_each_possible_cpu(i) { rq = cpu_rq(i); + rq->nr_running = rq->nr_uninterruptible = rq->nr_switches = 0; skiplist_init(&rq->node); rq->sl = new_skiplist(&rq->node); raw_spin_lock_init(&rq->lock); diff --git a/kernel/sched/MuQSS.h b/kernel/sched/MuQSS.h index 4e3115dac..10a12b335 100644 --- a/kernel/sched/MuQSS.h +++ b/kernel/sched/MuQSS.h @@ -17,6 +17,9 @@ struct rq { struct task_struct *curr, *idle, *stop; struct mm_struct *prev_mm; + long nr_uninterruptible; + s64 nr_switches; + int nr_running; raw_spinlock_t lock; diff --git a/kernel/skip_list.c b/kernel/skip_list.c index 5c66067f2..d52508056 100644 --- a/kernel/skip_list.c +++ b/kernel/skip_list.c @@ -93,34 +93,9 @@ void skiplist_node_init(skiplist_node *node) memset(node, 0, sizeof(skiplist_node)); } -/* - * Returns a pseudo-random number based on the randseed value by masking out - * 0-15. As many levels are not required when only few values are on the list, - * we limit the height of the levels according to how many list entries there - * are in a cheap manner. The height of the levels may have been higher while - * there were more entries queued previously but as this code is used only by - * the scheduler, entries are short lived and will be torn down regularly. - * - * 00-03 entries - 1 level - * 04-07 entries - 2 levels - * 08-15 entries - 4 levels - * 15-31 entries - 7 levels - * 32+ entries - max(16) levels - */ -static inline unsigned int randomLevel(int entries, unsigned int randseed) +static inline unsigned int randomLevel(const long unsigned int randseed) { - unsigned int mask; - - if (entries > 15) - mask = 0x7; - else if (entries > 7) - mask = 0x3; - else if (entries > 3) - mask = 0x1; - else - return 0; - - return randseed & mask; + return find_first_bit(&randseed, MaxLevel); } void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed) @@ -136,9 +111,8 @@ void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType va update[k] = p; } while (--k >= 0); - k = randomLevel(++l->entries, randseed); - if (k > MaxLevel) - k = MaxLevel; + ++l->entries; + k = randomLevel(randseed); if (k > l->level) { k = ++l->level; update[k] = l->header; diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 91f0727e3..8e6d552c4 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -1583,15 +1583,10 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) } EXPORT_SYMBOL(radix_tree_delete); -struct radix_tree_node *radix_tree_replace_clear_tags( - struct radix_tree_root *root, - unsigned long index, void *entry) +void radix_tree_clear_tags(struct radix_tree_root *root, + struct radix_tree_node *node, + void **slot) { - struct radix_tree_node *node; - void **slot; - - __radix_tree_lookup(root, index, &node, &slot); - if (node) { unsigned int tag, offset = get_slot_offset(node, slot); for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) @@ -1600,9 +1595,6 @@ struct radix_tree_node *radix_tree_replace_clear_tags( /* Clear root node tags */ root->gfp_mask &= __GFP_BITS_MASK; } - - radix_tree_replace_slot(slot, entry); - return node; } /** diff --git a/mm/filemap.c b/mm/filemap.c index 4a31bad51..ea47a7de5 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -169,33 +169,35 @@ static int page_cache_tree_insert(struct address_space *mapping, static void page_cache_tree_delete(struct address_space *mapping, struct page *page, void *shadow) { - struct radix_tree_node *node; int i, nr = PageHuge(page) ? 1 : hpage_nr_pages(page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageTail(page), page); VM_BUG_ON_PAGE(nr != 1 && shadow, page); - if (shadow) { - mapping->nrexceptional += nr; - /* - * Make sure the nrexceptional update is committed before - * the nrpages update so that final truncate racing - * with reclaim does not see both counters 0 at the - * same time and miss a shadow entry. - */ - smp_wmb(); - } - mapping->nrpages -= nr; - for (i = 0; i < nr; i++) { - node = radix_tree_replace_clear_tags(&mapping->page_tree, - page->index + i, shadow); + struct radix_tree_node *node; + void **slot; + + __radix_tree_lookup(&mapping->page_tree, page->index + i, + &node, &slot); + + radix_tree_clear_tags(&mapping->page_tree, node, slot); + if (!node) { VM_BUG_ON_PAGE(nr != 1, page); - return; + /* + * We need a node to properly account shadow + * entries. Don't plant any without. XXX + */ + shadow = NULL; } + radix_tree_replace_slot(slot, shadow); + + if (!node) + break; + workingset_node_pages_dec(node); if (shadow) workingset_node_shadows_inc(node); @@ -219,6 +221,18 @@ static void page_cache_tree_delete(struct address_space *mapping, &node->private_list); } } + + if (shadow) { + mapping->nrexceptional += nr; + /* + * Make sure the nrexceptional update is committed before + * the nrpages update so that final truncate racing + * with reclaim does not see both counters 0 at the + * same time and miss a shadow entry. + */ + smp_wmb(); + } + mapping->nrpages -= nr; } /* @@ -619,7 +633,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) __delete_from_page_cache(old, NULL); error = page_cache_tree_insert(mapping, new, NULL); BUG_ON(error); - mapping->nrpages++; /* * hugetlb pages do not participate in page cache accounting. @@ -1674,6 +1687,10 @@ static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos, unsigned int prev_offset; int error = 0; + if (unlikely(*ppos >= inode->i_sb->s_maxbytes)) + return -EINVAL; + iov_iter_truncate(iter, inode->i_sb->s_maxbytes); + index = *ppos >> PAGE_SHIFT; prev_index = ra->prev_pos >> PAGE_SHIFT; prev_offset = ra->prev_pos & (PAGE_SIZE-1); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 87e11d8ad..603bdd01e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1443,13 +1443,14 @@ static void dissolve_free_huge_page(struct page *page) { spin_lock(&hugetlb_lock); if (PageHuge(page) && !page_count(page)) { - struct hstate *h = page_hstate(page); - int nid = page_to_nid(page); - list_del(&page->lru); + struct page *head = compound_head(page); + struct hstate *h = page_hstate(head); + int nid = page_to_nid(head); + list_del(&head->lru); h->free_huge_pages--; h->free_huge_pages_node[nid]--; h->max_huge_pages--; - update_and_free_page(h, page); + update_and_free_page(h, head); } spin_unlock(&hugetlb_lock); } @@ -1457,7 +1458,8 @@ static void dissolve_free_huge_page(struct page *page) /* * Dissolve free hugepages in a given pfn range. Used by memory hotplug to * make specified memory blocks removable from the system. - * Note that start_pfn should aligned with (minimum) hugepage size. + * Note that this will dissolve a free gigantic hugepage completely, if any + * part of it lies within the given range. */ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) { @@ -1466,7 +1468,6 @@ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) if (!hugepages_supported()) return; - VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << minimum_order)); for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) dissolve_free_huge_page(pfn_to_page(pfn)); } diff --git a/sound/soc/codecs/nau8825.c b/sound/soc/codecs/nau8825.c index 2e59a85e3..ff566376d 100644 --- a/sound/soc/codecs/nau8825.c +++ b/sound/soc/codecs/nau8825.c @@ -1907,7 +1907,7 @@ static int nau8825_calc_fll_param(unsigned int fll_in, unsigned int fs, /* Calculate the FLL 10-bit integer input and the FLL 16-bit fractional * input based on FDCO, FREF and FLL ratio. */ - fvco = div_u64(fvco << 16, fref * fll_param->ratio); + fvco = div_u64(fvco_max << 16, fref * fll_param->ratio); fll_param->fll_int = (fvco >> 16) & 0x3FF; fll_param->fll_frac = fvco & 0xFFFF; return 0; diff --git a/sound/soc/intel/atom/sst/sst_pvt.c b/sound/soc/intel/atom/sst/sst_pvt.c index adb32fefd..b1e6b8f34 100644 --- a/sound/soc/intel/atom/sst/sst_pvt.c +++ b/sound/soc/intel/atom/sst/sst_pvt.c @@ -279,17 +279,15 @@ int sst_prepare_and_post_msg(struct intel_sst_drv *sst, if (response) { ret = sst_wait_timeout(sst, block); - if (ret < 0) { + if (ret < 0) goto out; - } else if(block->data) { - if (!data) - goto out; - *data = kzalloc(block->size, GFP_KERNEL); - if (!(*data)) { + + if (data && block->data) { + *data = kmemdup(block->data, block->size, GFP_KERNEL); + if (!*data) { ret = -ENOMEM; goto out; - } else - memcpy(data, (void *) block->data, block->size); + } } } out: |