summaryrefslogtreecommitdiff
path: root/kernel/locking
diff options
context:
space:
mode:
authorAndré Fabian Silva Delgado <emulatorman@parabola.nu>2016-03-25 03:53:42 -0300
committerAndré Fabian Silva Delgado <emulatorman@parabola.nu>2016-03-25 03:53:42 -0300
commit03dd4cb26d967f9588437b0fc9cc0e8353322bb7 (patch)
treefa581f6dc1c0596391690d1f67eceef3af8246dc /kernel/locking
parentd4e493caf788ef44982e131ff9c786546904d934 (diff)
Linux-libre 4.5-gnu
Diffstat (limited to 'kernel/locking')
-rw-r--r--kernel/locking/lockdep.c58
-rw-r--r--kernel/locking/qspinlock.c82
-rw-r--r--kernel/locking/qspinlock_paravirt.h252
-rw-r--r--kernel/locking/qspinlock_stat.h300
-rw-r--r--kernel/locking/rtmutex.c135
5 files changed, 677 insertions, 150 deletions
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 60ace5661..716547fdb 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -292,7 +292,7 @@ LIST_HEAD(all_lock_classes);
#define __classhashfn(key) hash_long((unsigned long)key, CLASSHASH_BITS)
#define classhashentry(key) (classhash_table + __classhashfn((key)))
-static struct list_head classhash_table[CLASSHASH_SIZE];
+static struct hlist_head classhash_table[CLASSHASH_SIZE];
/*
* We put the lock dependency chains into a hash-table as well, to cache
@@ -303,7 +303,7 @@ static struct list_head classhash_table[CLASSHASH_SIZE];
#define __chainhashfn(chain) hash_long(chain, CHAINHASH_BITS)
#define chainhashentry(chain) (chainhash_table + __chainhashfn((chain)))
-static struct list_head chainhash_table[CHAINHASH_SIZE];
+static struct hlist_head chainhash_table[CHAINHASH_SIZE];
/*
* The hash key of the lock dependency chains is a hash itself too:
@@ -666,7 +666,7 @@ static inline struct lock_class *
look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
{
struct lockdep_subclass_key *key;
- struct list_head *hash_head;
+ struct hlist_head *hash_head;
struct lock_class *class;
#ifdef CONFIG_DEBUG_LOCKDEP
@@ -719,7 +719,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return NULL;
- list_for_each_entry_rcu(class, hash_head, hash_entry) {
+ hlist_for_each_entry_rcu(class, hash_head, hash_entry) {
if (class->key == key) {
/*
* Huh! same key, different name? Did someone trample
@@ -742,7 +742,7 @@ static inline struct lock_class *
register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
{
struct lockdep_subclass_key *key;
- struct list_head *hash_head;
+ struct hlist_head *hash_head;
struct lock_class *class;
DEBUG_LOCKS_WARN_ON(!irqs_disabled());
@@ -774,7 +774,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
* We have to do the hash-walk again, to avoid races
* with another CPU:
*/
- list_for_each_entry_rcu(class, hash_head, hash_entry) {
+ hlist_for_each_entry_rcu(class, hash_head, hash_entry) {
if (class->key == key)
goto out_unlock_set;
}
@@ -805,7 +805,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
* We use RCU's safe list-add method to make
* parallel walking of the hash-list safe:
*/
- list_add_tail_rcu(&class->hash_entry, hash_head);
+ hlist_add_head_rcu(&class->hash_entry, hash_head);
/*
* Add it to the global list of classes:
*/
@@ -1822,7 +1822,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
*/
static int
check_prev_add(struct task_struct *curr, struct held_lock *prev,
- struct held_lock *next, int distance, int trylock_loop)
+ struct held_lock *next, int distance, int *stack_saved)
{
struct lock_list *entry;
int ret;
@@ -1883,8 +1883,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
}
}
- if (!trylock_loop && !save_trace(&trace))
- return 0;
+ if (!*stack_saved) {
+ if (!save_trace(&trace))
+ return 0;
+ *stack_saved = 1;
+ }
/*
* Ok, all validations passed, add the new lock
@@ -1907,6 +1910,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
* Debugging printouts:
*/
if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) {
+ /* We drop graph lock, so another thread can overwrite trace. */
+ *stack_saved = 0;
graph_unlock();
printk("\n new dependency: ");
print_lock_name(hlock_class(prev));
@@ -1929,7 +1934,7 @@ static int
check_prevs_add(struct task_struct *curr, struct held_lock *next)
{
int depth = curr->lockdep_depth;
- int trylock_loop = 0;
+ int stack_saved = 0;
struct held_lock *hlock;
/*
@@ -1956,7 +1961,7 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
*/
if (hlock->read != 2 && hlock->check) {
if (!check_prev_add(curr, hlock, next,
- distance, trylock_loop))
+ distance, &stack_saved))
return 0;
/*
* Stop after the first non-trylock entry,
@@ -1979,7 +1984,6 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
if (curr->held_locks[depth].irq_context !=
curr->held_locks[depth-1].irq_context)
break;
- trylock_loop = 1;
}
return 1;
out_bug:
@@ -2017,7 +2021,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
u64 chain_key)
{
struct lock_class *class = hlock_class(hlock);
- struct list_head *hash_head = chainhashentry(chain_key);
+ struct hlist_head *hash_head = chainhashentry(chain_key);
struct lock_chain *chain;
struct held_lock *hlock_curr;
int i, j;
@@ -2033,7 +2037,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
* We can walk it lock-free, because entries only get added
* to the hash:
*/
- list_for_each_entry_rcu(chain, hash_head, entry) {
+ hlist_for_each_entry_rcu(chain, hash_head, entry) {
if (chain->chain_key == chain_key) {
cache_hit:
debug_atomic_inc(chain_lookup_hits);
@@ -2057,7 +2061,7 @@ cache_hit:
/*
* We have to walk the chain again locked - to avoid duplicates:
*/
- list_for_each_entry(chain, hash_head, entry) {
+ hlist_for_each_entry(chain, hash_head, entry) {
if (chain->chain_key == chain_key) {
graph_unlock();
goto cache_hit;
@@ -2091,7 +2095,7 @@ cache_hit:
}
chain_hlocks[chain->base + j] = class - lock_classes;
}
- list_add_tail_rcu(&chain->entry, hash_head);
+ hlist_add_head_rcu(&chain->entry, hash_head);
debug_atomic_inc(chain_lookup_misses);
inc_chains();
@@ -3875,7 +3879,7 @@ void lockdep_reset(void)
nr_process_chains = 0;
debug_locks = 1;
for (i = 0; i < CHAINHASH_SIZE; i++)
- INIT_LIST_HEAD(chainhash_table + i);
+ INIT_HLIST_HEAD(chainhash_table + i);
raw_local_irq_restore(flags);
}
@@ -3894,7 +3898,7 @@ static void zap_class(struct lock_class *class)
/*
* Unhash the class and remove it from the all_lock_classes list:
*/
- list_del_rcu(&class->hash_entry);
+ hlist_del_rcu(&class->hash_entry);
list_del_rcu(&class->lock_entry);
RCU_INIT_POINTER(class->key, NULL);
@@ -3917,7 +3921,7 @@ static inline int within(const void *addr, void *start, unsigned long size)
void lockdep_free_key_range(void *start, unsigned long size)
{
struct lock_class *class;
- struct list_head *head;
+ struct hlist_head *head;
unsigned long flags;
int i;
int locked;
@@ -3930,9 +3934,7 @@ void lockdep_free_key_range(void *start, unsigned long size)
*/
for (i = 0; i < CLASSHASH_SIZE; i++) {
head = classhash_table + i;
- if (list_empty(head))
- continue;
- list_for_each_entry_rcu(class, head, hash_entry) {
+ hlist_for_each_entry_rcu(class, head, hash_entry) {
if (within(class->key, start, size))
zap_class(class);
else if (within(class->name, start, size))
@@ -3962,7 +3964,7 @@ void lockdep_free_key_range(void *start, unsigned long size)
void lockdep_reset_lock(struct lockdep_map *lock)
{
struct lock_class *class;
- struct list_head *head;
+ struct hlist_head *head;
unsigned long flags;
int i, j;
int locked;
@@ -3987,9 +3989,7 @@ void lockdep_reset_lock(struct lockdep_map *lock)
locked = graph_lock();
for (i = 0; i < CLASSHASH_SIZE; i++) {
head = classhash_table + i;
- if (list_empty(head))
- continue;
- list_for_each_entry_rcu(class, head, hash_entry) {
+ hlist_for_each_entry_rcu(class, head, hash_entry) {
int match = 0;
for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
@@ -4027,10 +4027,10 @@ void lockdep_init(void)
return;
for (i = 0; i < CLASSHASH_SIZE; i++)
- INIT_LIST_HEAD(classhash_table + i);
+ INIT_HLIST_HEAD(classhash_table + i);
for (i = 0; i < CHAINHASH_SIZE; i++)
- INIT_LIST_HEAD(chainhash_table + i);
+ INIT_HLIST_HEAD(chainhash_table + i);
lockdep_initialized = 1;
}
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 87e9ce6a6..393d1874b 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -14,8 +14,9 @@
* (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
* (C) Copyright 2013-2014 Red Hat, Inc.
* (C) Copyright 2015 Intel Corp.
+ * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
*
- * Authors: Waiman Long <waiman.long@hp.com>
+ * Authors: Waiman Long <waiman.long@hpe.com>
* Peter Zijlstra <peterz@infradead.org>
*/
@@ -176,7 +177,12 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{
struct __qspinlock *l = (void *)lock;
- return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
+ /*
+ * Use release semantics to make sure that the MCS node is properly
+ * initialized before changing the tail code.
+ */
+ return (u32)xchg_release(&l->tail,
+ tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
}
#else /* _Q_PENDING_BITS == 8 */
@@ -208,7 +214,11 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
for (;;) {
new = (val & _Q_LOCKED_PENDING_MASK) | tail;
- old = atomic_cmpxchg(&lock->val, val, new);
+ /*
+ * Use release semantics to make sure that the MCS node is
+ * properly initialized before changing the tail code.
+ */
+ old = atomic_cmpxchg_release(&lock->val, val, new);
if (old == val)
break;
@@ -238,18 +248,20 @@ static __always_inline void set_locked(struct qspinlock *lock)
*/
static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
-static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_wait_node(struct mcs_spinlock *node,
+ struct mcs_spinlock *prev) { }
static __always_inline void __pv_kick_node(struct qspinlock *lock,
struct mcs_spinlock *node) { }
-static __always_inline void __pv_wait_head(struct qspinlock *lock,
- struct mcs_spinlock *node) { }
+static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
+ struct mcs_spinlock *node)
+ { return 0; }
#define pv_enabled() false
#define pv_init_node __pv_init_node
#define pv_wait_node __pv_wait_node
#define pv_kick_node __pv_kick_node
-#define pv_wait_head __pv_wait_head
+#define pv_wait_head_or_lock __pv_wait_head_or_lock
#ifdef CONFIG_PARAVIRT_SPINLOCKS
#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
@@ -319,7 +331,11 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
if (val == new)
new |= _Q_PENDING_VAL;
- old = atomic_cmpxchg(&lock->val, val, new);
+ /*
+ * Acquire semantic is required here as the function may
+ * return immediately if the lock was free.
+ */
+ old = atomic_cmpxchg_acquire(&lock->val, val, new);
if (old == val)
break;
@@ -382,6 +398,7 @@ queue:
* p,*,* -> n,*,*
*/
old = xchg_tail(lock, tail);
+ next = NULL;
/*
* if there was a previous node; link it and wait until reaching the
@@ -391,8 +408,18 @@ queue:
prev = decode_tail(old);
WRITE_ONCE(prev->next, node);
- pv_wait_node(node);
+ pv_wait_node(node, prev);
arch_mcs_spin_lock_contended(&node->locked);
+
+ /*
+ * While waiting for the MCS lock, the next pointer may have
+ * been set by another lock waiter. We optimistically load
+ * the next pointer & prefetch the cacheline for writing
+ * to reduce latency in the upcoming MCS unlock operation.
+ */
+ next = READ_ONCE(node->next);
+ if (next)
+ prefetchw(next);
}
/*
@@ -406,11 +433,22 @@ queue:
* sequentiality; this is because the set_locked() function below
* does not imply a full barrier.
*
+ * The PV pv_wait_head_or_lock function, if active, will acquire
+ * the lock and return a non-zero value. So we have to skip the
+ * smp_load_acquire() call. As the next PV queue head hasn't been
+ * designated yet, there is no way for the locked value to become
+ * _Q_SLOW_VAL. So both the set_locked() and the
+ * atomic_cmpxchg_relaxed() calls will be safe.
+ *
+ * If PV isn't active, 0 will be returned instead.
+ *
*/
- pv_wait_head(lock, node);
- while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK)
- cpu_relax();
+ if ((val = pv_wait_head_or_lock(lock, node)))
+ goto locked;
+ smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK));
+
+locked:
/*
* claim the lock:
*
@@ -422,11 +460,17 @@ queue:
* to grab the lock.
*/
for (;;) {
- if (val != tail) {
+ /* In the PV case we might already have _Q_LOCKED_VAL set */
+ if ((val & _Q_TAIL_MASK) != tail) {
set_locked(lock);
break;
}
- old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL);
+ /*
+ * The smp_load_acquire() call above has provided the necessary
+ * acquire semantics required for locking. At most two
+ * iterations of this loop may be ran.
+ */
+ old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL);
if (old == val)
goto release; /* No contention */
@@ -434,10 +478,12 @@ queue:
}
/*
- * contended path; wait for next, release.
+ * contended path; wait for next if not observed yet, release.
*/
- while (!(next = READ_ONCE(node->next)))
- cpu_relax();
+ if (!next) {
+ while (!(next = READ_ONCE(node->next)))
+ cpu_relax();
+ }
arch_mcs_spin_unlock_contended(&next->locked);
pv_kick_node(lock, next);
@@ -462,7 +508,7 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath);
#undef pv_init_node
#undef pv_wait_node
#undef pv_kick_node
-#undef pv_wait_head
+#undef pv_wait_head_or_lock
#undef queued_spin_lock_slowpath
#define queued_spin_lock_slowpath __pv_queued_spin_lock_slowpath
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index f0450ff48..87bb235c3 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -23,6 +23,20 @@
#define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET)
/*
+ * Queue Node Adaptive Spinning
+ *
+ * A queue node vCPU will stop spinning if the vCPU in the previous node is
+ * not running. The one lock stealing attempt allowed at slowpath entry
+ * mitigates the slight slowdown for non-overcommitted guest with this
+ * aggressive wait-early mechanism.
+ *
+ * The status of the previous node will be checked at fixed interval
+ * controlled by PV_PREV_CHECK_MASK. This is to ensure that we won't
+ * pound on the cacheline of the previous node too heavily.
+ */
+#define PV_PREV_CHECK_MASK 0xff
+
+/*
* Queue node uses: vcpu_running & vcpu_halted.
* Queue head uses: vcpu_running & vcpu_hashed.
*/
@@ -41,6 +55,94 @@ struct pv_node {
};
/*
+ * By replacing the regular queued_spin_trylock() with the function below,
+ * it will be called once when a lock waiter enter the PV slowpath before
+ * being queued. By allowing one lock stealing attempt here when the pending
+ * bit is off, it helps to reduce the performance impact of lock waiter
+ * preemption without the drawback of lock starvation.
+ */
+#define queued_spin_trylock(l) pv_queued_spin_steal_lock(l)
+static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ return !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
+ (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0);
+}
+
+/*
+ * The pending bit is used by the queue head vCPU to indicate that it
+ * is actively spinning on the lock and no lock stealing is allowed.
+ */
+#if _Q_PENDING_BITS == 8
+static __always_inline void set_pending(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ WRITE_ONCE(l->pending, 1);
+}
+
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ WRITE_ONCE(l->pending, 0);
+}
+
+/*
+ * The pending bit check in pv_queued_spin_steal_lock() isn't a memory
+ * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock
+ * just to be sure that it will get it.
+ */
+static __always_inline int trylock_clear_pending(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ return !READ_ONCE(l->locked) &&
+ (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL)
+ == _Q_PENDING_VAL);
+}
+#else /* _Q_PENDING_BITS == 8 */
+static __always_inline void set_pending(struct qspinlock *lock)
+{
+ atomic_set_mask(_Q_PENDING_VAL, &lock->val);
+}
+
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+ atomic_clear_mask(_Q_PENDING_VAL, &lock->val);
+}
+
+static __always_inline int trylock_clear_pending(struct qspinlock *lock)
+{
+ int val = atomic_read(&lock->val);
+
+ for (;;) {
+ int old, new;
+
+ if (val & _Q_LOCKED_MASK)
+ break;
+
+ /*
+ * Try to clear pending bit & set locked bit
+ */
+ old = val;
+ new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
+ val = atomic_cmpxchg(&lock->val, old, new);
+
+ if (val == old)
+ return 1;
+ }
+ return 0;
+}
+#endif /* _Q_PENDING_BITS == 8 */
+
+/*
+ * Include queued spinlock statistics code
+ */
+#include "qspinlock_stat.h"
+
+/*
* Lock and MCS node addresses hash table for fast lookup
*
* Hashing is done on a per-cacheline basis to minimize the need to access
@@ -100,10 +202,13 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
{
unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
struct pv_hash_entry *he;
+ int hopcnt = 0;
for_each_hash_entry(he, offset, hash) {
+ hopcnt++;
if (!cmpxchg(&he->lock, NULL, lock)) {
WRITE_ONCE(he->node, node);
+ qstat_hop(hopcnt);
return &he->lock;
}
}
@@ -144,6 +249,20 @@ static struct pv_node *pv_unhash(struct qspinlock *lock)
}
/*
+ * Return true if when it is time to check the previous node which is not
+ * in a running state.
+ */
+static inline bool
+pv_wait_early(struct pv_node *prev, int loop)
+{
+
+ if ((loop & PV_PREV_CHECK_MASK) != 0)
+ return false;
+
+ return READ_ONCE(prev->state) != vcpu_running;
+}
+
+/*
* Initialize the PV part of the mcs_spinlock node.
*/
static void pv_init_node(struct mcs_spinlock *node)
@@ -161,15 +280,23 @@ static void pv_init_node(struct mcs_spinlock *node)
* pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
* behalf.
*/
-static void pv_wait_node(struct mcs_spinlock *node)
+static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
{
struct pv_node *pn = (struct pv_node *)node;
+ struct pv_node *pp = (struct pv_node *)prev;
+ int waitcnt = 0;
int loop;
+ bool wait_early;
- for (;;) {
- for (loop = SPIN_THRESHOLD; loop; loop--) {
+ /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */
+ for (;; waitcnt++) {
+ for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
if (READ_ONCE(node->locked))
return;
+ if (pv_wait_early(pp, loop)) {
+ wait_early = true;
+ break;
+ }
cpu_relax();
}
@@ -184,12 +311,17 @@ static void pv_wait_node(struct mcs_spinlock *node)
*/
smp_store_mb(pn->state, vcpu_halted);
- if (!READ_ONCE(node->locked))
+ if (!READ_ONCE(node->locked)) {
+ qstat_inc(qstat_pv_wait_node, true);
+ qstat_inc(qstat_pv_wait_again, waitcnt);
+ qstat_inc(qstat_pv_wait_early, wait_early);
pv_wait(&pn->state, vcpu_halted);
+ }
/*
- * If pv_kick_node() changed us to vcpu_hashed, retain that value
- * so that pv_wait_head() knows to not also try to hash this lock.
+ * If pv_kick_node() changed us to vcpu_hashed, retain that
+ * value so that pv_wait_head_or_lock() knows to not also try
+ * to hash this lock.
*/
cmpxchg(&pn->state, vcpu_halted, vcpu_running);
@@ -200,6 +332,7 @@ static void pv_wait_node(struct mcs_spinlock *node)
* So it is better to spin for a while in the hope that the
* MCS lock will be released soon.
*/
+ qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked));
}
/*
@@ -212,8 +345,9 @@ static void pv_wait_node(struct mcs_spinlock *node)
/*
* Called after setting next->locked = 1 when we're the lock owner.
*
- * Instead of waking the waiters stuck in pv_wait_node() advance their state such
- * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle.
+ * Instead of waking the waiters stuck in pv_wait_node() advance their state
+ * such that they're waiting in pv_wait_head_or_lock(), this avoids a
+ * wake/sleep cycle.
*/
static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
{
@@ -242,14 +376,19 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
}
/*
- * Wait for l->locked to become clear; halt the vcpu after a short spin.
+ * Wait for l->locked to become clear and acquire the lock;
+ * halt the vcpu after a short spin.
* __pv_queued_spin_unlock() will wake us.
+ *
+ * The current value of the lock will be returned for additional processing.
*/
-static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
+static u32
+pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
{
struct pv_node *pn = (struct pv_node *)node;
struct __qspinlock *l = (void *)lock;
struct qspinlock **lp = NULL;
+ int waitcnt = 0;
int loop;
/*
@@ -259,12 +398,25 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
if (READ_ONCE(pn->state) == vcpu_hashed)
lp = (struct qspinlock **)1;
- for (;;) {
+ for (;; waitcnt++) {
+ /*
+ * Set correct vCPU state to be used by queue node wait-early
+ * mechanism.
+ */
+ WRITE_ONCE(pn->state, vcpu_running);
+
+ /*
+ * Set the pending bit in the active lock spinning loop to
+ * disable lock stealing before attempting to acquire the lock.
+ */
+ set_pending(lock);
for (loop = SPIN_THRESHOLD; loop; loop--) {
- if (!READ_ONCE(l->locked))
- return;
+ if (trylock_clear_pending(lock))
+ goto gotlock;
cpu_relax();
}
+ clear_pending(lock);
+
if (!lp) { /* ONCE */
lp = pv_hash(lock, pn);
@@ -280,51 +432,50 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
*
* Matches the smp_rmb() in __pv_queued_spin_unlock().
*/
- if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
+ if (xchg(&l->locked, _Q_SLOW_VAL) == 0) {
/*
- * The lock is free and _Q_SLOW_VAL has never
- * been set. Therefore we need to unhash before
- * getting the lock.
+ * The lock was free and now we own the lock.
+ * Change the lock value back to _Q_LOCKED_VAL
+ * and unhash the table.
*/
+ WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
WRITE_ONCE(*lp, NULL);
- return;
+ goto gotlock;
}
}
+ WRITE_ONCE(pn->state, vcpu_halted);
+ qstat_inc(qstat_pv_wait_head, true);
+ qstat_inc(qstat_pv_wait_again, waitcnt);
pv_wait(&l->locked, _Q_SLOW_VAL);
/*
* The unlocker should have freed the lock before kicking the
* CPU. So if the lock is still not free, it is a spurious
- * wakeup and so the vCPU should wait again after spinning for
- * a while.
+ * wakeup or another vCPU has stolen the lock. The current
+ * vCPU should spin again.
*/
+ qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked));
}
/*
- * Lock is unlocked now; the caller will acquire it without waiting.
- * As with pv_wait_node() we rely on the caller to do a load-acquire
- * for us.
+ * The cmpxchg() or xchg() call before coming here provides the
+ * acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL
+ * here is to indicate to the compiler that the value will always
+ * be nozero to enable better code optimization.
*/
+gotlock:
+ return (u32)(atomic_read(&lock->val) | _Q_LOCKED_VAL);
}
/*
- * PV version of the unlock function to be used in stead of
- * queued_spin_unlock().
+ * PV versions of the unlock fastpath and slowpath functions to be used
+ * instead of queued_spin_unlock().
*/
-__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
+__visible void
+__pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
{
struct __qspinlock *l = (void *)lock;
struct pv_node *node;
- u8 locked;
-
- /*
- * We must not unlock if SLOW, because in that case we must first
- * unhash. Otherwise it would be possible to have multiple @lock
- * entries, which would be BAD.
- */
- locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
- if (likely(locked == _Q_LOCKED_VAL))
- return;
if (unlikely(locked != _Q_SLOW_VAL)) {
WARN(!debug_locks_silent,
@@ -338,7 +489,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
* so we need a barrier to order the read of the node data in
* pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
*
- * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL.
+ * Matches the cmpxchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL.
*/
smp_rmb();
@@ -361,14 +512,35 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
* vCPU is harmless other than the additional latency in completing
* the unlock.
*/
+ qstat_inc(qstat_pv_kick_unlock, true);
pv_kick(node->cpu);
}
+
/*
* Include the architecture specific callee-save thunk of the
* __pv_queued_spin_unlock(). This thunk is put together with
- * __pv_queued_spin_unlock() near the top of the file to make sure
- * that the callee-save thunk and the real unlock function are close
- * to each other sharing consecutive instruction cachelines.
+ * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock
+ * function close to each other sharing consecutive instruction cachelines.
+ * Alternatively, architecture specific version of __pv_queued_spin_unlock()
+ * can be defined.
*/
#include <asm/qspinlock_paravirt.h>
+#ifndef __pv_queued_spin_unlock
+__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+ u8 locked;
+
+ /*
+ * We must not unlock if SLOW, because in that case we must first
+ * unhash. Otherwise it would be possible to have multiple @lock
+ * entries, which would be BAD.
+ */
+ locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
+ if (likely(locked == _Q_LOCKED_VAL))
+ return;
+
+ __pv_queued_spin_unlock_slowpath(lock, locked);
+}
+#endif /* __pv_queued_spin_unlock */
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
new file mode 100644
index 000000000..640dcecdd
--- /dev/null
+++ b/kernel/locking/qspinlock_stat.h
@@ -0,0 +1,300 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Waiman Long <waiman.long@hpe.com>
+ */
+
+/*
+ * When queued spinlock statistical counters are enabled, the following
+ * debugfs files will be created for reporting the counter values:
+ *
+ * <debugfs>/qlockstat/
+ * pv_hash_hops - average # of hops per hashing operation
+ * pv_kick_unlock - # of vCPU kicks issued at unlock time
+ * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake
+ * pv_latency_kick - average latency (ns) of vCPU kick operation
+ * pv_latency_wake - average latency (ns) from vCPU kick to wakeup
+ * pv_lock_stealing - # of lock stealing operations
+ * pv_spurious_wakeup - # of spurious wakeups
+ * pv_wait_again - # of vCPU wait's that happened after a vCPU kick
+ * pv_wait_early - # of early vCPU wait's
+ * pv_wait_head - # of vCPU wait's at the queue head
+ * pv_wait_node - # of vCPU wait's at a non-head queue node
+ *
+ * Writing to the "reset_counters" file will reset all the above counter
+ * values.
+ *
+ * These statistical counters are implemented as per-cpu variables which are
+ * summed and computed whenever the corresponding debugfs files are read. This
+ * minimizes added overhead making the counters usable even in a production
+ * environment.
+ *
+ * There may be slight difference between pv_kick_wake and pv_kick_unlock.
+ */
+enum qlock_stats {
+ qstat_pv_hash_hops,
+ qstat_pv_kick_unlock,
+ qstat_pv_kick_wake,
+ qstat_pv_latency_kick,
+ qstat_pv_latency_wake,
+ qstat_pv_lock_stealing,
+ qstat_pv_spurious_wakeup,
+ qstat_pv_wait_again,
+ qstat_pv_wait_early,
+ qstat_pv_wait_head,
+ qstat_pv_wait_node,
+ qstat_num, /* Total number of statistical counters */
+ qstat_reset_cnts = qstat_num,
+};
+
+#ifdef CONFIG_QUEUED_LOCK_STAT
+/*
+ * Collect pvqspinlock statistics
+ */
+#include <linux/debugfs.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+
+static const char * const qstat_names[qstat_num + 1] = {
+ [qstat_pv_hash_hops] = "pv_hash_hops",
+ [qstat_pv_kick_unlock] = "pv_kick_unlock",
+ [qstat_pv_kick_wake] = "pv_kick_wake",
+ [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup",
+ [qstat_pv_latency_kick] = "pv_latency_kick",
+ [qstat_pv_latency_wake] = "pv_latency_wake",
+ [qstat_pv_lock_stealing] = "pv_lock_stealing",
+ [qstat_pv_wait_again] = "pv_wait_again",
+ [qstat_pv_wait_early] = "pv_wait_early",
+ [qstat_pv_wait_head] = "pv_wait_head",
+ [qstat_pv_wait_node] = "pv_wait_node",
+ [qstat_reset_cnts] = "reset_counters",
+};
+
+/*
+ * Per-cpu counters
+ */
+static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]);
+static DEFINE_PER_CPU(u64, pv_kick_time);
+
+/*
+ * Function to read and return the qlock statistical counter values
+ *
+ * The following counters are handled specially:
+ * 1. qstat_pv_latency_kick
+ * Average kick latency (ns) = pv_latency_kick/pv_kick_unlock
+ * 2. qstat_pv_latency_wake
+ * Average wake latency (ns) = pv_latency_wake/pv_kick_wake
+ * 3. qstat_pv_hash_hops
+ * Average hops/hash = pv_hash_hops/pv_kick_unlock
+ */
+static ssize_t qstat_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ char buf[64];
+ int cpu, counter, len;
+ u64 stat = 0, kicks = 0;
+
+ /*
+ * Get the counter ID stored in file->f_inode->i_private
+ */
+ if (!file->f_inode) {
+ WARN_ON_ONCE(1);
+ return -EBADF;
+ }
+ counter = (long)(file->f_inode->i_private);
+
+ if (counter >= qstat_num)
+ return -EBADF;
+
+ for_each_possible_cpu(cpu) {
+ stat += per_cpu(qstats[counter], cpu);
+ /*
+ * Need to sum additional counter for some of them
+ */
+ switch (counter) {
+
+ case qstat_pv_latency_kick:
+ case qstat_pv_hash_hops:
+ kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu);
+ break;
+
+ case qstat_pv_latency_wake:
+ kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu);
+ break;
+ }
+ }
+
+ if (counter == qstat_pv_hash_hops) {
+ u64 frac;
+
+ frac = 100ULL * do_div(stat, kicks);
+ frac = DIV_ROUND_CLOSEST_ULL(frac, kicks);
+
+ /*
+ * Return a X.XX decimal number
+ */
+ len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac);
+ } else {
+ /*
+ * Round to the nearest ns
+ */
+ if ((counter == qstat_pv_latency_kick) ||
+ (counter == qstat_pv_latency_wake)) {
+ stat = 0;
+ if (kicks)
+ stat = DIV_ROUND_CLOSEST_ULL(stat, kicks);
+ }
+ len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat);
+ }
+
+ return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+
+/*
+ * Function to handle write request
+ *
+ * When counter = reset_cnts, reset all the counter values.
+ * Since the counter updates aren't atomic, the resetting is done twice
+ * to make sure that the counters are very likely to be all cleared.
+ */
+static ssize_t qstat_write(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ int cpu;
+
+ /*
+ * Get the counter ID stored in file->f_inode->i_private
+ */
+ if (!file->f_inode) {
+ WARN_ON_ONCE(1);
+ return -EBADF;
+ }
+ if ((long)(file->f_inode->i_private) != qstat_reset_cnts)
+ return count;
+
+ for_each_possible_cpu(cpu) {
+ int i;
+ unsigned long *ptr = per_cpu_ptr(qstats, cpu);
+
+ for (i = 0 ; i < qstat_num; i++)
+ WRITE_ONCE(ptr[i], 0);
+ for (i = 0 ; i < qstat_num; i++)
+ WRITE_ONCE(ptr[i], 0);
+ }
+ return count;
+}
+
+/*
+ * Debugfs data structures
+ */
+static const struct file_operations fops_qstat = {
+ .read = qstat_read,
+ .write = qstat_write,
+ .llseek = default_llseek,
+};
+
+/*
+ * Initialize debugfs for the qspinlock statistical counters
+ */
+static int __init init_qspinlock_stat(void)
+{
+ struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL);
+ int i;
+
+ if (!d_qstat) {
+ pr_warn("Could not create 'qlockstat' debugfs directory\n");
+ return 0;
+ }
+
+ /*
+ * Create the debugfs files
+ *
+ * As reading from and writing to the stat files can be slow, only
+ * root is allowed to do the read/write to limit impact to system
+ * performance.
+ */
+ for (i = 0; i < qstat_num; i++)
+ debugfs_create_file(qstat_names[i], 0400, d_qstat,
+ (void *)(long)i, &fops_qstat);
+
+ debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat,
+ (void *)(long)qstat_reset_cnts, &fops_qstat);
+ return 0;
+}
+fs_initcall(init_qspinlock_stat);
+
+/*
+ * Increment the PV qspinlock statistical counters
+ */
+static inline void qstat_inc(enum qlock_stats stat, bool cond)
+{
+ if (cond)
+ this_cpu_inc(qstats[stat]);
+}
+
+/*
+ * PV hash hop count
+ */
+static inline void qstat_hop(int hopcnt)
+{
+ this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt);
+}
+
+/*
+ * Replacement function for pv_kick()
+ */
+static inline void __pv_kick(int cpu)
+{
+ u64 start = sched_clock();
+
+ per_cpu(pv_kick_time, cpu) = start;
+ pv_kick(cpu);
+ this_cpu_add(qstats[qstat_pv_latency_kick], sched_clock() - start);
+}
+
+/*
+ * Replacement function for pv_wait()
+ */
+static inline void __pv_wait(u8 *ptr, u8 val)
+{
+ u64 *pkick_time = this_cpu_ptr(&pv_kick_time);
+
+ *pkick_time = 0;
+ pv_wait(ptr, val);
+ if (*pkick_time) {
+ this_cpu_add(qstats[qstat_pv_latency_wake],
+ sched_clock() - *pkick_time);
+ qstat_inc(qstat_pv_kick_wake, true);
+ }
+}
+
+#define pv_kick(c) __pv_kick(c)
+#define pv_wait(p, v) __pv_wait(p, v)
+
+/*
+ * PV unfair trylock count tracking function
+ */
+static inline int qstat_spin_steal_lock(struct qspinlock *lock)
+{
+ int ret = pv_queued_spin_steal_lock(lock);
+
+ qstat_inc(qstat_pv_lock_stealing, ret);
+ return ret;
+}
+#undef queued_spin_trylock
+#define queued_spin_trylock(l) qstat_spin_steal_lock(l)
+
+#else /* CONFIG_QUEUED_LOCK_STAT */
+
+static inline void qstat_inc(enum qlock_stats stat, bool cond) { }
+static inline void qstat_hop(int hopcnt) { }
+
+#endif /* CONFIG_QUEUED_LOCK_STAT */
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 8251e75dd..3e746607a 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -99,13 +99,14 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
* 2) Drop lock->wait_lock
* 3) Try to unlock the lock with cmpxchg
*/
-static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
+static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
+ unsigned long flags)
__releases(lock->wait_lock)
{
struct task_struct *owner = rt_mutex_owner(lock);
clear_rt_mutex_waiters(lock);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
/*
* If a new waiter comes in between the unlock and the cmpxchg
* we have two situations:
@@ -147,11 +148,12 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
/*
* Simple slow path only version: lock->owner is protected by lock->wait_lock.
*/
-static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
+static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
+ unsigned long flags)
__releases(lock->wait_lock)
{
lock->owner = NULL;
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
return true;
}
#endif
@@ -433,7 +435,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
int ret = 0, depth = 0;
struct rt_mutex *lock;
bool detect_deadlock;
- unsigned long flags;
bool requeue = true;
detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk);
@@ -476,7 +477,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
/*
* [1] Task cannot go away as we did a get_task() before !
*/
- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ raw_spin_lock_irq(&task->pi_lock);
/*
* [2] Get the waiter on which @task is blocked on.
@@ -560,7 +561,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* operations.
*/
if (!raw_spin_trylock(&lock->wait_lock)) {
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock_irq(&task->pi_lock);
cpu_relax();
goto retry;
}
@@ -591,7 +592,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
/*
* No requeue[7] here. Just release @task [8]
*/
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock(&task->pi_lock);
put_task_struct(task);
/*
@@ -599,14 +600,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* If there is no owner of the lock, end of chain.
*/
if (!rt_mutex_owner(lock)) {
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
return 0;
}
/* [10] Grab the next task, i.e. owner of @lock */
task = rt_mutex_owner(lock);
get_task_struct(task);
- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ raw_spin_lock(&task->pi_lock);
/*
* No requeue [11] here. We just do deadlock detection.
@@ -621,8 +622,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
top_waiter = rt_mutex_top_waiter(lock);
/* [13] Drop locks */
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock(&task->pi_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
/* If owner is not blocked, end of chain. */
if (!next_lock)
@@ -643,7 +644,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
rt_mutex_enqueue(lock, waiter);
/* [8] Release the task */
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock(&task->pi_lock);
put_task_struct(task);
/*
@@ -661,14 +662,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
*/
if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
wake_up_process(rt_mutex_top_waiter(lock)->task);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
return 0;
}
/* [10] Grab the next task, i.e. the owner of @lock */
task = rt_mutex_owner(lock);
get_task_struct(task);
- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ raw_spin_lock(&task->pi_lock);
/* [11] requeue the pi waiters if necessary */
if (waiter == rt_mutex_top_waiter(lock)) {
@@ -722,8 +723,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
top_waiter = rt_mutex_top_waiter(lock);
/* [13] Drop the locks */
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock(&task->pi_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
/*
* Make the actual exit decisions [12], based on the stored
@@ -746,7 +747,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
goto again;
out_unlock_pi:
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock_irq(&task->pi_lock);
out_put_task:
put_task_struct(task);
@@ -756,7 +757,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
/*
* Try to take an rt-mutex
*
- * Must be called with lock->wait_lock held.
+ * Must be called with lock->wait_lock held and interrupts disabled
*
* @lock: The lock to be acquired.
* @task: The task which wants to acquire the lock
@@ -766,8 +767,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
struct rt_mutex_waiter *waiter)
{
- unsigned long flags;
-
/*
* Before testing whether we can acquire @lock, we set the
* RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all
@@ -852,7 +851,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
* case, but conditionals are more expensive than a redundant
* store.
*/
- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ raw_spin_lock(&task->pi_lock);
task->pi_blocked_on = NULL;
/*
* Finish the lock acquisition. @task is the new owner. If
@@ -861,7 +860,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
*/
if (rt_mutex_has_waiters(lock))
rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock));
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock(&task->pi_lock);
takeit:
/* We got the lock. */
@@ -883,7 +882,7 @@ takeit:
*
* Prepare waiter and propagate pi chain
*
- * This must be called with lock->wait_lock held.
+ * This must be called with lock->wait_lock held and interrupts disabled
*/
static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
@@ -894,7 +893,6 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
struct rt_mutex_waiter *top_waiter = waiter;
struct rt_mutex *next_lock;
int chain_walk = 0, res;
- unsigned long flags;
/*
* Early deadlock detection. We really don't want the task to
@@ -908,7 +906,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
if (owner == task)
return -EDEADLK;
- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ raw_spin_lock(&task->pi_lock);
__rt_mutex_adjust_prio(task);
waiter->task = task;
waiter->lock = lock;
@@ -921,12 +919,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
task->pi_blocked_on = waiter;
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock(&task->pi_lock);
if (!owner)
return 0;
- raw_spin_lock_irqsave(&owner->pi_lock, flags);
+ raw_spin_lock(&owner->pi_lock);
if (waiter == rt_mutex_top_waiter(lock)) {
rt_mutex_dequeue_pi(owner, top_waiter);
rt_mutex_enqueue_pi(owner, waiter);
@@ -941,7 +939,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
/* Store the lock on which owner is blocked or NULL */
next_lock = task_blocked_on_lock(owner);
- raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
+ raw_spin_unlock(&owner->pi_lock);
/*
* Even if full deadlock detection is on, if the owner is not
* blocked itself, we can avoid finding this out in the chain
@@ -957,12 +955,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
*/
get_task_struct(owner);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
next_lock, waiter, task);
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irq(&lock->wait_lock);
return res;
}
@@ -971,15 +969,14 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
* Remove the top waiter from the current tasks pi waiter tree and
* queue it up.
*
- * Called with lock->wait_lock held.
+ * Called with lock->wait_lock held and interrupts disabled.
*/
static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
struct rt_mutex *lock)
{
struct rt_mutex_waiter *waiter;
- unsigned long flags;
- raw_spin_lock_irqsave(&current->pi_lock, flags);
+ raw_spin_lock(&current->pi_lock);
waiter = rt_mutex_top_waiter(lock);
@@ -1001,7 +998,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
*/
lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
- raw_spin_unlock_irqrestore(&current->pi_lock, flags);
+ raw_spin_unlock(&current->pi_lock);
wake_q_add(wake_q, waiter->task);
}
@@ -1009,7 +1006,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
/*
* Remove a waiter from a lock and give up
*
- * Must be called with lock->wait_lock held and
+ * Must be called with lock->wait_lock held and interrupts disabled. I must
* have just failed to try_to_take_rt_mutex().
*/
static void remove_waiter(struct rt_mutex *lock,
@@ -1018,12 +1015,11 @@ static void remove_waiter(struct rt_mutex *lock,
bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
struct task_struct *owner = rt_mutex_owner(lock);
struct rt_mutex *next_lock;
- unsigned long flags;
- raw_spin_lock_irqsave(&current->pi_lock, flags);
+ raw_spin_lock(&current->pi_lock);
rt_mutex_dequeue(lock, waiter);
current->pi_blocked_on = NULL;
- raw_spin_unlock_irqrestore(&current->pi_lock, flags);
+ raw_spin_unlock(&current->pi_lock);
/*
* Only update priority if the waiter was the highest priority
@@ -1032,7 +1028,7 @@ static void remove_waiter(struct rt_mutex *lock,
if (!owner || !is_top_waiter)
return;
- raw_spin_lock_irqsave(&owner->pi_lock, flags);
+ raw_spin_lock(&owner->pi_lock);
rt_mutex_dequeue_pi(owner, waiter);
@@ -1044,7 +1040,7 @@ static void remove_waiter(struct rt_mutex *lock,
/* Store the lock on which owner is blocked or NULL */
next_lock = task_blocked_on_lock(owner);
- raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
+ raw_spin_unlock(&owner->pi_lock);
/*
* Don't walk the chain, if the owner task is not blocked
@@ -1056,12 +1052,12 @@ static void remove_waiter(struct rt_mutex *lock,
/* gets dropped in rt_mutex_adjust_prio_chain()! */
get_task_struct(owner);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock,
next_lock, NULL, current);
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irq(&lock->wait_lock);
}
/*
@@ -1097,11 +1093,11 @@ void rt_mutex_adjust_pi(struct task_struct *task)
* __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
* @lock: the rt_mutex to take
* @state: the state the task should block in (TASK_INTERRUPTIBLE
- * or TASK_UNINTERRUPTIBLE)
+ * or TASK_UNINTERRUPTIBLE)
* @timeout: the pre-initialized and started timer, or NULL for none
* @waiter: the pre-initialized rt_mutex_waiter
*
- * lock->wait_lock must be held by the caller.
+ * Must be called with lock->wait_lock held and interrupts disabled
*/
static int __sched
__rt_mutex_slowlock(struct rt_mutex *lock, int state,
@@ -1129,13 +1125,13 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
break;
}
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
debug_rt_mutex_print_deadlock(waiter);
schedule();
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irq(&lock->wait_lock);
set_current_state(state);
}
@@ -1172,17 +1168,26 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
enum rtmutex_chainwalk chwalk)
{
struct rt_mutex_waiter waiter;
+ unsigned long flags;
int ret = 0;
debug_rt_mutex_init_waiter(&waiter);
RB_CLEAR_NODE(&waiter.pi_tree_entry);
RB_CLEAR_NODE(&waiter.tree_entry);
- raw_spin_lock(&lock->wait_lock);
+ /*
+ * Technically we could use raw_spin_[un]lock_irq() here, but this can
+ * be called in early boot if the cmpxchg() fast path is disabled
+ * (debug, no architecture support). In this case we will acquire the
+ * rtmutex with lock->wait_lock held. But we cannot unconditionally
+ * enable interrupts in that early boot case. So we need to use the
+ * irqsave/restore variants.
+ */
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
/* Try to acquire the lock again: */
if (try_to_take_rt_mutex(lock, current, NULL)) {
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
return 0;
}
@@ -1211,7 +1216,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
*/
fixup_rt_mutex_waiters(lock);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
/* Remove pending timer: */
if (unlikely(timeout))
@@ -1227,6 +1232,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
*/
static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
{
+ unsigned long flags;
int ret;
/*
@@ -1238,10 +1244,10 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
return 0;
/*
- * The mutex has currently no owner. Lock the wait lock and
- * try to acquire the lock.
+ * The mutex has currently no owner. Lock the wait lock and try to
+ * acquire the lock. We use irqsave here to support early boot calls.
*/
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
ret = try_to_take_rt_mutex(lock, current, NULL);
@@ -1251,7 +1257,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
*/
fixup_rt_mutex_waiters(lock);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
return ret;
}
@@ -1263,7 +1269,10 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
struct wake_q_head *wake_q)
{
- raw_spin_lock(&lock->wait_lock);
+ unsigned long flags;
+
+ /* irqsave required to support early boot calls */
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
debug_rt_mutex_unlock(lock);
@@ -1302,10 +1311,10 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
*/
while (!rt_mutex_has_waiters(lock)) {
/* Drops lock->wait_lock ! */
- if (unlock_rt_mutex_safe(lock) == true)
+ if (unlock_rt_mutex_safe(lock, flags) == true)
return false;
/* Relock the rtmutex and try again */
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
}
/*
@@ -1316,7 +1325,7 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
*/
mark_wakeup_next_waiter(wake_q, lock);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
/* check PI boosting */
return true;
@@ -1596,10 +1605,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
{
int ret;
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irq(&lock->wait_lock);
if (try_to_take_rt_mutex(lock, task, NULL)) {
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
return 1;
}
@@ -1620,7 +1629,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
if (unlikely(ret))
remove_waiter(lock, waiter);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
debug_rt_mutex_print_deadlock(waiter);
@@ -1668,7 +1677,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
{
int ret;
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irq(&lock->wait_lock);
set_current_state(TASK_INTERRUPTIBLE);
@@ -1684,7 +1693,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
*/
fixup_rt_mutex_waiters(lock);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
return ret;
}