mirror of
https://github.com/ecency/ecency-mobile.git
synced 2024-12-20 20:01:56 +03:00
1508 lines
56 KiB
C++
1508 lines
56 KiB
C++
/*
|
|
* Copyright 2016 Facebook, Inc.
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
// @author Nathan Bronson (ngbronson@fb.com)
|
|
|
|
#pragma once
|
|
|
|
#include <stdint.h>
|
|
#include <atomic>
|
|
#include <thread>
|
|
#include <type_traits>
|
|
#include <folly/Likely.h>
|
|
#include <folly/detail/CacheLocality.h>
|
|
#include <folly/detail/Futex.h>
|
|
#include <folly/portability/Asm.h>
|
|
#include <folly/portability/SysResource.h>
|
|
|
|
// SharedMutex is a reader-writer lock. It is small, very fast, scalable
|
|
// on multi-core, and suitable for use when readers or writers may block.
|
|
// Unlike most other reader-writer locks, its throughput with concurrent
|
|
// readers scales linearly; it is able to acquire and release the lock
|
|
// in shared mode without cache line ping-ponging. It is suitable for
|
|
// a wide range of lock hold times because it starts with spinning,
|
|
// proceeds to using sched_yield with a preemption heuristic, and then
|
|
// waits using futex and precise wakeups.
|
|
//
|
|
// SharedMutex provides all of the methods of folly::RWSpinLock,
|
|
// boost::shared_mutex, boost::upgrade_mutex, and C++14's
|
|
// std::shared_timed_mutex. All operations that can block are available
|
|
// in try, try-for, and try-until (system_clock or steady_clock) versions.
|
|
//
|
|
// SharedMutexReadPriority gives priority to readers,
|
|
// SharedMutexWritePriority gives priority to writers. SharedMutex is an
|
|
// alias for SharedMutexWritePriority, because writer starvation is more
|
|
// likely than reader starvation for the read-heavy workloads targetted
|
|
// by SharedMutex.
|
|
//
|
|
// In my tests SharedMutex is as good or better than the other
|
|
// reader-writer locks in use at Facebook for almost all use cases,
|
|
// sometimes by a wide margin. (If it is rare that there are actually
|
|
// concurrent readers then RWSpinLock can be a few nanoseconds faster.)
|
|
// I compared it to folly::RWSpinLock, folly::RWTicketSpinLock64,
|
|
// boost::shared_mutex, pthread_rwlock_t, and a RWLock that internally uses
|
|
// spinlocks to guard state and pthread_mutex_t+pthread_cond_t to block.
|
|
// (Thrift's ReadWriteMutex is based underneath on pthread_rwlock_t.)
|
|
// It is generally as good or better than the rest when evaluating size,
|
|
// speed, scalability, or latency outliers. In the corner cases where
|
|
// it is not the fastest (such as single-threaded use or heavy write
|
|
// contention) it is never very much worse than the best. See the bottom
|
|
// of folly/test/SharedMutexTest.cpp for lots of microbenchmark results.
|
|
//
|
|
// Comparison to folly::RWSpinLock:
|
|
//
|
|
// * SharedMutex is faster than RWSpinLock when there are actually
|
|
// concurrent read accesses (sometimes much faster), and ~5 nanoseconds
|
|
// slower when there is not actually any contention. SharedMutex is
|
|
// faster in every (benchmarked) scenario where the shared mode of
|
|
// the lock is actually useful.
|
|
//
|
|
// * Concurrent shared access to SharedMutex scales linearly, while total
|
|
// RWSpinLock throughput drops as more threads try to access the lock
|
|
// in shared mode. Under very heavy read contention SharedMutex can
|
|
// be two orders of magnitude faster than RWSpinLock (or any reader
|
|
// writer lock that doesn't use striping or deferral).
|
|
//
|
|
// * SharedMutex can safely protect blocking calls, because after an
|
|
// initial period of spinning it waits using futex().
|
|
//
|
|
// * RWSpinLock prioritizes readers, SharedMutex has both reader- and
|
|
// writer-priority variants, but defaults to write priority.
|
|
//
|
|
// * RWSpinLock's upgradeable mode blocks new readers, while SharedMutex's
|
|
// doesn't. Both semantics are reasonable. The boost documentation
|
|
// doesn't explicitly talk about this behavior (except by omitting
|
|
// any statement that those lock modes conflict), but the boost
|
|
// implementations do allow new readers while the upgradeable mode
|
|
// is held. See https://github.com/boostorg/thread/blob/master/
|
|
// include/boost/thread/pthread/shared_mutex.hpp
|
|
//
|
|
// * RWSpinLock::UpgradedHolder maps to SharedMutex::UpgradeHolder
|
|
// (UpgradeableHolder would be even more pedantically correct).
|
|
// SharedMutex's holders have fewer methods (no reset) and are less
|
|
// tolerant (promotion and downgrade crash if the donor doesn't own
|
|
// the lock, and you must use the default constructor rather than
|
|
// passing a nullptr to the pointer constructor).
|
|
//
|
|
// Both SharedMutex and RWSpinLock provide "exclusive", "upgrade",
|
|
// and "shared" modes. At all times num_threads_holding_exclusive +
|
|
// num_threads_holding_upgrade <= 1, and num_threads_holding_exclusive ==
|
|
// 0 || num_threads_holding_shared == 0. RWSpinLock has the additional
|
|
// constraint that num_threads_holding_shared cannot increase while
|
|
// num_threads_holding_upgrade is non-zero.
|
|
//
|
|
// Comparison to the internal RWLock:
|
|
//
|
|
// * SharedMutex doesn't allow a maximum reader count to be configured,
|
|
// so it can't be used as a semaphore in the same way as RWLock.
|
|
//
|
|
// * SharedMutex is 4 bytes, RWLock is 256.
|
|
//
|
|
// * SharedMutex is as fast or faster than RWLock in all of my
|
|
// microbenchmarks, and has positive rather than negative scalability.
|
|
//
|
|
// * RWLock and SharedMutex are both writer priority locks.
|
|
//
|
|
// * SharedMutex avoids latency outliers as well as RWLock.
|
|
//
|
|
// * SharedMutex uses different names (t != 0 below):
|
|
//
|
|
// RWLock::lock(0) => SharedMutex::lock()
|
|
//
|
|
// RWLock::lock(t) => SharedMutex::try_lock_for(milliseconds(t))
|
|
//
|
|
// RWLock::tryLock() => SharedMutex::try_lock()
|
|
//
|
|
// RWLock::unlock() => SharedMutex::unlock()
|
|
//
|
|
// RWLock::enter(0) => SharedMutex::lock_shared()
|
|
//
|
|
// RWLock::enter(t) =>
|
|
// SharedMutex::try_lock_shared_for(milliseconds(t))
|
|
//
|
|
// RWLock::tryEnter() => SharedMutex::try_lock_shared()
|
|
//
|
|
// RWLock::leave() => SharedMutex::unlock_shared()
|
|
//
|
|
// * RWLock allows the reader count to be adjusted by a value other
|
|
// than 1 during enter() or leave(). SharedMutex doesn't currently
|
|
// implement this feature.
|
|
//
|
|
// * RWLock's methods are marked const, SharedMutex's aren't.
|
|
//
|
|
// Reader-writer locks have the potential to allow concurrent access
|
|
// to shared read-mostly data, but in practice they often provide no
|
|
// improvement over a mutex. The problem is the cache coherence protocol
|
|
// of modern CPUs. Coherence is provided by making sure that when a cache
|
|
// line is written it is present in only one core's cache. Since a memory
|
|
// write is required to acquire a reader-writer lock in shared mode, the
|
|
// cache line holding the lock is invalidated in all of the other caches.
|
|
// This leads to cache misses when another thread wants to acquire or
|
|
// release the lock concurrently. When the RWLock is colocated with the
|
|
// data it protects (common), cache misses can also continue occur when
|
|
// a thread that already holds the lock tries to read the protected data.
|
|
//
|
|
// Ideally, a reader-writer lock would allow multiple cores to acquire
|
|
// and release the lock in shared mode without incurring any cache misses.
|
|
// This requires that each core records its shared access in a cache line
|
|
// that isn't read or written by other read-locking cores. (Writers will
|
|
// have to check all of the cache lines.) Typical server hardware when
|
|
// this comment was written has 16 L1 caches and cache lines of 64 bytes,
|
|
// so a lock striped over all L1 caches would occupy a prohibitive 1024
|
|
// bytes. Nothing says that we need a separate set of per-core memory
|
|
// locations for each lock, however. Each SharedMutex instance is only
|
|
// 4 bytes, but all locks together share a 2K area in which they make a
|
|
// core-local record of lock acquisitions.
|
|
//
|
|
// SharedMutex's strategy of using a shared set of core-local stripes has
|
|
// a potential downside, because it means that acquisition of any lock in
|
|
// write mode can conflict with acquisition of any lock in shared mode.
|
|
// If a lock instance doesn't actually experience concurrency then this
|
|
// downside will outweight the upside of improved scalability for readers.
|
|
// To avoid this problem we dynamically detect concurrent accesses to
|
|
// SharedMutex, and don't start using the deferred mode unless we actually
|
|
// observe concurrency. See kNumSharedToStartDeferring.
|
|
//
|
|
// It is explicitly allowed to call lock_unshared() from a different
|
|
// thread than lock_shared(), so long as they are properly paired.
|
|
// lock_unshared() needs to find the location at which lock_shared()
|
|
// recorded the lock, which might be in the lock itself or in any of
|
|
// the shared slots. If you can conveniently pass state from lock
|
|
// acquisition to release then the fastest mechanism is to std::move
|
|
// the SharedMutex::ReadHolder instance or an SharedMutex::Token (using
|
|
// lock_shared(Token&) and unlock_shared(Token&)). The guard or token
|
|
// will tell unlock_shared where in deferredReaders[] to look for the
|
|
// deferred lock. The Token-less version of unlock_shared() works in all
|
|
// cases, but is optimized for the common (no inter-thread handoff) case.
|
|
//
|
|
// In both read- and write-priority mode, a waiting lock() (exclusive mode)
|
|
// only blocks readers after it has waited for an active upgrade lock to be
|
|
// released; until the upgrade lock is released (or upgraded or downgraded)
|
|
// readers will still be able to enter. Preferences about lock acquisition
|
|
// are not guaranteed to be enforced perfectly (even if they were, there
|
|
// is theoretically the chance that a thread could be arbitrarily suspended
|
|
// between calling lock() and SharedMutex code actually getting executed).
|
|
//
|
|
// try_*_for methods always try at least once, even if the duration
|
|
// is zero or negative. The duration type must be compatible with
|
|
// std::chrono::steady_clock. try_*_until methods also always try at
|
|
// least once. std::chrono::system_clock and std::chrono::steady_clock
|
|
// are supported.
|
|
//
|
|
// If you have observed by profiling that your SharedMutex-s are getting
|
|
// cache misses on deferredReaders[] due to another SharedMutex user, then
|
|
// you can use the tag type to create your own instantiation of the type.
|
|
// The contention threshold (see kNumSharedToStartDeferring) should make
|
|
// this unnecessary in all but the most extreme cases. Make sure to check
|
|
// that the increased icache and dcache footprint of the tagged result is
|
|
// worth it.
|
|
|
|
// SharedMutex's use of thread local storage is as an optimization, so
|
|
// for the case where thread local storage is not supported, define it
|
|
// away.
|
|
#ifndef FOLLY_SHAREDMUTEX_TLS
|
|
#if !FOLLY_MOBILE
|
|
#define FOLLY_SHAREDMUTEX_TLS FOLLY_TLS
|
|
#else
|
|
#define FOLLY_SHAREDMUTEX_TLS
|
|
#endif
|
|
#endif
|
|
|
|
namespace folly {
|
|
|
|
struct SharedMutexToken {
|
|
enum class Type : uint16_t {
|
|
INVALID = 0,
|
|
INLINE_SHARED,
|
|
DEFERRED_SHARED,
|
|
};
|
|
|
|
Type type_;
|
|
uint16_t slot_;
|
|
};
|
|
|
|
template <bool ReaderPriority,
|
|
typename Tag_ = void,
|
|
template <typename> class Atom = std::atomic,
|
|
bool BlockImmediately = false>
|
|
class SharedMutexImpl {
|
|
public:
|
|
static constexpr bool kReaderPriority = ReaderPriority;
|
|
typedef Tag_ Tag;
|
|
|
|
typedef SharedMutexToken Token;
|
|
|
|
class ReadHolder;
|
|
class UpgradeHolder;
|
|
class WriteHolder;
|
|
|
|
constexpr SharedMutexImpl() : state_(0) {}
|
|
|
|
SharedMutexImpl(const SharedMutexImpl&) = delete;
|
|
SharedMutexImpl(SharedMutexImpl&&) = delete;
|
|
SharedMutexImpl& operator = (const SharedMutexImpl&) = delete;
|
|
SharedMutexImpl& operator = (SharedMutexImpl&&) = delete;
|
|
|
|
// It is an error to destroy an SharedMutex that still has
|
|
// any outstanding locks. This is checked if NDEBUG isn't defined.
|
|
// SharedMutex's exclusive mode can be safely used to guard the lock's
|
|
// own destruction. If, for example, you acquire the lock in exclusive
|
|
// mode and then observe that the object containing the lock is no longer
|
|
// needed, you can unlock() and then immediately destroy the lock.
|
|
// See https://sourceware.org/bugzilla/show_bug.cgi?id=13690 for a
|
|
// description about why this property needs to be explicitly mentioned.
|
|
~SharedMutexImpl() {
|
|
auto state = state_.load(std::memory_order_relaxed);
|
|
if (UNLIKELY((state & kHasS) != 0)) {
|
|
cleanupTokenlessSharedDeferred(state);
|
|
}
|
|
|
|
#ifndef NDEBUG
|
|
// if a futexWait fails to go to sleep because the value has been
|
|
// changed, we don't necessarily clean up the wait bits, so it is
|
|
// possible they will be set here in a correct system
|
|
assert((state & ~(kWaitingAny | kMayDefer)) == 0);
|
|
if ((state & kMayDefer) != 0) {
|
|
for (uint32_t slot = 0; slot < kMaxDeferredReaders; ++slot) {
|
|
auto slotValue = deferredReader(slot)->load(std::memory_order_relaxed);
|
|
assert(!slotValueIsThis(slotValue));
|
|
}
|
|
}
|
|
#endif
|
|
}
|
|
|
|
void lock() {
|
|
WaitForever ctx;
|
|
(void)lockExclusiveImpl(kHasSolo, ctx);
|
|
}
|
|
|
|
bool try_lock() {
|
|
WaitNever ctx;
|
|
return lockExclusiveImpl(kHasSolo, ctx);
|
|
}
|
|
|
|
template <class Rep, class Period>
|
|
bool try_lock_for(const std::chrono::duration<Rep, Period>& duration) {
|
|
WaitForDuration<Rep, Period> ctx(duration);
|
|
return lockExclusiveImpl(kHasSolo, ctx);
|
|
}
|
|
|
|
template <class Clock, class Duration>
|
|
bool try_lock_until(
|
|
const std::chrono::time_point<Clock, Duration>& absDeadline) {
|
|
WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
|
|
return lockExclusiveImpl(kHasSolo, ctx);
|
|
}
|
|
|
|
void unlock() {
|
|
// It is possible that we have a left-over kWaitingNotS if the last
|
|
// unlock_shared() that let our matching lock() complete finished
|
|
// releasing before lock()'s futexWait went to sleep. Clean it up now
|
|
auto state = (state_ &= ~(kWaitingNotS | kPrevDefer | kHasE));
|
|
assert((state & ~kWaitingAny) == 0);
|
|
wakeRegisteredWaiters(state, kWaitingE | kWaitingU | kWaitingS);
|
|
}
|
|
|
|
// Managing the token yourself makes unlock_shared a bit faster
|
|
|
|
void lock_shared() {
|
|
WaitForever ctx;
|
|
(void)lockSharedImpl(nullptr, ctx);
|
|
}
|
|
|
|
void lock_shared(Token& token) {
|
|
WaitForever ctx;
|
|
(void)lockSharedImpl(&token, ctx);
|
|
}
|
|
|
|
bool try_lock_shared() {
|
|
WaitNever ctx;
|
|
return lockSharedImpl(nullptr, ctx);
|
|
}
|
|
|
|
bool try_lock_shared(Token& token) {
|
|
WaitNever ctx;
|
|
return lockSharedImpl(&token, ctx);
|
|
}
|
|
|
|
template <class Rep, class Period>
|
|
bool try_lock_shared_for(const std::chrono::duration<Rep, Period>& duration) {
|
|
WaitForDuration<Rep, Period> ctx(duration);
|
|
return lockSharedImpl(nullptr, ctx);
|
|
}
|
|
|
|
template <class Rep, class Period>
|
|
bool try_lock_shared_for(const std::chrono::duration<Rep, Period>& duration,
|
|
Token& token) {
|
|
WaitForDuration<Rep, Period> ctx(duration);
|
|
return lockSharedImpl(&token, ctx);
|
|
}
|
|
|
|
template <class Clock, class Duration>
|
|
bool try_lock_shared_until(
|
|
const std::chrono::time_point<Clock, Duration>& absDeadline) {
|
|
WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
|
|
return lockSharedImpl(nullptr, ctx);
|
|
}
|
|
|
|
template <class Clock, class Duration>
|
|
bool try_lock_shared_until(
|
|
const std::chrono::time_point<Clock, Duration>& absDeadline,
|
|
Token& token) {
|
|
WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
|
|
return lockSharedImpl(&token, ctx);
|
|
}
|
|
|
|
void unlock_shared() {
|
|
auto state = state_.load(std::memory_order_acquire);
|
|
|
|
// kPrevDefer can only be set if HasE or BegunE is set
|
|
assert((state & (kPrevDefer | kHasE | kBegunE)) != kPrevDefer);
|
|
|
|
// lock() strips kMayDefer immediately, but then copies it to
|
|
// kPrevDefer so we can tell if the pre-lock() lock_shared() might
|
|
// have deferred
|
|
if ((state & (kMayDefer | kPrevDefer)) == 0 ||
|
|
!tryUnlockTokenlessSharedDeferred()) {
|
|
// Matching lock_shared() couldn't have deferred, or the deferred
|
|
// lock has already been inlined by applyDeferredReaders()
|
|
unlockSharedInline();
|
|
}
|
|
}
|
|
|
|
void unlock_shared(Token& token) {
|
|
assert(token.type_ == Token::Type::INLINE_SHARED ||
|
|
token.type_ == Token::Type::DEFERRED_SHARED);
|
|
|
|
if (token.type_ != Token::Type::DEFERRED_SHARED ||
|
|
!tryUnlockSharedDeferred(token.slot_)) {
|
|
unlockSharedInline();
|
|
}
|
|
#ifndef NDEBUG
|
|
token.type_ = Token::Type::INVALID;
|
|
#endif
|
|
}
|
|
|
|
void unlock_and_lock_shared() {
|
|
// We can't use state_ -=, because we need to clear 2 bits (1 of which
|
|
// has an uncertain initial state) and set 1 other. We might as well
|
|
// clear the relevant wake bits at the same time. Note that since S
|
|
// doesn't block the beginning of a transition to E (writer priority
|
|
// can cut off new S, reader priority grabs BegunE and blocks deferred
|
|
// S) we need to wake E as well.
|
|
auto state = state_.load(std::memory_order_acquire);
|
|
do {
|
|
assert((state & ~(kWaitingAny | kPrevDefer)) == kHasE);
|
|
} while (!state_.compare_exchange_strong(
|
|
state, (state & ~(kWaitingAny | kPrevDefer | kHasE)) + kIncrHasS));
|
|
if ((state & (kWaitingE | kWaitingU | kWaitingS)) != 0) {
|
|
futexWakeAll(kWaitingE | kWaitingU | kWaitingS);
|
|
}
|
|
}
|
|
|
|
void unlock_and_lock_shared(Token& token) {
|
|
unlock_and_lock_shared();
|
|
token.type_ = Token::Type::INLINE_SHARED;
|
|
}
|
|
|
|
void lock_upgrade() {
|
|
WaitForever ctx;
|
|
(void)lockUpgradeImpl(ctx);
|
|
}
|
|
|
|
bool try_lock_upgrade() {
|
|
WaitNever ctx;
|
|
return lockUpgradeImpl(ctx);
|
|
}
|
|
|
|
template <class Rep, class Period>
|
|
bool try_lock_upgrade_for(
|
|
const std::chrono::duration<Rep, Period>& duration) {
|
|
WaitForDuration<Rep, Period> ctx(duration);
|
|
return lockUpgradeImpl(ctx);
|
|
}
|
|
|
|
template <class Clock, class Duration>
|
|
bool try_lock_upgrade_until(
|
|
const std::chrono::time_point<Clock, Duration>& absDeadline) {
|
|
WaitUntilDeadline<Clock, Duration> ctx{absDeadline};
|
|
return lockUpgradeImpl(ctx);
|
|
}
|
|
|
|
void unlock_upgrade() {
|
|
auto state = (state_ -= kHasU);
|
|
assert((state & (kWaitingNotS | kHasSolo)) == 0);
|
|
wakeRegisteredWaiters(state, kWaitingE | kWaitingU);
|
|
}
|
|
|
|
void unlock_upgrade_and_lock() {
|
|
// no waiting necessary, so waitMask is empty
|
|
WaitForever ctx;
|
|
(void)lockExclusiveImpl(0, ctx);
|
|
}
|
|
|
|
void unlock_upgrade_and_lock_shared() {
|
|
auto state = (state_ -= kHasU - kIncrHasS);
|
|
assert((state & (kWaitingNotS | kHasSolo)) == 0);
|
|
wakeRegisteredWaiters(state, kWaitingE | kWaitingU);
|
|
}
|
|
|
|
void unlock_upgrade_and_lock_shared(Token& token) {
|
|
unlock_upgrade_and_lock_shared();
|
|
token.type_ = Token::Type::INLINE_SHARED;
|
|
}
|
|
|
|
void unlock_and_lock_upgrade() {
|
|
// We can't use state_ -=, because we need to clear 2 bits (1 of
|
|
// which has an uncertain initial state) and set 1 other. We might
|
|
// as well clear the relevant wake bits at the same time.
|
|
auto state = state_.load(std::memory_order_acquire);
|
|
while (true) {
|
|
assert((state & ~(kWaitingAny | kPrevDefer)) == kHasE);
|
|
auto after =
|
|
(state & ~(kWaitingNotS | kWaitingS | kPrevDefer | kHasE)) + kHasU;
|
|
if (state_.compare_exchange_strong(state, after)) {
|
|
if ((state & kWaitingS) != 0) {
|
|
futexWakeAll(kWaitingS);
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
private:
|
|
typedef typename folly::detail::Futex<Atom> Futex;
|
|
|
|
// Internally we use four kinds of wait contexts. These are structs
|
|
// that provide a doWait method that returns true if a futex wake
|
|
// was issued that intersects with the waitMask, false if there was a
|
|
// timeout and no more waiting should be performed. Spinning occurs
|
|
// before the wait context is invoked.
|
|
|
|
struct WaitForever {
|
|
bool canBlock() { return true; }
|
|
bool canTimeOut() { return false; }
|
|
bool shouldTimeOut() { return false; }
|
|
|
|
bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) {
|
|
futex.futexWait(expected, waitMask);
|
|
return true;
|
|
}
|
|
};
|
|
|
|
struct WaitNever {
|
|
bool canBlock() { return false; }
|
|
bool canTimeOut() { return true; }
|
|
bool shouldTimeOut() { return true; }
|
|
|
|
bool doWait(Futex& /* futex */,
|
|
uint32_t /* expected */,
|
|
uint32_t /* waitMask */) {
|
|
return false;
|
|
}
|
|
};
|
|
|
|
template <class Rep, class Period>
|
|
struct WaitForDuration {
|
|
std::chrono::duration<Rep, Period> duration_;
|
|
bool deadlineComputed_;
|
|
std::chrono::steady_clock::time_point deadline_;
|
|
|
|
explicit WaitForDuration(const std::chrono::duration<Rep, Period>& duration)
|
|
: duration_(duration), deadlineComputed_(false) {}
|
|
|
|
std::chrono::steady_clock::time_point deadline() {
|
|
if (!deadlineComputed_) {
|
|
deadline_ = std::chrono::steady_clock::now() + duration_;
|
|
deadlineComputed_ = true;
|
|
}
|
|
return deadline_;
|
|
}
|
|
|
|
bool canBlock() { return duration_.count() > 0; }
|
|
bool canTimeOut() { return true; }
|
|
|
|
bool shouldTimeOut() {
|
|
return std::chrono::steady_clock::now() > deadline();
|
|
}
|
|
|
|
bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) {
|
|
auto result = futex.futexWaitUntil(expected, deadline(), waitMask);
|
|
return result != folly::detail::FutexResult::TIMEDOUT;
|
|
}
|
|
};
|
|
|
|
template <class Clock, class Duration>
|
|
struct WaitUntilDeadline {
|
|
std::chrono::time_point<Clock, Duration> absDeadline_;
|
|
|
|
bool canBlock() { return true; }
|
|
bool canTimeOut() { return true; }
|
|
bool shouldTimeOut() { return Clock::now() > absDeadline_; }
|
|
|
|
bool doWait(Futex& futex, uint32_t expected, uint32_t waitMask) {
|
|
auto result = futex.futexWaitUntil(expected, absDeadline_, waitMask);
|
|
return result != folly::detail::FutexResult::TIMEDOUT;
|
|
}
|
|
};
|
|
|
|
// 32 bits of state
|
|
Futex state_;
|
|
|
|
// S count needs to be on the end, because we explicitly allow it to
|
|
// underflow. This can occur while we are in the middle of applying
|
|
// deferred locks (we remove them from deferredReaders[] before
|
|
// inlining them), or during token-less unlock_shared() if a racing
|
|
// lock_shared();unlock_shared() moves the deferredReaders slot while
|
|
// the first unlock_shared() is scanning. The former case is cleaned
|
|
// up before we finish applying the locks. The latter case can persist
|
|
// until destruction, when it is cleaned up.
|
|
static constexpr uint32_t kIncrHasS = 1 << 10;
|
|
static constexpr uint32_t kHasS = ~(kIncrHasS - 1);
|
|
|
|
// If false, then there are definitely no deferred read locks for this
|
|
// instance. Cleared after initialization and when exclusively locked.
|
|
static constexpr uint32_t kMayDefer = 1 << 9;
|
|
|
|
// lock() cleared kMayDefer as soon as it starts draining readers (so
|
|
// that it doesn't have to do a second CAS once drain completes), but
|
|
// unlock_shared() still needs to know whether to scan deferredReaders[]
|
|
// or not. We copy kMayDefer to kPrevDefer when setting kHasE or
|
|
// kBegunE, and clear it when clearing those bits.
|
|
static constexpr uint32_t kPrevDefer = 1 << 8;
|
|
|
|
// Exclusive-locked blocks all read locks and write locks. This bit
|
|
// may be set before all readers have finished, but in that case the
|
|
// thread that sets it won't return to the caller until all read locks
|
|
// have been released.
|
|
static constexpr uint32_t kHasE = 1 << 7;
|
|
|
|
// Exclusive-draining means that lock() is waiting for existing readers
|
|
// to leave, but that new readers may still acquire shared access.
|
|
// This is only used in reader priority mode. New readers during
|
|
// drain must be inline. The difference between this and kHasU is that
|
|
// kBegunE prevents kMayDefer from being set.
|
|
static constexpr uint32_t kBegunE = 1 << 6;
|
|
|
|
// At most one thread may have either exclusive or upgrade lock
|
|
// ownership. Unlike exclusive mode, ownership of the lock in upgrade
|
|
// mode doesn't preclude other threads holding the lock in shared mode.
|
|
// boost's concept for this doesn't explicitly say whether new shared
|
|
// locks can be acquired one lock_upgrade has succeeded, but doesn't
|
|
// list that as disallowed. RWSpinLock disallows new read locks after
|
|
// lock_upgrade has been acquired, but the boost implementation doesn't.
|
|
// We choose the latter.
|
|
static constexpr uint32_t kHasU = 1 << 5;
|
|
|
|
// There are three states that we consider to be "solo", in that they
|
|
// cannot coexist with other solo states. These are kHasE, kBegunE,
|
|
// and kHasU. Note that S doesn't conflict with any of these, because
|
|
// setting the kHasE is only one of the two steps needed to actually
|
|
// acquire the lock in exclusive mode (the other is draining the existing
|
|
// S holders).
|
|
static constexpr uint32_t kHasSolo = kHasE | kBegunE | kHasU;
|
|
|
|
// Once a thread sets kHasE it needs to wait for the current readers
|
|
// to exit the lock. We give this a separate wait identity from the
|
|
// waiting to set kHasE so that we can perform partial wakeups (wake
|
|
// one instead of wake all).
|
|
static constexpr uint32_t kWaitingNotS = 1 << 4;
|
|
|
|
// When waking writers we can either wake them all, in which case we
|
|
// can clear kWaitingE, or we can call futexWake(1). futexWake tells
|
|
// us if anybody woke up, but even if we detect that nobody woke up we
|
|
// can't clear the bit after the fact without issuing another wakeup.
|
|
// To avoid thundering herds when there are lots of pending lock()
|
|
// without needing to call futexWake twice when there is only one
|
|
// waiter, kWaitingE actually encodes if we have observed multiple
|
|
// concurrent waiters. Tricky: ABA issues on futexWait mean that when
|
|
// we see kWaitingESingle we can't assume that there is only one.
|
|
static constexpr uint32_t kWaitingESingle = 1 << 2;
|
|
static constexpr uint32_t kWaitingEMultiple = 1 << 3;
|
|
static constexpr uint32_t kWaitingE = kWaitingESingle | kWaitingEMultiple;
|
|
|
|
// kWaitingU is essentially a 1 bit saturating counter. It always
|
|
// requires a wakeAll.
|
|
static constexpr uint32_t kWaitingU = 1 << 1;
|
|
|
|
// All blocked lock_shared() should be awoken, so it is correct (not
|
|
// suboptimal) to wakeAll if there are any shared readers.
|
|
static constexpr uint32_t kWaitingS = 1 << 0;
|
|
|
|
// kWaitingAny is a mask of all of the bits that record the state of
|
|
// threads, rather than the state of the lock. It is convenient to be
|
|
// able to mask them off during asserts.
|
|
static constexpr uint32_t kWaitingAny =
|
|
kWaitingNotS | kWaitingE | kWaitingU | kWaitingS;
|
|
|
|
// The reader count at which a reader will attempt to use the lock
|
|
// in deferred mode. If this value is 2, then the second concurrent
|
|
// reader will set kMayDefer and use deferredReaders[]. kMayDefer is
|
|
// cleared during exclusive access, so this threshold must be reached
|
|
// each time a lock is held in exclusive mode.
|
|
static constexpr uint32_t kNumSharedToStartDeferring = 2;
|
|
|
|
// The typical number of spins that a thread will wait for a state
|
|
// transition. There is no bound on the number of threads that can wait
|
|
// for a writer, so we are pretty conservative here to limit the chance
|
|
// that we are starving the writer of CPU. Each spin is 6 or 7 nanos,
|
|
// almost all of which is in the pause instruction.
|
|
static constexpr uint32_t kMaxSpinCount = !BlockImmediately ? 1000 : 2;
|
|
|
|
// The maximum number of soft yields before falling back to futex.
|
|
// If the preemption heuristic is activated we will fall back before
|
|
// this. A soft yield takes ~900 nanos (two sched_yield plus a call
|
|
// to getrusage, with checks of the goal at each step). Soft yields
|
|
// aren't compatible with deterministic execution under test (unlike
|
|
// futexWaitUntil, which has a capricious but deterministic back end).
|
|
static constexpr uint32_t kMaxSoftYieldCount = !BlockImmediately ? 1000 : 0;
|
|
|
|
// If AccessSpreader assigns indexes from 0..k*n-1 on a system where some
|
|
// level of the memory hierarchy is symmetrically divided into k pieces
|
|
// (NUMA nodes, last-level caches, L1 caches, ...), then slot indexes
|
|
// that are the same after integer division by k share that resource.
|
|
// Our strategy for deferred readers is to probe up to numSlots/4 slots,
|
|
// using the full granularity of AccessSpreader for the start slot
|
|
// and then search outward. We can use AccessSpreader::current(n)
|
|
// without managing our own spreader if kMaxDeferredReaders <=
|
|
// AccessSpreader::kMaxCpus, which is currently 128.
|
|
//
|
|
// Our 2-socket E5-2660 machines have 8 L1 caches on each chip,
|
|
// with 64 byte cache lines. That means we need 64*16 bytes of
|
|
// deferredReaders[] to give each L1 its own playground. On x86_64
|
|
// each DeferredReaderSlot is 8 bytes, so we need kMaxDeferredReaders
|
|
// * kDeferredSeparationFactor >= 64 * 16 / 8 == 128. If
|
|
// kDeferredSearchDistance * kDeferredSeparationFactor <=
|
|
// 64 / 8 then we will search only within a single cache line, which
|
|
// guarantees we won't have inter-L1 contention. We give ourselves
|
|
// a factor of 2 on the core count, which should hold us for a couple
|
|
// processor generations. deferredReaders[] is 2048 bytes currently.
|
|
public:
|
|
static constexpr uint32_t kMaxDeferredReaders = 64;
|
|
static constexpr uint32_t kDeferredSearchDistance = 2;
|
|
static constexpr uint32_t kDeferredSeparationFactor = 4;
|
|
|
|
private:
|
|
|
|
static_assert(!(kMaxDeferredReaders & (kMaxDeferredReaders - 1)),
|
|
"kMaxDeferredReaders must be a power of 2");
|
|
static_assert(!(kDeferredSearchDistance & (kDeferredSearchDistance - 1)),
|
|
"kDeferredSearchDistance must be a power of 2");
|
|
|
|
// The number of deferred locks that can be simultaneously acquired
|
|
// by a thread via the token-less methods without performing any heap
|
|
// allocations. Each of these costs 3 pointers (24 bytes, probably)
|
|
// per thread. There's not much point in making this larger than
|
|
// kDeferredSearchDistance.
|
|
static constexpr uint32_t kTokenStackTLSCapacity = 2;
|
|
|
|
// We need to make sure that if there is a lock_shared()
|
|
// and lock_shared(token) followed by unlock_shared() and
|
|
// unlock_shared(token), the token-less unlock doesn't null
|
|
// out deferredReaders[token.slot_]. If we allowed that, then
|
|
// unlock_shared(token) wouldn't be able to assume that its lock
|
|
// had been inlined by applyDeferredReaders when it finds that
|
|
// deferredReaders[token.slot_] no longer points to this. We accomplish
|
|
// this by stealing bit 0 from the pointer to record that the slot's
|
|
// element has no token, hence our use of uintptr_t in deferredReaders[].
|
|
static constexpr uintptr_t kTokenless = 0x1;
|
|
|
|
// This is the starting location for Token-less unlock_shared().
|
|
static FOLLY_SHAREDMUTEX_TLS uint32_t tls_lastTokenlessSlot;
|
|
|
|
// Last deferred reader slot used.
|
|
static FOLLY_SHAREDMUTEX_TLS uint32_t tls_lastDeferredReaderSlot;
|
|
|
|
|
|
// Only indexes divisible by kDeferredSeparationFactor are used.
|
|
// If any of those elements points to a SharedMutexImpl, then it
|
|
// should be considered that there is a shared lock on that instance.
|
|
// See kTokenless.
|
|
public:
|
|
typedef Atom<uintptr_t> DeferredReaderSlot;
|
|
|
|
private:
|
|
FOLLY_ALIGN_TO_AVOID_FALSE_SHARING static DeferredReaderSlot deferredReaders
|
|
[kMaxDeferredReaders *
|
|
kDeferredSeparationFactor];
|
|
|
|
// Performs an exclusive lock, waiting for state_ & waitMask to be
|
|
// zero first
|
|
template <class WaitContext>
|
|
bool lockExclusiveImpl(uint32_t preconditionGoalMask, WaitContext& ctx) {
|
|
uint32_t state = state_.load(std::memory_order_acquire);
|
|
if (LIKELY(
|
|
(state & (preconditionGoalMask | kMayDefer | kHasS)) == 0 &&
|
|
state_.compare_exchange_strong(state, (state | kHasE) & ~kHasU))) {
|
|
return true;
|
|
} else {
|
|
return lockExclusiveImpl(state, preconditionGoalMask, ctx);
|
|
}
|
|
}
|
|
|
|
template <class WaitContext>
|
|
bool lockExclusiveImpl(uint32_t& state,
|
|
uint32_t preconditionGoalMask,
|
|
WaitContext& ctx) {
|
|
while (true) {
|
|
if (UNLIKELY((state & preconditionGoalMask) != 0) &&
|
|
!waitForZeroBits(state, preconditionGoalMask, kWaitingE, ctx) &&
|
|
ctx.canTimeOut()) {
|
|
return false;
|
|
}
|
|
|
|
uint32_t after = (state & kMayDefer) == 0 ? 0 : kPrevDefer;
|
|
if (!ReaderPriority || (state & (kMayDefer | kHasS)) == 0) {
|
|
// Block readers immediately, either because we are in write
|
|
// priority mode or because we can acquire the lock in one
|
|
// step. Note that if state has kHasU, then we are doing an
|
|
// unlock_upgrade_and_lock() and we should clear it (reader
|
|
// priority branch also does this).
|
|
after |= (state | kHasE) & ~(kHasU | kMayDefer);
|
|
} else {
|
|
after |= (state | kBegunE) & ~(kHasU | kMayDefer);
|
|
}
|
|
if (state_.compare_exchange_strong(state, after)) {
|
|
auto before = state;
|
|
state = after;
|
|
|
|
// If we set kHasE (writer priority) then no new readers can
|
|
// arrive. If we set kBegunE then they can still enter, but
|
|
// they must be inline. Either way we need to either spin on
|
|
// deferredReaders[] slots, or inline them so that we can wait on
|
|
// kHasS to zero itself. deferredReaders[] is pointers, which on
|
|
// x86_64 are bigger than futex() can handle, so we inline the
|
|
// deferred locks instead of trying to futexWait on each slot.
|
|
// Readers are responsible for rechecking state_ after recording
|
|
// a deferred read to avoid atomicity problems between the state_
|
|
// CAS and applyDeferredReader's reads of deferredReaders[].
|
|
if (UNLIKELY((before & kMayDefer) != 0)) {
|
|
applyDeferredReaders(state, ctx);
|
|
}
|
|
while (true) {
|
|
assert((state & (kHasE | kBegunE)) != 0 && (state & kHasU) == 0);
|
|
if (UNLIKELY((state & kHasS) != 0) &&
|
|
!waitForZeroBits(state, kHasS, kWaitingNotS, ctx) &&
|
|
ctx.canTimeOut()) {
|
|
// Ugh. We blocked new readers and other writers for a while,
|
|
// but were unable to complete. Move on. On the plus side
|
|
// we can clear kWaitingNotS because nobody else can piggyback
|
|
// on it.
|
|
state = (state_ &= ~(kPrevDefer | kHasE | kBegunE | kWaitingNotS));
|
|
wakeRegisteredWaiters(state, kWaitingE | kWaitingU | kWaitingS);
|
|
return false;
|
|
}
|
|
|
|
if (ReaderPriority && (state & kHasE) == 0) {
|
|
assert((state & kBegunE) != 0);
|
|
if (!state_.compare_exchange_strong(state,
|
|
(state & ~kBegunE) | kHasE)) {
|
|
continue;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
template <class WaitContext>
|
|
bool waitForZeroBits(uint32_t& state,
|
|
uint32_t goal,
|
|
uint32_t waitMask,
|
|
WaitContext& ctx) {
|
|
uint32_t spinCount = 0;
|
|
while (true) {
|
|
state = state_.load(std::memory_order_acquire);
|
|
if ((state & goal) == 0) {
|
|
return true;
|
|
}
|
|
asm_volatile_pause();
|
|
++spinCount;
|
|
if (UNLIKELY(spinCount >= kMaxSpinCount)) {
|
|
return ctx.canBlock() &&
|
|
yieldWaitForZeroBits(state, goal, waitMask, ctx);
|
|
}
|
|
}
|
|
}
|
|
|
|
template <class WaitContext>
|
|
bool yieldWaitForZeroBits(uint32_t& state,
|
|
uint32_t goal,
|
|
uint32_t waitMask,
|
|
WaitContext& ctx) {
|
|
#ifdef RUSAGE_THREAD
|
|
struct rusage usage;
|
|
long before = -1;
|
|
#endif
|
|
for (uint32_t yieldCount = 0; yieldCount < kMaxSoftYieldCount;
|
|
++yieldCount) {
|
|
for (int softState = 0; softState < 3; ++softState) {
|
|
if (softState < 2) {
|
|
std::this_thread::yield();
|
|
} else {
|
|
#ifdef RUSAGE_THREAD
|
|
getrusage(RUSAGE_THREAD, &usage);
|
|
#endif
|
|
}
|
|
if (((state = state_.load(std::memory_order_acquire)) & goal) == 0) {
|
|
return true;
|
|
}
|
|
if (ctx.shouldTimeOut()) {
|
|
return false;
|
|
}
|
|
}
|
|
#ifdef RUSAGE_THREAD
|
|
if (before >= 0 && usage.ru_nivcsw >= before + 2) {
|
|
// One involuntary csw might just be occasional background work,
|
|
// but if we get two in a row then we guess that there is someone
|
|
// else who can profitably use this CPU. Fall back to futex
|
|
break;
|
|
}
|
|
before = usage.ru_nivcsw;
|
|
#endif
|
|
}
|
|
return futexWaitForZeroBits(state, goal, waitMask, ctx);
|
|
}
|
|
|
|
template <class WaitContext>
|
|
bool futexWaitForZeroBits(uint32_t& state,
|
|
uint32_t goal,
|
|
uint32_t waitMask,
|
|
WaitContext& ctx) {
|
|
assert(waitMask == kWaitingNotS || waitMask == kWaitingE ||
|
|
waitMask == kWaitingU || waitMask == kWaitingS);
|
|
|
|
while (true) {
|
|
state = state_.load(std::memory_order_acquire);
|
|
if ((state & goal) == 0) {
|
|
return true;
|
|
}
|
|
|
|
auto after = state;
|
|
if (waitMask == kWaitingE) {
|
|
if ((state & kWaitingESingle) != 0) {
|
|
after |= kWaitingEMultiple;
|
|
} else {
|
|
after |= kWaitingESingle;
|
|
}
|
|
} else {
|
|
after |= waitMask;
|
|
}
|
|
|
|
// CAS is better than atomic |= here, because it lets us avoid
|
|
// setting the wait flag when the goal is concurrently achieved
|
|
if (after != state && !state_.compare_exchange_strong(state, after)) {
|
|
continue;
|
|
}
|
|
|
|
if (!ctx.doWait(state_, after, waitMask)) {
|
|
// timed out
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Wakes up waiters registered in state_ as appropriate, clearing the
|
|
// awaiting bits for anybody that was awoken. Tries to perform direct
|
|
// single wakeup of an exclusive waiter if appropriate
|
|
void wakeRegisteredWaiters(uint32_t& state, uint32_t wakeMask) {
|
|
if (UNLIKELY((state & wakeMask) != 0)) {
|
|
wakeRegisteredWaitersImpl(state, wakeMask);
|
|
}
|
|
}
|
|
|
|
void wakeRegisteredWaitersImpl(uint32_t& state, uint32_t wakeMask) {
|
|
// If there are multiple lock() pending only one of them will actually
|
|
// get to wake up, so issuing futexWakeAll will make a thundering herd.
|
|
// There's nothing stopping us from issuing futexWake(1) instead,
|
|
// so long as the wait bits are still an accurate reflection of
|
|
// the waiters. If we notice (via futexWake's return value) that
|
|
// nobody woke up then we can try again with the normal wake-all path.
|
|
// Note that we can't just clear the bits at that point; we need to
|
|
// clear the bits and then issue another wakeup.
|
|
//
|
|
// It is possible that we wake an E waiter but an outside S grabs the
|
|
// lock instead, at which point we should wake pending U and S waiters.
|
|
// Rather than tracking state to make the failing E regenerate the
|
|
// wakeup, we just disable the optimization in the case that there
|
|
// are waiting U or S that we are eligible to wake.
|
|
if ((wakeMask & kWaitingE) == kWaitingE &&
|
|
(state & wakeMask) == kWaitingE &&
|
|
state_.futexWake(1, kWaitingE) > 0) {
|
|
// somebody woke up, so leave state_ as is and clear it later
|
|
return;
|
|
}
|
|
|
|
if ((state & wakeMask) != 0) {
|
|
auto prev = state_.fetch_and(~wakeMask);
|
|
if ((prev & wakeMask) != 0) {
|
|
futexWakeAll(wakeMask);
|
|
}
|
|
state = prev & ~wakeMask;
|
|
}
|
|
}
|
|
|
|
void futexWakeAll(uint32_t wakeMask) {
|
|
state_.futexWake(std::numeric_limits<int>::max(), wakeMask);
|
|
}
|
|
|
|
DeferredReaderSlot* deferredReader(uint32_t slot) {
|
|
return &deferredReaders[slot * kDeferredSeparationFactor];
|
|
}
|
|
|
|
uintptr_t tokenfulSlotValue() { return reinterpret_cast<uintptr_t>(this); }
|
|
|
|
uintptr_t tokenlessSlotValue() { return tokenfulSlotValue() | kTokenless; }
|
|
|
|
bool slotValueIsThis(uintptr_t slotValue) {
|
|
return (slotValue & ~kTokenless) == tokenfulSlotValue();
|
|
}
|
|
|
|
// Clears any deferredReaders[] that point to this, adjusting the inline
|
|
// shared lock count to compensate. Does some spinning and yielding
|
|
// to avoid the work. Always finishes the application, even if ctx
|
|
// times out.
|
|
template <class WaitContext>
|
|
void applyDeferredReaders(uint32_t& state, WaitContext& ctx) {
|
|
uint32_t slot = 0;
|
|
|
|
uint32_t spinCount = 0;
|
|
while (true) {
|
|
while (!slotValueIsThis(
|
|
deferredReader(slot)->load(std::memory_order_acquire))) {
|
|
if (++slot == kMaxDeferredReaders) {
|
|
return;
|
|
}
|
|
}
|
|
asm_pause();
|
|
if (UNLIKELY(++spinCount >= kMaxSpinCount)) {
|
|
applyDeferredReaders(state, ctx, slot);
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
template <class WaitContext>
|
|
void applyDeferredReaders(uint32_t& state, WaitContext& ctx, uint32_t slot) {
|
|
|
|
#ifdef RUSAGE_THREAD
|
|
struct rusage usage;
|
|
long before = -1;
|
|
#endif
|
|
for (uint32_t yieldCount = 0; yieldCount < kMaxSoftYieldCount;
|
|
++yieldCount) {
|
|
for (int softState = 0; softState < 3; ++softState) {
|
|
if (softState < 2) {
|
|
std::this_thread::yield();
|
|
} else {
|
|
#ifdef RUSAGE_THREAD
|
|
getrusage(RUSAGE_THREAD, &usage);
|
|
#endif
|
|
}
|
|
while (!slotValueIsThis(
|
|
deferredReader(slot)->load(std::memory_order_acquire))) {
|
|
if (++slot == kMaxDeferredReaders) {
|
|
return;
|
|
}
|
|
}
|
|
if (ctx.shouldTimeOut()) {
|
|
// finish applying immediately on timeout
|
|
break;
|
|
}
|
|
}
|
|
#ifdef RUSAGE_THREAD
|
|
if (before >= 0 && usage.ru_nivcsw >= before + 2) {
|
|
// heuristic says run queue is not empty
|
|
break;
|
|
}
|
|
before = usage.ru_nivcsw;
|
|
#endif
|
|
}
|
|
|
|
uint32_t movedSlotCount = 0;
|
|
for (; slot < kMaxDeferredReaders; ++slot) {
|
|
auto slotPtr = deferredReader(slot);
|
|
auto slotValue = slotPtr->load(std::memory_order_acquire);
|
|
if (slotValueIsThis(slotValue) &&
|
|
slotPtr->compare_exchange_strong(slotValue, 0)) {
|
|
++movedSlotCount;
|
|
}
|
|
}
|
|
|
|
if (movedSlotCount > 0) {
|
|
state = (state_ += movedSlotCount * kIncrHasS);
|
|
}
|
|
assert((state & (kHasE | kBegunE)) != 0);
|
|
|
|
// if state + kIncrHasS overflows (off the end of state) then either
|
|
// we have 2^(32-9) readers (almost certainly an application bug)
|
|
// or we had an underflow (also a bug)
|
|
assert(state < state + kIncrHasS);
|
|
}
|
|
|
|
// It is straightfoward to make a token-less lock_shared() and
|
|
// unlock_shared() either by making the token-less version always use
|
|
// INLINE_SHARED mode or by removing the token version. Supporting
|
|
// deferred operation for both types is trickier than it appears, because
|
|
// the purpose of the token it so that unlock_shared doesn't have to
|
|
// look in other slots for its deferred lock. Token-less unlock_shared
|
|
// might place a deferred lock in one place and then release a different
|
|
// slot that was originally used by the token-ful version. If this was
|
|
// important we could solve the problem by differentiating the deferred
|
|
// locks so that cross-variety release wouldn't occur. The best way
|
|
// is probably to steal a bit from the pointer, making deferredLocks[]
|
|
// an array of Atom<uintptr_t>.
|
|
|
|
template <class WaitContext>
|
|
bool lockSharedImpl(Token* token, WaitContext& ctx) {
|
|
uint32_t state = state_.load(std::memory_order_relaxed);
|
|
if ((state & (kHasS | kMayDefer | kHasE)) == 0 &&
|
|
state_.compare_exchange_strong(state, state + kIncrHasS)) {
|
|
if (token != nullptr) {
|
|
token->type_ = Token::Type::INLINE_SHARED;
|
|
}
|
|
return true;
|
|
}
|
|
return lockSharedImpl(state, token, ctx);
|
|
}
|
|
|
|
template <class WaitContext>
|
|
bool lockSharedImpl(uint32_t& state, Token* token, WaitContext& ctx);
|
|
|
|
// Updates the state in/out argument as if the locks were made inline,
|
|
// but does not update state_
|
|
void cleanupTokenlessSharedDeferred(uint32_t& state) {
|
|
for (uint32_t i = 0; i < kMaxDeferredReaders; ++i) {
|
|
auto slotPtr = deferredReader(i);
|
|
auto slotValue = slotPtr->load(std::memory_order_relaxed);
|
|
if (slotValue == tokenlessSlotValue()) {
|
|
slotPtr->store(0, std::memory_order_relaxed);
|
|
state += kIncrHasS;
|
|
if ((state & kHasS) == 0) {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
bool tryUnlockTokenlessSharedDeferred();
|
|
|
|
bool tryUnlockSharedDeferred(uint32_t slot) {
|
|
assert(slot < kMaxDeferredReaders);
|
|
auto slotValue = tokenfulSlotValue();
|
|
return deferredReader(slot)->compare_exchange_strong(slotValue, 0);
|
|
}
|
|
|
|
uint32_t unlockSharedInline() {
|
|
uint32_t state = (state_ -= kIncrHasS);
|
|
assert((state & (kHasE | kBegunE | kMayDefer)) != 0 ||
|
|
state < state + kIncrHasS);
|
|
if ((state & kHasS) == 0) {
|
|
// Only the second half of lock() can be blocked by a non-zero
|
|
// reader count, so that's the only thing we need to wake
|
|
wakeRegisteredWaiters(state, kWaitingNotS);
|
|
}
|
|
return state;
|
|
}
|
|
|
|
template <class WaitContext>
|
|
bool lockUpgradeImpl(WaitContext& ctx) {
|
|
uint32_t state;
|
|
do {
|
|
if (!waitForZeroBits(state, kHasSolo, kWaitingU, ctx)) {
|
|
return false;
|
|
}
|
|
} while (!state_.compare_exchange_strong(state, state | kHasU));
|
|
return true;
|
|
}
|
|
|
|
public:
|
|
class ReadHolder {
|
|
public:
|
|
ReadHolder() : lock_(nullptr) {}
|
|
|
|
explicit ReadHolder(const SharedMutexImpl* lock) : ReadHolder(*lock) {}
|
|
|
|
explicit ReadHolder(const SharedMutexImpl& lock)
|
|
: lock_(const_cast<SharedMutexImpl*>(&lock)) {
|
|
lock_->lock_shared(token_);
|
|
}
|
|
|
|
ReadHolder(ReadHolder&& rhs) noexcept : lock_(rhs.lock_),
|
|
token_(rhs.token_) {
|
|
rhs.lock_ = nullptr;
|
|
}
|
|
|
|
// Downgrade from upgrade mode
|
|
explicit ReadHolder(UpgradeHolder&& upgraded) : lock_(upgraded.lock_) {
|
|
assert(upgraded.lock_ != nullptr);
|
|
upgraded.lock_ = nullptr;
|
|
lock_->unlock_upgrade_and_lock_shared(token_);
|
|
}
|
|
|
|
// Downgrade from exclusive mode
|
|
explicit ReadHolder(WriteHolder&& writer) : lock_(writer.lock_) {
|
|
assert(writer.lock_ != nullptr);
|
|
writer.lock_ = nullptr;
|
|
lock_->unlock_and_lock_shared(token_);
|
|
}
|
|
|
|
ReadHolder& operator=(ReadHolder&& rhs) noexcept {
|
|
std::swap(lock_, rhs.lock_);
|
|
std::swap(token_, rhs.token_);
|
|
return *this;
|
|
}
|
|
|
|
ReadHolder(const ReadHolder& rhs) = delete;
|
|
ReadHolder& operator=(const ReadHolder& rhs) = delete;
|
|
|
|
~ReadHolder() {
|
|
unlock();
|
|
}
|
|
|
|
void unlock() {
|
|
if (lock_) {
|
|
lock_->unlock_shared(token_);
|
|
lock_ = nullptr;
|
|
}
|
|
}
|
|
|
|
private:
|
|
friend class UpgradeHolder;
|
|
friend class WriteHolder;
|
|
SharedMutexImpl* lock_;
|
|
SharedMutexToken token_;
|
|
};
|
|
|
|
class UpgradeHolder {
|
|
public:
|
|
UpgradeHolder() : lock_(nullptr) {}
|
|
|
|
explicit UpgradeHolder(SharedMutexImpl* lock) : UpgradeHolder(*lock) {}
|
|
|
|
explicit UpgradeHolder(SharedMutexImpl& lock) : lock_(&lock) {
|
|
lock_->lock_upgrade();
|
|
}
|
|
|
|
// Downgrade from exclusive mode
|
|
explicit UpgradeHolder(WriteHolder&& writer) : lock_(writer.lock_) {
|
|
assert(writer.lock_ != nullptr);
|
|
writer.lock_ = nullptr;
|
|
lock_->unlock_and_lock_upgrade();
|
|
}
|
|
|
|
UpgradeHolder(UpgradeHolder&& rhs) noexcept : lock_(rhs.lock_) {
|
|
rhs.lock_ = nullptr;
|
|
}
|
|
|
|
UpgradeHolder& operator=(UpgradeHolder&& rhs) noexcept {
|
|
std::swap(lock_, rhs.lock_);
|
|
return *this;
|
|
}
|
|
|
|
UpgradeHolder(const UpgradeHolder& rhs) = delete;
|
|
UpgradeHolder& operator=(const UpgradeHolder& rhs) = delete;
|
|
|
|
~UpgradeHolder() {
|
|
unlock();
|
|
}
|
|
|
|
void unlock() {
|
|
if (lock_) {
|
|
lock_->unlock_upgrade();
|
|
lock_ = nullptr;
|
|
}
|
|
}
|
|
|
|
private:
|
|
friend class WriteHolder;
|
|
friend class ReadHolder;
|
|
SharedMutexImpl* lock_;
|
|
};
|
|
|
|
class WriteHolder {
|
|
public:
|
|
WriteHolder() : lock_(nullptr) {}
|
|
|
|
explicit WriteHolder(SharedMutexImpl* lock) : WriteHolder(*lock) {}
|
|
|
|
explicit WriteHolder(SharedMutexImpl& lock) : lock_(&lock) {
|
|
lock_->lock();
|
|
}
|
|
|
|
// Promotion from upgrade mode
|
|
explicit WriteHolder(UpgradeHolder&& upgrade) : lock_(upgrade.lock_) {
|
|
assert(upgrade.lock_ != nullptr);
|
|
upgrade.lock_ = nullptr;
|
|
lock_->unlock_upgrade_and_lock();
|
|
}
|
|
|
|
// README:
|
|
//
|
|
// It is intended that WriteHolder(ReadHolder&& rhs) do not exist.
|
|
//
|
|
// Shared locks (read) can not safely upgrade to unique locks (write).
|
|
// That upgrade path is a well-known recipe for deadlock, so we explicitly
|
|
// disallow it.
|
|
//
|
|
// If you need to do a conditional mutation, you have a few options:
|
|
// 1. Check the condition under a shared lock and release it.
|
|
// Then maybe check the condition again under a unique lock and maybe do
|
|
// the mutation.
|
|
// 2. Check the condition once under an upgradeable lock.
|
|
// Then maybe upgrade the lock to a unique lock and do the mutation.
|
|
// 3. Check the condition and maybe perform the mutation under a unique
|
|
// lock.
|
|
//
|
|
// Relevant upgradeable lock notes:
|
|
// * At most one upgradeable lock can be held at a time for a given shared
|
|
// mutex, just like a unique lock.
|
|
// * An upgradeable lock may be held concurrently with any number of shared
|
|
// locks.
|
|
// * An upgradeable lock may be upgraded atomically to a unique lock.
|
|
|
|
WriteHolder(WriteHolder&& rhs) noexcept : lock_(rhs.lock_) {
|
|
rhs.lock_ = nullptr;
|
|
}
|
|
|
|
WriteHolder& operator=(WriteHolder&& rhs) noexcept {
|
|
std::swap(lock_, rhs.lock_);
|
|
return *this;
|
|
}
|
|
|
|
WriteHolder(const WriteHolder& rhs) = delete;
|
|
WriteHolder& operator=(const WriteHolder& rhs) = delete;
|
|
|
|
~WriteHolder() {
|
|
unlock();
|
|
}
|
|
|
|
void unlock() {
|
|
if (lock_) {
|
|
lock_->unlock();
|
|
lock_ = nullptr;
|
|
}
|
|
}
|
|
|
|
private:
|
|
friend class ReadHolder;
|
|
friend class UpgradeHolder;
|
|
SharedMutexImpl* lock_;
|
|
};
|
|
|
|
// Adapters for Synchronized<>
|
|
friend void acquireRead(SharedMutexImpl& lock) { lock.lock_shared(); }
|
|
friend void acquireReadWrite(SharedMutexImpl& lock) { lock.lock(); }
|
|
friend void releaseRead(SharedMutexImpl& lock) { lock.unlock_shared(); }
|
|
friend void releaseReadWrite(SharedMutexImpl& lock) { lock.unlock(); }
|
|
friend bool acquireRead(SharedMutexImpl& lock, unsigned int ms) {
|
|
return lock.try_lock_shared_for(std::chrono::milliseconds(ms));
|
|
}
|
|
friend bool acquireReadWrite(SharedMutexImpl& lock, unsigned int ms) {
|
|
return lock.try_lock_for(std::chrono::milliseconds(ms));
|
|
}
|
|
};
|
|
|
|
typedef SharedMutexImpl<true> SharedMutexReadPriority;
|
|
typedef SharedMutexImpl<false> SharedMutexWritePriority;
|
|
typedef SharedMutexWritePriority SharedMutex;
|
|
|
|
// Prevent the compiler from instantiating these in other translation units.
|
|
// They are instantiated once in SharedMutex.cpp
|
|
extern template class SharedMutexImpl<true>;
|
|
extern template class SharedMutexImpl<false>;
|
|
|
|
template <
|
|
bool ReaderPriority,
|
|
typename Tag_,
|
|
template <typename> class Atom,
|
|
bool BlockImmediately>
|
|
typename SharedMutexImpl<ReaderPriority, Tag_, Atom, BlockImmediately>::
|
|
DeferredReaderSlot
|
|
SharedMutexImpl<ReaderPriority, Tag_, Atom, BlockImmediately>::
|
|
deferredReaders[kMaxDeferredReaders * kDeferredSeparationFactor] =
|
|
{};
|
|
|
|
template <
|
|
bool ReaderPriority,
|
|
typename Tag_,
|
|
template <typename> class Atom,
|
|
bool BlockImmediately>
|
|
FOLLY_SHAREDMUTEX_TLS uint32_t
|
|
SharedMutexImpl<ReaderPriority, Tag_, Atom, BlockImmediately>::
|
|
tls_lastTokenlessSlot = 0;
|
|
|
|
template <
|
|
bool ReaderPriority,
|
|
typename Tag_,
|
|
template <typename> class Atom,
|
|
bool BlockImmediately>
|
|
FOLLY_SHAREDMUTEX_TLS uint32_t
|
|
SharedMutexImpl<ReaderPriority, Tag_, Atom, BlockImmediately>::
|
|
tls_lastDeferredReaderSlot = 0;
|
|
|
|
template <
|
|
bool ReaderPriority,
|
|
typename Tag_,
|
|
template <typename> class Atom,
|
|
bool BlockImmediately>
|
|
bool SharedMutexImpl<ReaderPriority, Tag_, Atom, BlockImmediately>::
|
|
tryUnlockTokenlessSharedDeferred() {
|
|
auto bestSlot = tls_lastTokenlessSlot;
|
|
for (uint32_t i = 0; i < kMaxDeferredReaders; ++i) {
|
|
auto slotPtr = deferredReader(bestSlot ^ i);
|
|
auto slotValue = slotPtr->load(std::memory_order_relaxed);
|
|
if (slotValue == tokenlessSlotValue() &&
|
|
slotPtr->compare_exchange_strong(slotValue, 0)) {
|
|
tls_lastTokenlessSlot = bestSlot ^ i;
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
template <
|
|
bool ReaderPriority,
|
|
typename Tag_,
|
|
template <typename> class Atom,
|
|
bool BlockImmediately>
|
|
template <class WaitContext>
|
|
bool SharedMutexImpl<ReaderPriority, Tag_, Atom, BlockImmediately>::
|
|
lockSharedImpl(uint32_t& state, Token* token, WaitContext& ctx) {
|
|
while (true) {
|
|
if (UNLIKELY((state & kHasE) != 0) &&
|
|
!waitForZeroBits(state, kHasE, kWaitingS, ctx) && ctx.canTimeOut()) {
|
|
return false;
|
|
}
|
|
|
|
uint32_t slot = tls_lastDeferredReaderSlot;
|
|
uintptr_t slotValue = 1; // any non-zero value will do
|
|
|
|
bool canAlreadyDefer = (state & kMayDefer) != 0;
|
|
bool aboveDeferThreshold =
|
|
(state & kHasS) >= (kNumSharedToStartDeferring - 1) * kIncrHasS;
|
|
bool drainInProgress = ReaderPriority && (state & kBegunE) != 0;
|
|
if (canAlreadyDefer || (aboveDeferThreshold && !drainInProgress)) {
|
|
/* Try using the most recent slot first. */
|
|
slotValue = deferredReader(slot)->load(std::memory_order_relaxed);
|
|
if (slotValue != 0) {
|
|
// starting point for our empty-slot search, can change after
|
|
// calling waitForZeroBits
|
|
uint32_t bestSlot =
|
|
(uint32_t)folly::detail::AccessSpreader<Atom>::current(
|
|
kMaxDeferredReaders);
|
|
|
|
// deferred readers are already enabled, or it is time to
|
|
// enable them if we can find a slot
|
|
for (uint32_t i = 0; i < kDeferredSearchDistance; ++i) {
|
|
slot = bestSlot ^ i;
|
|
assert(slot < kMaxDeferredReaders);
|
|
slotValue = deferredReader(slot)->load(std::memory_order_relaxed);
|
|
if (slotValue == 0) {
|
|
// found empty slot
|
|
tls_lastDeferredReaderSlot = slot;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if (slotValue != 0) {
|
|
// not yet deferred, or no empty slots
|
|
if (state_.compare_exchange_strong(state, state + kIncrHasS)) {
|
|
// successfully recorded the read lock inline
|
|
if (token != nullptr) {
|
|
token->type_ = Token::Type::INLINE_SHARED;
|
|
}
|
|
return true;
|
|
}
|
|
// state is updated, try again
|
|
continue;
|
|
}
|
|
|
|
// record that deferred readers might be in use if necessary
|
|
if ((state & kMayDefer) == 0) {
|
|
if (!state_.compare_exchange_strong(state, state | kMayDefer)) {
|
|
// keep going if CAS failed because somebody else set the bit
|
|
// for us
|
|
if ((state & (kHasE | kMayDefer)) != kMayDefer) {
|
|
continue;
|
|
}
|
|
}
|
|
// state = state | kMayDefer;
|
|
}
|
|
|
|
// try to use the slot
|
|
bool gotSlot = deferredReader(slot)->compare_exchange_strong(
|
|
slotValue,
|
|
token == nullptr ? tokenlessSlotValue() : tokenfulSlotValue());
|
|
|
|
// If we got the slot, we need to verify that an exclusive lock
|
|
// didn't happen since we last checked. If we didn't get the slot we
|
|
// need to recheck state_ anyway to make sure we don't waste too much
|
|
// work. It is also possible that since we checked state_ someone
|
|
// has acquired and released the write lock, clearing kMayDefer.
|
|
// Both cases are covered by looking for the readers-possible bit,
|
|
// because it is off when the exclusive lock bit is set.
|
|
state = state_.load(std::memory_order_acquire);
|
|
|
|
if (!gotSlot) {
|
|
continue;
|
|
}
|
|
|
|
if (token == nullptr) {
|
|
tls_lastTokenlessSlot = slot;
|
|
}
|
|
|
|
if ((state & kMayDefer) != 0) {
|
|
assert((state & kHasE) == 0);
|
|
// success
|
|
if (token != nullptr) {
|
|
token->type_ = Token::Type::DEFERRED_SHARED;
|
|
token->slot_ = (uint16_t)slot;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// release the slot before retrying
|
|
if (token == nullptr) {
|
|
// We can't rely on slot. Token-less slot values can be freed by
|
|
// any unlock_shared(), so we need to do the full deferredReader
|
|
// search during unlock. Unlike unlock_shared(), we can't trust
|
|
// kPrevDefer here. This deferred lock isn't visible to lock()
|
|
// (that's the whole reason we're undoing it) so there might have
|
|
// subsequently been an unlock() and lock() with no intervening
|
|
// transition to deferred mode.
|
|
if (!tryUnlockTokenlessSharedDeferred()) {
|
|
unlockSharedInline();
|
|
}
|
|
} else {
|
|
if (!tryUnlockSharedDeferred(slot)) {
|
|
unlockSharedInline();
|
|
}
|
|
}
|
|
|
|
// We got here not because the lock was unavailable, but because
|
|
// we lost a compare-and-swap. Try-lock is typically allowed to
|
|
// have spurious failures, but there is no lock efficiency gain
|
|
// from exploiting that freedom here.
|
|
}
|
|
}
|
|
|
|
} // namespace folly
|