mirror of
https://github.com/ecency/ecency-mobile.git
synced 2025-01-05 04:28:03 +03:00
769 lines
24 KiB
C
769 lines
24 KiB
C
|
/*
|
||
|
* Copyright 2016 Facebook, Inc.
|
||
|
*
|
||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
* you may not use this file except in compliance with the License.
|
||
|
* You may obtain a copy of the License at
|
||
|
*
|
||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||
|
*
|
||
|
* Unless required by applicable law or agreed to in writing, software
|
||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
* See the License for the specific language governing permissions and
|
||
|
* limitations under the License.
|
||
|
*/
|
||
|
|
||
|
/*
|
||
|
* N.B. You most likely do _not_ want to use RWSpinLock or any other
|
||
|
* kind of spinlock. Use SharedMutex instead.
|
||
|
*
|
||
|
* In short, spinlocks in preemptive multi-tasking operating systems
|
||
|
* have serious problems and fast mutexes like SharedMutex are almost
|
||
|
* certainly the better choice, because letting the OS scheduler put a
|
||
|
* thread to sleep is better for system responsiveness and throughput
|
||
|
* than wasting a timeslice repeatedly querying a lock held by a
|
||
|
* thread that's blocked, and you can't prevent userspace
|
||
|
* programs blocking.
|
||
|
*
|
||
|
* Spinlocks in an operating system kernel make much more sense than
|
||
|
* they do in userspace.
|
||
|
*
|
||
|
* -------------------------------------------------------------------
|
||
|
*
|
||
|
* Two Read-Write spin lock implementations.
|
||
|
*
|
||
|
* Ref: http://locklessinc.com/articles/locks
|
||
|
*
|
||
|
* Both locks here are faster than pthread_rwlock and have very low
|
||
|
* overhead (usually 20-30ns). They don't use any system mutexes and
|
||
|
* are very compact (4/8 bytes), so are suitable for per-instance
|
||
|
* based locking, particularly when contention is not expected.
|
||
|
*
|
||
|
* For a spinlock, RWSpinLock is a reasonable choice. (See the note
|
||
|
* above for why a spin lock is frequently a bad idea generally.)
|
||
|
* RWSpinLock has minimal overhead, and comparable contention
|
||
|
* performance when the number of competing threads is less than or
|
||
|
* equal to the number of logical CPUs. Even as the number of
|
||
|
* threads gets larger, RWSpinLock can still be very competitive in
|
||
|
* READ, although it is slower on WRITE, and also inherently unfair
|
||
|
* to writers.
|
||
|
*
|
||
|
* RWTicketSpinLock shows more balanced READ/WRITE performance. If
|
||
|
* your application really needs a lot more threads, and a
|
||
|
* higher-priority writer, prefer one of the RWTicketSpinLock locks.
|
||
|
*
|
||
|
* Caveats:
|
||
|
*
|
||
|
* RWTicketSpinLock locks can only be used with GCC on x86/x86-64
|
||
|
* based systems.
|
||
|
*
|
||
|
* RWTicketSpinLock<32> only allows up to 2^8 - 1 concurrent
|
||
|
* readers and writers.
|
||
|
*
|
||
|
* RWTicketSpinLock<64> only allows up to 2^16 - 1 concurrent
|
||
|
* readers and writers.
|
||
|
*
|
||
|
* RWTicketSpinLock<..., true> (kFavorWriter = true, that is, strict
|
||
|
* writer priority) is NOT reentrant, even for lock_shared().
|
||
|
*
|
||
|
* The lock will not grant any new shared (read) accesses while a thread
|
||
|
* attempting to acquire the lock in write mode is blocked. (That is,
|
||
|
* if the lock is held in shared mode by N threads, and a thread attempts
|
||
|
* to acquire it in write mode, no one else can acquire it in shared mode
|
||
|
* until these N threads release the lock and then the blocked thread
|
||
|
* acquires and releases the exclusive lock.) This also applies for
|
||
|
* attempts to reacquire the lock in shared mode by threads that already
|
||
|
* hold it in shared mode, making the lock non-reentrant.
|
||
|
*
|
||
|
* RWSpinLock handles 2^30 - 1 concurrent readers.
|
||
|
*
|
||
|
* @author Xin Liu <xliux@fb.com>
|
||
|
*/
|
||
|
|
||
|
#pragma once
|
||
|
|
||
|
/*
|
||
|
========================================================================
|
||
|
Benchmark on (Intel(R) Xeon(R) CPU L5630 @ 2.13GHz) 8 cores(16 HTs)
|
||
|
========================================================================
|
||
|
|
||
|
------------------------------------------------------------------------------
|
||
|
1. Single thread benchmark (read/write lock + unlock overhead)
|
||
|
Benchmark Iters Total t t/iter iter/sec
|
||
|
-------------------------------------------------------------------------------
|
||
|
* BM_RWSpinLockRead 100000 1.786 ms 17.86 ns 53.4M
|
||
|
+30.5% BM_RWSpinLockWrite 100000 2.331 ms 23.31 ns 40.91M
|
||
|
+85.7% BM_RWTicketSpinLock32Read 100000 3.317 ms 33.17 ns 28.75M
|
||
|
+96.0% BM_RWTicketSpinLock32Write 100000 3.5 ms 35 ns 27.25M
|
||
|
+85.6% BM_RWTicketSpinLock64Read 100000 3.315 ms 33.15 ns 28.77M
|
||
|
+96.0% BM_RWTicketSpinLock64Write 100000 3.5 ms 35 ns 27.25M
|
||
|
+85.7% BM_RWTicketSpinLock32FavorWriterRead 100000 3.317 ms 33.17 ns 28.75M
|
||
|
+29.7% BM_RWTicketSpinLock32FavorWriterWrite 100000 2.316 ms 23.16 ns 41.18M
|
||
|
+85.3% BM_RWTicketSpinLock64FavorWriterRead 100000 3.309 ms 33.09 ns 28.82M
|
||
|
+30.2% BM_RWTicketSpinLock64FavorWriterWrite 100000 2.325 ms 23.25 ns 41.02M
|
||
|
+ 175% BM_PThreadRWMutexRead 100000 4.917 ms 49.17 ns 19.4M
|
||
|
+ 166% BM_PThreadRWMutexWrite 100000 4.757 ms 47.57 ns 20.05M
|
||
|
|
||
|
------------------------------------------------------------------------------
|
||
|
2. Contention Benchmark 90% read 10% write
|
||
|
Benchmark hits average min max sigma
|
||
|
------------------------------------------------------------------------------
|
||
|
---------- 8 threads ------------
|
||
|
RWSpinLock Write 142666 220ns 78ns 40.8us 269ns
|
||
|
RWSpinLock Read 1282297 222ns 80ns 37.7us 248ns
|
||
|
RWTicketSpinLock Write 85692 209ns 71ns 17.9us 252ns
|
||
|
RWTicketSpinLock Read 769571 215ns 78ns 33.4us 251ns
|
||
|
pthread_rwlock_t Write 84248 2.48us 99ns 269us 8.19us
|
||
|
pthread_rwlock_t Read 761646 933ns 101ns 374us 3.25us
|
||
|
|
||
|
---------- 16 threads ------------
|
||
|
RWSpinLock Write 124236 237ns 78ns 261us 801ns
|
||
|
RWSpinLock Read 1115807 236ns 78ns 2.27ms 2.17us
|
||
|
RWTicketSpinLock Write 81781 231ns 71ns 31.4us 351ns
|
||
|
RWTicketSpinLock Read 734518 238ns 78ns 73.6us 379ns
|
||
|
pthread_rwlock_t Write 83363 7.12us 99ns 785us 28.1us
|
||
|
pthread_rwlock_t Read 754978 2.18us 101ns 1.02ms 14.3us
|
||
|
|
||
|
---------- 50 threads ------------
|
||
|
RWSpinLock Write 131142 1.37us 82ns 7.53ms 68.2us
|
||
|
RWSpinLock Read 1181240 262ns 78ns 6.62ms 12.7us
|
||
|
RWTicketSpinLock Write 83045 397ns 73ns 7.01ms 31.5us
|
||
|
RWTicketSpinLock Read 744133 386ns 78ns 11ms 31.4us
|
||
|
pthread_rwlock_t Write 80849 112us 103ns 4.52ms 263us
|
||
|
pthread_rwlock_t Read 728698 24us 101ns 7.28ms 194us
|
||
|
|
||
|
*/
|
||
|
|
||
|
#include <folly/Portability.h>
|
||
|
#include <folly/portability/Asm.h>
|
||
|
|
||
|
#if defined(__GNUC__) && \
|
||
|
(defined(__i386) || FOLLY_X64 || \
|
||
|
defined(ARCH_K8))
|
||
|
# define RW_SPINLOCK_USE_X86_INTRINSIC_
|
||
|
# include <x86intrin.h>
|
||
|
#elif defined(_MSC_VER) && defined(FOLLY_X64)
|
||
|
# define RW_SPINLOCK_USE_X86_INTRINSIC_
|
||
|
#else
|
||
|
# undef RW_SPINLOCK_USE_X86_INTRINSIC_
|
||
|
#endif
|
||
|
|
||
|
// iOS doesn't define _mm_cvtsi64_si128 and friends
|
||
|
#if (FOLLY_SSE >= 2) && !FOLLY_MOBILE
|
||
|
#define RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
|
||
|
#else
|
||
|
#undef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
|
||
|
#endif
|
||
|
|
||
|
#include <atomic>
|
||
|
#include <string>
|
||
|
#include <algorithm>
|
||
|
|
||
|
#include <sched.h>
|
||
|
#include <glog/logging.h>
|
||
|
|
||
|
#include <folly/Likely.h>
|
||
|
|
||
|
namespace folly {
|
||
|
|
||
|
/*
|
||
|
* A simple, small (4-bytes), but unfair rwlock. Use it when you want
|
||
|
* a nice writer and don't expect a lot of write/read contention, or
|
||
|
* when you need small rwlocks since you are creating a large number
|
||
|
* of them.
|
||
|
*
|
||
|
* Note that the unfairness here is extreme: if the lock is
|
||
|
* continually accessed for read, writers will never get a chance. If
|
||
|
* the lock can be that highly contended this class is probably not an
|
||
|
* ideal choice anyway.
|
||
|
*
|
||
|
* It currently implements most of the Lockable, SharedLockable and
|
||
|
* UpgradeLockable concepts except the TimedLockable related locking/unlocking
|
||
|
* interfaces.
|
||
|
*/
|
||
|
class RWSpinLock {
|
||
|
enum : int32_t { READER = 4, UPGRADED = 2, WRITER = 1 };
|
||
|
public:
|
||
|
constexpr RWSpinLock() : bits_(0) {}
|
||
|
|
||
|
RWSpinLock(RWSpinLock const&) = delete;
|
||
|
RWSpinLock& operator=(RWSpinLock const&) = delete;
|
||
|
|
||
|
// Lockable Concept
|
||
|
void lock() {
|
||
|
int count = 0;
|
||
|
while (!LIKELY(try_lock())) {
|
||
|
if (++count > 1000) sched_yield();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Writer is responsible for clearing up both the UPGRADED and WRITER bits.
|
||
|
void unlock() {
|
||
|
static_assert(READER > WRITER + UPGRADED, "wrong bits!");
|
||
|
bits_.fetch_and(~(WRITER | UPGRADED), std::memory_order_release);
|
||
|
}
|
||
|
|
||
|
// SharedLockable Concept
|
||
|
void lock_shared() {
|
||
|
int count = 0;
|
||
|
while (!LIKELY(try_lock_shared())) {
|
||
|
if (++count > 1000) sched_yield();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void unlock_shared() {
|
||
|
bits_.fetch_add(-READER, std::memory_order_release);
|
||
|
}
|
||
|
|
||
|
// Downgrade the lock from writer status to reader status.
|
||
|
void unlock_and_lock_shared() {
|
||
|
bits_.fetch_add(READER, std::memory_order_acquire);
|
||
|
unlock();
|
||
|
}
|
||
|
|
||
|
// UpgradeLockable Concept
|
||
|
void lock_upgrade() {
|
||
|
int count = 0;
|
||
|
while (!try_lock_upgrade()) {
|
||
|
if (++count > 1000) sched_yield();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void unlock_upgrade() {
|
||
|
bits_.fetch_add(-UPGRADED, std::memory_order_acq_rel);
|
||
|
}
|
||
|
|
||
|
// unlock upgrade and try to acquire write lock
|
||
|
void unlock_upgrade_and_lock() {
|
||
|
int64_t count = 0;
|
||
|
while (!try_unlock_upgrade_and_lock()) {
|
||
|
if (++count > 1000) sched_yield();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// unlock upgrade and read lock atomically
|
||
|
void unlock_upgrade_and_lock_shared() {
|
||
|
bits_.fetch_add(READER - UPGRADED, std::memory_order_acq_rel);
|
||
|
}
|
||
|
|
||
|
// write unlock and upgrade lock atomically
|
||
|
void unlock_and_lock_upgrade() {
|
||
|
// need to do it in two steps here -- as the UPGRADED bit might be OR-ed at
|
||
|
// the same time when other threads are trying do try_lock_upgrade().
|
||
|
bits_.fetch_or(UPGRADED, std::memory_order_acquire);
|
||
|
bits_.fetch_add(-WRITER, std::memory_order_release);
|
||
|
}
|
||
|
|
||
|
|
||
|
// Attempt to acquire writer permission. Return false if we didn't get it.
|
||
|
bool try_lock() {
|
||
|
int32_t expect = 0;
|
||
|
return bits_.compare_exchange_strong(expect, WRITER,
|
||
|
std::memory_order_acq_rel);
|
||
|
}
|
||
|
|
||
|
// Try to get reader permission on the lock. This can fail if we
|
||
|
// find out someone is a writer or upgrader.
|
||
|
// Setting the UPGRADED bit would allow a writer-to-be to indicate
|
||
|
// its intention to write and block any new readers while waiting
|
||
|
// for existing readers to finish and release their read locks. This
|
||
|
// helps avoid starving writers (promoted from upgraders).
|
||
|
bool try_lock_shared() {
|
||
|
// fetch_add is considerably (100%) faster than compare_exchange,
|
||
|
// so here we are optimizing for the common (lock success) case.
|
||
|
int32_t value = bits_.fetch_add(READER, std::memory_order_acquire);
|
||
|
if (UNLIKELY(value & (WRITER|UPGRADED))) {
|
||
|
bits_.fetch_add(-READER, std::memory_order_release);
|
||
|
return false;
|
||
|
}
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
// try to unlock upgrade and write lock atomically
|
||
|
bool try_unlock_upgrade_and_lock() {
|
||
|
int32_t expect = UPGRADED;
|
||
|
return bits_.compare_exchange_strong(expect, WRITER,
|
||
|
std::memory_order_acq_rel);
|
||
|
}
|
||
|
|
||
|
// try to acquire an upgradable lock.
|
||
|
bool try_lock_upgrade() {
|
||
|
int32_t value = bits_.fetch_or(UPGRADED, std::memory_order_acquire);
|
||
|
|
||
|
// Note: when failed, we cannot flip the UPGRADED bit back,
|
||
|
// as in this case there is either another upgrade lock or a write lock.
|
||
|
// If it's a write lock, the bit will get cleared up when that lock's done
|
||
|
// with unlock().
|
||
|
return ((value & (UPGRADED | WRITER)) == 0);
|
||
|
}
|
||
|
|
||
|
// mainly for debugging purposes.
|
||
|
int32_t bits() const { return bits_.load(std::memory_order_acquire); }
|
||
|
|
||
|
class ReadHolder;
|
||
|
class UpgradedHolder;
|
||
|
class WriteHolder;
|
||
|
|
||
|
class ReadHolder {
|
||
|
public:
|
||
|
explicit ReadHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
|
||
|
if (lock_) lock_->lock_shared();
|
||
|
}
|
||
|
|
||
|
explicit ReadHolder(RWSpinLock& lock) : lock_(&lock) {
|
||
|
lock_->lock_shared();
|
||
|
}
|
||
|
|
||
|
ReadHolder(ReadHolder&& other) noexcept : lock_(other.lock_) {
|
||
|
other.lock_ = nullptr;
|
||
|
}
|
||
|
|
||
|
// down-grade
|
||
|
explicit ReadHolder(UpgradedHolder&& upgraded) : lock_(upgraded.lock_) {
|
||
|
upgraded.lock_ = nullptr;
|
||
|
if (lock_) lock_->unlock_upgrade_and_lock_shared();
|
||
|
}
|
||
|
|
||
|
explicit ReadHolder(WriteHolder&& writer) : lock_(writer.lock_) {
|
||
|
writer.lock_ = nullptr;
|
||
|
if (lock_) lock_->unlock_and_lock_shared();
|
||
|
}
|
||
|
|
||
|
ReadHolder& operator=(ReadHolder&& other) {
|
||
|
using std::swap;
|
||
|
swap(lock_, other.lock_);
|
||
|
return *this;
|
||
|
}
|
||
|
|
||
|
ReadHolder(const ReadHolder& other) = delete;
|
||
|
ReadHolder& operator=(const ReadHolder& other) = delete;
|
||
|
|
||
|
~ReadHolder() { if (lock_) lock_->unlock_shared(); }
|
||
|
|
||
|
void reset(RWSpinLock* lock = nullptr) {
|
||
|
if (lock == lock_) return;
|
||
|
if (lock_) lock_->unlock_shared();
|
||
|
lock_ = lock;
|
||
|
if (lock_) lock_->lock_shared();
|
||
|
}
|
||
|
|
||
|
void swap(ReadHolder* other) {
|
||
|
std::swap(lock_, other->lock_);
|
||
|
}
|
||
|
|
||
|
private:
|
||
|
friend class UpgradedHolder;
|
||
|
friend class WriteHolder;
|
||
|
RWSpinLock* lock_;
|
||
|
};
|
||
|
|
||
|
class UpgradedHolder {
|
||
|
public:
|
||
|
explicit UpgradedHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
|
||
|
if (lock_) lock_->lock_upgrade();
|
||
|
}
|
||
|
|
||
|
explicit UpgradedHolder(RWSpinLock& lock) : lock_(&lock) {
|
||
|
lock_->lock_upgrade();
|
||
|
}
|
||
|
|
||
|
explicit UpgradedHolder(WriteHolder&& writer) {
|
||
|
lock_ = writer.lock_;
|
||
|
writer.lock_ = nullptr;
|
||
|
if (lock_) lock_->unlock_and_lock_upgrade();
|
||
|
}
|
||
|
|
||
|
UpgradedHolder(UpgradedHolder&& other) noexcept : lock_(other.lock_) {
|
||
|
other.lock_ = nullptr;
|
||
|
}
|
||
|
|
||
|
UpgradedHolder& operator =(UpgradedHolder&& other) {
|
||
|
using std::swap;
|
||
|
swap(lock_, other.lock_);
|
||
|
return *this;
|
||
|
}
|
||
|
|
||
|
UpgradedHolder(const UpgradedHolder& other) = delete;
|
||
|
UpgradedHolder& operator =(const UpgradedHolder& other) = delete;
|
||
|
|
||
|
~UpgradedHolder() { if (lock_) lock_->unlock_upgrade(); }
|
||
|
|
||
|
void reset(RWSpinLock* lock = nullptr) {
|
||
|
if (lock == lock_) return;
|
||
|
if (lock_) lock_->unlock_upgrade();
|
||
|
lock_ = lock;
|
||
|
if (lock_) lock_->lock_upgrade();
|
||
|
}
|
||
|
|
||
|
void swap(UpgradedHolder* other) {
|
||
|
using std::swap;
|
||
|
swap(lock_, other->lock_);
|
||
|
}
|
||
|
|
||
|
private:
|
||
|
friend class WriteHolder;
|
||
|
friend class ReadHolder;
|
||
|
RWSpinLock* lock_;
|
||
|
};
|
||
|
|
||
|
class WriteHolder {
|
||
|
public:
|
||
|
explicit WriteHolder(RWSpinLock* lock = nullptr) : lock_(lock) {
|
||
|
if (lock_) lock_->lock();
|
||
|
}
|
||
|
|
||
|
explicit WriteHolder(RWSpinLock& lock) : lock_(&lock) {
|
||
|
lock_->lock();
|
||
|
}
|
||
|
|
||
|
// promoted from an upgrade lock holder
|
||
|
explicit WriteHolder(UpgradedHolder&& upgraded) {
|
||
|
lock_ = upgraded.lock_;
|
||
|
upgraded.lock_ = nullptr;
|
||
|
if (lock_) lock_->unlock_upgrade_and_lock();
|
||
|
}
|
||
|
|
||
|
WriteHolder(WriteHolder&& other) noexcept : lock_(other.lock_) {
|
||
|
other.lock_ = nullptr;
|
||
|
}
|
||
|
|
||
|
WriteHolder& operator =(WriteHolder&& other) {
|
||
|
using std::swap;
|
||
|
swap(lock_, other.lock_);
|
||
|
return *this;
|
||
|
}
|
||
|
|
||
|
WriteHolder(const WriteHolder& other) = delete;
|
||
|
WriteHolder& operator =(const WriteHolder& other) = delete;
|
||
|
|
||
|
~WriteHolder () { if (lock_) lock_->unlock(); }
|
||
|
|
||
|
void reset(RWSpinLock* lock = nullptr) {
|
||
|
if (lock == lock_) return;
|
||
|
if (lock_) lock_->unlock();
|
||
|
lock_ = lock;
|
||
|
if (lock_) lock_->lock();
|
||
|
}
|
||
|
|
||
|
void swap(WriteHolder* other) {
|
||
|
using std::swap;
|
||
|
swap(lock_, other->lock_);
|
||
|
}
|
||
|
|
||
|
private:
|
||
|
friend class ReadHolder;
|
||
|
friend class UpgradedHolder;
|
||
|
RWSpinLock* lock_;
|
||
|
};
|
||
|
|
||
|
private:
|
||
|
std::atomic<int32_t> bits_;
|
||
|
};
|
||
|
|
||
|
|
||
|
#ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
|
||
|
// A more balanced Read-Write spin lock implemented based on GCC intrinsics.
|
||
|
|
||
|
namespace detail {

// Maps the total bit width of a ticket lock to the integer types used for
// the whole ticket word (FullInt), its lower half (HalfInt) and each of the
// individual counters (QuarterInt).  Only the 32- and 64-bit specializations
// below are usable; any other width trips the static_assert.
template <size_t kBitWidth> struct RWTicketIntTrait {
  static_assert(kBitWidth == 32 || kBitWidth == 64,
      "bit width has to be either 32 or 64 ");
};

// 64-bit ticket: four 16-bit lanes.
template <>
struct RWTicketIntTrait<64> {
  typedef uint64_t FullInt;
  typedef uint32_t HalfInt;
  typedef uint16_t QuarterInt;

#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
  // Pack four 16-bit values into the low 64 bits of an SSE register, with
  // v[0] in the lowest lane; the upper lanes are zero.
  static __m128i make128(const uint16_t v[4]) {
    return _mm_set_epi16(0, 0, 0, 0, v[3], v[2], v[1], v[0]);
  }
  static inline __m128i fromInteger(uint64_t from) {
    return _mm_cvtsi64_si128(from);
  }
  static inline uint64_t toInteger(__m128i in) {
    return _mm_cvtsi128_si64(in);
  }
  // Lane-wise 16-bit add of `kDelta` to `in`, returned as a 64-bit integer.
  static inline uint64_t addParallel(__m128i in, __m128i kDelta) {
    return toInteger(_mm_add_epi16(in, kDelta));
  }
#endif
};

// 32-bit ticket: four 8-bit lanes.
template <>
struct RWTicketIntTrait<32> {
  typedef uint32_t FullInt;
  typedef uint16_t HalfInt;
  typedef uint8_t QuarterInt;

#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
  // Pack four 8-bit values into the low 32 bits of an SSE register, with
  // v[0] in the lowest lane; the upper lanes are zero.
  static __m128i make128(const uint8_t v[4]) {
    return _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, v[3], v[2], v[1], v[0]);
  }
  static inline __m128i fromInteger(uint32_t from) {
    return _mm_cvtsi32_si128(from);
  }
  static inline uint32_t toInteger(__m128i in) {
    return _mm_cvtsi128_si32(in);
  }
  // Lane-wise 8-bit add of `kDelta` to `in`, returned as a 32-bit integer.
  static inline uint32_t addParallel(__m128i in, __m128i kDelta) {
    return toInteger(_mm_add_epi8(in, kDelta));
  }
#endif
};

}  // namespace detail
|
||
|
|
||
|
|
||
|
template<size_t kBitWidth, bool kFavorWriter=false>
|
||
|
class RWTicketSpinLockT {
|
||
|
typedef detail::RWTicketIntTrait<kBitWidth> IntTraitType;
|
||
|
typedef typename detail::RWTicketIntTrait<kBitWidth>::FullInt FullInt;
|
||
|
typedef typename detail::RWTicketIntTrait<kBitWidth>::HalfInt HalfInt;
|
||
|
typedef typename detail::RWTicketIntTrait<kBitWidth>::QuarterInt
|
||
|
QuarterInt;
|
||
|
|
||
|
union RWTicket {
|
||
|
constexpr RWTicket() : whole(0) {}
|
||
|
FullInt whole;
|
||
|
HalfInt readWrite;
|
||
|
__extension__ struct {
|
||
|
QuarterInt write;
|
||
|
QuarterInt read;
|
||
|
QuarterInt users;
|
||
|
};
|
||
|
} ticket;
|
||
|
|
||
|
private: // Some x64-specific utilities for atomic access to ticket.
|
||
|
template<class T> static T load_acquire(T* addr) {
|
||
|
T t = *addr; // acquire barrier
|
||
|
asm_volatile_memory();
|
||
|
return t;
|
||
|
}
|
||
|
|
||
|
template<class T>
|
||
|
static void store_release(T* addr, T v) {
|
||
|
asm_volatile_memory();
|
||
|
*addr = v; // release barrier
|
||
|
}
|
||
|
|
||
|
public:
|
||
|
|
||
|
constexpr RWTicketSpinLockT() {}
|
||
|
|
||
|
RWTicketSpinLockT(RWTicketSpinLockT const&) = delete;
|
||
|
RWTicketSpinLockT& operator=(RWTicketSpinLockT const&) = delete;
|
||
|
|
||
|
void lock() {
|
||
|
if (kFavorWriter) {
|
||
|
writeLockAggressive();
|
||
|
} else {
|
||
|
writeLockNice();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* Both try_lock and try_lock_shared diverge in our implementation from the
|
||
|
* lock algorithm described in the link above.
|
||
|
*
|
||
|
* In the read case, it is undesirable that the readers could wait
|
||
|
* for another reader (before increasing ticket.read in the other
|
||
|
* implementation). Our approach gives up on
|
||
|
* first-come-first-serve, but our benchmarks showed improve
|
||
|
* performance for both readers and writers under heavily contended
|
||
|
* cases, particularly when the number of threads exceeds the number
|
||
|
* of logical CPUs.
|
||
|
*
|
||
|
* We have writeLockAggressive() using the original implementation
|
||
|
* for a writer, which gives some advantage to the writer over the
|
||
|
* readers---for that path it is guaranteed that the writer will
|
||
|
* acquire the lock after all the existing readers exit.
|
||
|
*/
|
||
|
bool try_lock() {
|
||
|
RWTicket t;
|
||
|
FullInt old = t.whole = load_acquire(&ticket.whole);
|
||
|
if (t.users != t.write) return false;
|
||
|
++t.users;
|
||
|
return __sync_bool_compare_and_swap(&ticket.whole, old, t.whole);
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* Call this if you want to prioritize writer to avoid starvation.
|
||
|
* Unlike writeLockNice, immediately acquires the write lock when
|
||
|
* the existing readers (arriving before the writer) finish their
|
||
|
* turns.
|
||
|
*/
|
||
|
void writeLockAggressive() {
|
||
|
// sched_yield() is needed here to avoid a pathology if the number
|
||
|
// of threads attempting concurrent writes is >= the number of real
|
||
|
// cores allocated to this process. This is less likely than the
|
||
|
// corresponding situation in lock_shared(), but we still want to
|
||
|
// avoid it
|
||
|
int count = 0;
|
||
|
QuarterInt val = __sync_fetch_and_add(&ticket.users, 1);
|
||
|
while (val != load_acquire(&ticket.write)) {
|
||
|
asm_volatile_pause();
|
||
|
if (UNLIKELY(++count > 1000)) sched_yield();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Call this when the writer should be nicer to the readers.
|
||
|
void writeLockNice() {
|
||
|
// Here it doesn't cpu-relax the writer.
|
||
|
//
|
||
|
// This is because usually we have many more readers than the
|
||
|
// writers, so the writer has less chance to get the lock when
|
||
|
// there are a lot of competing readers. The aggressive spinning
|
||
|
// can help to avoid starving writers.
|
||
|
//
|
||
|
// We don't worry about sched_yield() here because the caller
|
||
|
// has already explicitly abandoned fairness.
|
||
|
while (!try_lock()) {}
|
||
|
}
|
||
|
|
||
|
// Atomically unlock the write-lock from writer and acquire the read-lock.
|
||
|
void unlock_and_lock_shared() {
|
||
|
QuarterInt val = __sync_fetch_and_add(&ticket.read, 1);
|
||
|
}
|
||
|
|
||
|
// Release writer permission on the lock.
|
||
|
void unlock() {
|
||
|
RWTicket t;
|
||
|
t.whole = load_acquire(&ticket.whole);
|
||
|
FullInt old = t.whole;
|
||
|
|
||
|
#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
|
||
|
// SSE2 can reduce the lock and unlock overhead by 10%
|
||
|
static const QuarterInt kDeltaBuf[4] = { 1, 1, 0, 0 }; // write/read/user
|
||
|
static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
|
||
|
__m128i m = IntTraitType::fromInteger(old);
|
||
|
t.whole = IntTraitType::addParallel(m, kDelta);
|
||
|
#else
|
||
|
++t.read;
|
||
|
++t.write;
|
||
|
#endif
|
||
|
store_release(&ticket.readWrite, t.readWrite);
|
||
|
}
|
||
|
|
||
|
void lock_shared() {
|
||
|
// sched_yield() is important here because we can't grab the
|
||
|
// shared lock if there is a pending writeLockAggressive, so we
|
||
|
// need to let threads that already have a shared lock complete
|
||
|
int count = 0;
|
||
|
while (!LIKELY(try_lock_shared())) {
|
||
|
asm_volatile_pause();
|
||
|
if (UNLIKELY((++count & 1023) == 0)) sched_yield();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
bool try_lock_shared() {
|
||
|
RWTicket t, old;
|
||
|
old.whole = t.whole = load_acquire(&ticket.whole);
|
||
|
old.users = old.read;
|
||
|
#ifdef RW_SPINLOCK_USE_SSE_INSTRUCTIONS_
|
||
|
// SSE2 may reduce the total lock and unlock overhead by 10%
|
||
|
static const QuarterInt kDeltaBuf[4] = { 0, 1, 1, 0 }; // write/read/user
|
||
|
static const __m128i kDelta = IntTraitType::make128(kDeltaBuf);
|
||
|
__m128i m = IntTraitType::fromInteger(old.whole);
|
||
|
t.whole = IntTraitType::addParallel(m, kDelta);
|
||
|
#else
|
||
|
++t.read;
|
||
|
++t.users;
|
||
|
#endif
|
||
|
return __sync_bool_compare_and_swap(&ticket.whole, old.whole, t.whole);
|
||
|
}
|
||
|
|
||
|
void unlock_shared() {
|
||
|
QuarterInt val = __sync_fetch_and_add(&ticket.write, 1);
|
||
|
}
|
||
|
|
||
|
class WriteHolder;
|
||
|
|
||
|
typedef RWTicketSpinLockT<kBitWidth, kFavorWriter> RWSpinLock;
|
||
|
class ReadHolder {
|
||
|
public:
|
||
|
ReadHolder(ReadHolder const&) = delete;
|
||
|
ReadHolder& operator=(ReadHolder const&) = delete;
|
||
|
|
||
|
explicit ReadHolder(RWSpinLock *lock = nullptr) :
|
||
|
lock_(lock) {
|
||
|
if (lock_) lock_->lock_shared();
|
||
|
}
|
||
|
|
||
|
explicit ReadHolder(RWSpinLock &lock) : lock_ (&lock) {
|
||
|
if (lock_) lock_->lock_shared();
|
||
|
}
|
||
|
|
||
|
// atomically unlock the write-lock from writer and acquire the read-lock
|
||
|
explicit ReadHolder(WriteHolder *writer) : lock_(nullptr) {
|
||
|
std::swap(this->lock_, writer->lock_);
|
||
|
if (lock_) {
|
||
|
lock_->unlock_and_lock_shared();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
~ReadHolder() {
|
||
|
if (lock_) lock_->unlock_shared();
|
||
|
}
|
||
|
|
||
|
void reset(RWSpinLock *lock = nullptr) {
|
||
|
if (lock_) lock_->unlock_shared();
|
||
|
lock_ = lock;
|
||
|
if (lock_) lock_->lock_shared();
|
||
|
}
|
||
|
|
||
|
void swap(ReadHolder *other) {
|
||
|
std::swap(this->lock_, other->lock_);
|
||
|
}
|
||
|
|
||
|
private:
|
||
|
RWSpinLock *lock_;
|
||
|
};
|
||
|
|
||
|
class WriteHolder {
|
||
|
public:
|
||
|
WriteHolder(WriteHolder const&) = delete;
|
||
|
WriteHolder& operator=(WriteHolder const&) = delete;
|
||
|
|
||
|
explicit WriteHolder(RWSpinLock *lock = nullptr) : lock_(lock) {
|
||
|
if (lock_) lock_->lock();
|
||
|
}
|
||
|
explicit WriteHolder(RWSpinLock &lock) : lock_ (&lock) {
|
||
|
if (lock_) lock_->lock();
|
||
|
}
|
||
|
|
||
|
~WriteHolder() {
|
||
|
if (lock_) lock_->unlock();
|
||
|
}
|
||
|
|
||
|
void reset(RWSpinLock *lock = nullptr) {
|
||
|
if (lock == lock_) return;
|
||
|
if (lock_) lock_->unlock();
|
||
|
lock_ = lock;
|
||
|
if (lock_) lock_->lock();
|
||
|
}
|
||
|
|
||
|
void swap(WriteHolder *other) {
|
||
|
std::swap(this->lock_, other->lock_);
|
||
|
}
|
||
|
|
||
|
private:
|
||
|
friend class ReadHolder;
|
||
|
RWSpinLock *lock_;
|
||
|
};
|
||
|
};
|
||
|
|
||
|
// Convenience names for the two supported ticket widths, both using the
// default (kFavorWriter = false) write-lock strategy.
typedef RWTicketSpinLockT<32> RWTicketSpinLock32;
typedef RWTicketSpinLockT<64> RWTicketSpinLock64;
|
||
|
|
||
|
#endif // RW_SPINLOCK_USE_X86_INTRINSIC_
|
||
|
|
||
|
} // namespace folly
|
||
|
|
||
|
#ifdef RW_SPINLOCK_USE_X86_INTRINSIC_
|
||
|
#undef RW_SPINLOCK_USE_X86_INTRINSIC_
|
||
|
#endif
|