235 lines
8.0 KiB
C
235 lines
8.0 KiB
C
|
/*
|
||
|
* Copyright 2016-present Facebook, Inc.
|
||
|
*
|
||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
* you may not use this file except in compliance with the License.
|
||
|
* You may obtain a copy of the License at
|
||
|
*
|
||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||
|
*
|
||
|
* Unless required by applicable law or agreed to in writing, software
|
||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
* See the License for the specific language governing permissions and
|
||
|
* limitations under the License.
|
||
|
*/
|
||
|
|
||
|
#pragma once
|
||
|
|
||
|
#include <cassert>
|
||
|
#include <climits>
|
||
|
#include <cstdint>
|
||
|
|
||
|
#include <folly/Portability.h>
|
||
|
#include <folly/detail/Futex.h>
|
||
|
|
||
|
// NO_SANITIZE_ADDRESS is appended to function declarations in this header.
// Under clang it expands to the no_sanitize_address function attribute; with
// every other compiler it expands to nothing.
#if !defined(__clang__)
#define NO_SANITIZE_ADDRESS
#else
#define NO_SANITIZE_ADDRESS __attribute__((no_sanitize_address))
#endif
|
||
|
|
||
|
namespace folly {
|
||
|
|
||
|
/**
|
||
|
* Tiny exclusive lock that packs four lock slots into a single
|
||
|
* byte. Each slot is an independent real, sleeping lock. The default
|
||
|
* lock and unlock functions operate on slot zero, which modifies only
|
||
|
* the low two bits of the host byte.
|
||
|
*
|
||
|
* You should zero-initialize the bits of a MicroLock that you intend
|
||
|
* to use.
|
||
|
*
|
||
|
* If you're not space-constrained, prefer std::mutex, which will
|
||
|
* likely be faster, since it has more than two bits of information to
|
||
|
* work with.
|
||
|
*
|
||
|
* You are free to put a MicroLock in a union with some other object.
|
||
|
* If, for example, you want to use the bottom two bits of a pointer
|
||
|
* as a lock, you can put a MicroLock in a union with the pointer and
|
||
|
* limit yourself to MicroLock slot zero, which will use the two
|
||
|
* least-significant bits in the bottom byte.
|
||
|
*
|
||
|
* (Note that such a union is safe only because MicroLock is based on
|
||
|
* a character type, and even under a strict interpretation of C++'s
|
||
|
* aliasing rules, character types may alias anything.)
|
||
|
*
|
||
|
* MicroLock uses a dirty trick: it actually operates on the full
|
||
|
* 32-bit, four-byte-aligned bit of memory into which it is embedded.
|
||
|
* It never modifies bits outside the ones it's defined to modify, but
|
||
|
* it _accesses_ all the bits in the 32-bit memory location for
|
||
|
* purposes of futex management.
|
||
|
*
|
||
|
* The MaxSpins template parameter controls the number of times we
|
||
|
* spin trying to acquire the lock. MaxYields controls the number of
|
||
|
* times we call sched_yield; once we've tried to acquire the lock
|
||
|
* MaxSpins + MaxYields times, we sleep on the lock futex.
|
||
|
* By adjusting these parameters, you can make MicroLock behave as
|
||
|
* much or as little like a conventional spinlock as you'd like.
|
||
|
*
|
||
|
* Performance
|
||
|
* -----------
|
||
|
*
|
||
|
* With the default template options, the timings for uncontended
|
||
|
* acquire-then-release come out as follows on Intel(R) Xeon(R) CPU
|
||
|
* E5-2660 0 @ 2.20GHz, in @mode/opt, as of the master tree at Tue, 01
|
||
|
* Mar 2016 19:48:15.
|
||
|
*
|
||
|
* ========================================================================
|
||
|
* folly/test/SmallLocksBenchmark.cpp relative time/iter iters/s
|
||
|
* ========================================================================
|
||
|
* MicroSpinLockUncontendedBenchmark 13.46ns 74.28M
|
||
|
* PicoSpinLockUncontendedBenchmark 14.99ns 66.71M
|
||
|
* MicroLockUncontendedBenchmark 27.06ns 36.96M
|
||
|
* StdMutexUncontendedBenchmark 25.18ns 39.72M
|
||
|
* VirtualFunctionCall 1.72ns 579.78M
|
||
|
* ========================================================================
|
||
|
*
|
||
|
* (The virtual dispatch benchmark is provided for scale.)
|
||
|
*
|
||
|
* While the uncontended case for MicroLock is competitive with the
|
||
|
* glibc 2.2.0 implementation of std::mutex, std::mutex is likely to be
|
||
|
* faster in the contended case, because we need to wake up all waiters
|
||
|
* when we release.
|
||
|
*
|
||
|
* Make sure to benchmark your particular workload.
|
||
|
*
|
||
|
*/
|
||
|
|
||
|
class MicroLockCore {
|
||
|
protected:
|
||
|
#if defined(__SANITIZE_ADDRESS__) && !defined(__clang__) && \
|
||
|
(defined(__GNUC__) || defined(__GNUG__))
|
||
|
uint32_t lock_;
|
||
|
#else
|
||
|
uint8_t lock_;
|
||
|
#endif
|
||
|
inline detail::Futex<>* word() const; // Well, halfword on 64-bit systems
|
||
|
inline uint32_t baseShift(unsigned slot) const;
|
||
|
inline uint32_t heldBit(unsigned slot) const;
|
||
|
inline uint32_t waitBit(unsigned slot) const;
|
||
|
static void lockSlowPath(
|
||
|
uint32_t oldWord,
|
||
|
detail::Futex<>* wordPtr,
|
||
|
uint32_t slotHeldBit,
|
||
|
unsigned maxSpins,
|
||
|
unsigned maxYields);
|
||
|
|
||
|
public:
|
||
|
inline void unlock(unsigned slot) NO_SANITIZE_ADDRESS;
|
||
|
inline void unlock() {
|
||
|
unlock(0);
|
||
|
}
|
||
|
// Initializes all the slots.
|
||
|
inline void init() {
|
||
|
lock_ = 0;
|
||
|
}
|
||
|
};
|
||
|
|
||
|
/**
 * Returns the address of lock_ rounded down to a multiple of
 * sizeof(uint32_t), reinterpreted as a futex word. The returned pointer is
 * non-const even though this member function is const.
 */
inline detail::Futex<>* MicroLockCore::word() const {
  // Clearing the low address bits rounds down to the enclosing aligned word.
  const uintptr_t alignMask = ~(sizeof(uint32_t) - 1);
  const uintptr_t wordAddress =
      reinterpret_cast<uintptr_t>(&lock_) & alignMask;
  return reinterpret_cast<detail::Futex<>*>(wordAddress);
}
|
||
|
|
||
|
inline unsigned MicroLockCore::baseShift(unsigned slot) const {
|
||
|
assert(slot < CHAR_BIT / 2);
|
||
|
|
||
|
unsigned offset_bytes = (unsigned)((uintptr_t)&lock_ - (uintptr_t)word());
|
||
|
|
||
|
return (
|
||
|
unsigned)(kIsLittleEndian ? offset_bytes * CHAR_BIT + slot * 2 : CHAR_BIT * (sizeof(uint32_t) - offset_bytes - 1) + slot * 2);
|
||
|
}
|
||
|
|
||
|
inline uint32_t MicroLockCore::heldBit(unsigned slot) const {
|
||
|
return 1U << (baseShift(slot) + 0);
|
||
|
}
|
||
|
|
||
|
inline uint32_t MicroLockCore::waitBit(unsigned slot) const {
|
||
|
return 1U << (baseShift(slot) + 1);
|
||
|
}
|
||
|
|
||
|
void MicroLockCore::unlock(unsigned slot) {
|
||
|
detail::Futex<>* wordPtr = word();
|
||
|
uint32_t oldWord;
|
||
|
uint32_t newWord;
|
||
|
|
||
|
oldWord = wordPtr->load(std::memory_order_relaxed);
|
||
|
do {
|
||
|
assert(oldWord & heldBit(slot));
|
||
|
newWord = oldWord & ~(heldBit(slot) | waitBit(slot));
|
||
|
} while (!wordPtr->compare_exchange_weak(
|
||
|
oldWord, newWord, std::memory_order_release, std::memory_order_relaxed));
|
||
|
|
||
|
if (oldWord & waitBit(slot)) {
|
||
|
detail::futexWake(wordPtr, 1, heldBit(slot));
|
||
|
}
|
||
|
}
|
||
|
|
||
|
template <unsigned MaxSpins = 1000, unsigned MaxYields = 0>
|
||
|
class MicroLockBase : public MicroLockCore {
|
||
|
public:
|
||
|
inline void lock(unsigned slot) NO_SANITIZE_ADDRESS;
|
||
|
inline void lock() {
|
||
|
lock(0);
|
||
|
}
|
||
|
inline bool try_lock(unsigned slot) NO_SANITIZE_ADDRESS;
|
||
|
inline bool try_lock() {
|
||
|
return try_lock(0);
|
||
|
}
|
||
|
};
|
||
|
|
||
|
template <unsigned MaxSpins, unsigned MaxYields>
|
||
|
bool MicroLockBase<MaxSpins, MaxYields>::try_lock(unsigned slot) {
|
||
|
// N.B. You might think that try_lock is just the fast path of lock,
|
||
|
// but you'd be wrong. Keep in mind that other parts of our host
|
||
|
// word might be changing while we take the lock! We're not allowed
|
||
|
// to fail spuriously if the lock is in fact not held, even if other
|
||
|
// people are concurrently modifying other parts of the word.
|
||
|
//
|
||
|
// We need to loop until we either see firm evidence that somebody
|
||
|
// else has the lock (by looking at heldBit) or see our CAS succeed.
|
||
|
// A failed CAS by itself does not indicate lock-acquire failure.
|
||
|
|
||
|
detail::Futex<>* wordPtr = word();
|
||
|
uint32_t oldWord = wordPtr->load(std::memory_order_relaxed);
|
||
|
do {
|
||
|
if (oldWord & heldBit(slot)) {
|
||
|
return false;
|
||
|
}
|
||
|
} while (!wordPtr->compare_exchange_weak(
|
||
|
oldWord,
|
||
|
oldWord | heldBit(slot),
|
||
|
std::memory_order_acquire,
|
||
|
std::memory_order_relaxed));
|
||
|
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
template <unsigned MaxSpins, unsigned MaxYields>
|
||
|
void MicroLockBase<MaxSpins, MaxYields>::lock(unsigned slot) {
|
||
|
static_assert(MaxSpins + MaxYields < (unsigned)-1, "overflow");
|
||
|
|
||
|
detail::Futex<>* wordPtr = word();
|
||
|
uint32_t oldWord;
|
||
|
oldWord = wordPtr->load(std::memory_order_relaxed);
|
||
|
if ((oldWord & heldBit(slot)) == 0 &&
|
||
|
wordPtr->compare_exchange_weak(
|
||
|
oldWord,
|
||
|
oldWord | heldBit(slot),
|
||
|
std::memory_order_acquire,
|
||
|
std::memory_order_relaxed)) {
|
||
|
// Fast uncontended case: memory_order_acquire above is our barrier
|
||
|
} else {
|
||
|
// lockSlowPath doesn't have any slot-dependent computation; it
|
||
|
// just shifts the input bit. Make sure its shifting produces the
|
||
|
// same result a call to waitBit for our slot would.
|
||
|
assert(heldBit(slot) << 1 == waitBit(slot));
|
||
|
// lockSlowPath emits its own memory barrier
|
||
|
lockSlowPath(oldWord, wordPtr, heldBit(slot), MaxSpins, MaxYields);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// MicroLock is MicroLockBase with its default template arguments.
using MicroLock = MicroLockBase<>;
|
||
|
} // namespace folly
|