/*
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <folly/synchronization/DistributedMutex.h>

#include <folly/ConstexprMath.h>
#include <folly/Likely.h>
#include <folly/Portability.h>
#include <folly/ScopeGuard.h>
#include <folly/Utility.h>
#include <folly/chrono/Hardware.h>
#include <folly/detail/Futex.h>
#include <folly/functional/Invoke.h>
#include <folly/lang/Align.h>
#include <folly/lang/Bits.h>
#include <folly/portability/Asm.h>
#include <folly/synchronization/AtomicNotification.h>
#include <folly/synchronization/AtomicUtil.h>
#include <folly/synchronization/detail/InlineFunctionRef.h>
#include <folly/synchronization/detail/Sleeper.h>

#include <glog/logging.h>

#include <array>
#include <atomic>
#include <cstdint>
#include <limits>
#include <stdexcept>
#include <thread>
#include <utility>

namespace folly {
namespace detail {
namespace distributed_mutex {
// kUnlocked is used to show the unlocked state
//
// When locking threads encounter kUnlocked in the underlying storage, they
// can just acquire the lock without any further effort
constexpr auto kUnlocked = std::uintptr_t{0b0};
// kLocked is used to show that the mutex is currently locked, and future
// attempts to lock the mutex should enqueue on the central storage
//
// Locking threads find this on central storage only when there is a
// contention chain that is undergoing wakeups; in every other case, a locker
// will either find kUnlocked or an arbitrary address with the kLocked bit set
constexpr auto kLocked = std::uintptr_t{0b1};
// kTimedWaiter is set when there is at least one timed waiter on the mutex
//
// Timed waiters do not follow the sleeping strategy employed by regular,
// non-timed threads. They sleep on the central mutex atomic through an
// extended futex() interface that allows sleeping with the same semantics for
// non-standard integer widths
//
// When a regular non-timed thread unlocks or enqueues on the mutex, and sees
// a timed waiter, it takes ownership of all the timed waiters. The thread
// that has taken ownership of the timed waiters releases them when it gets a
// chance at the critical section, at which point it issues a wakeup to a
// single timed waiter; timed waiters always issue wake() calls to other
// timed waiters
constexpr auto kTimedWaiter = std::uintptr_t{0b10};

// kUninitialized means that the thread has just enqueued, and has not yet
// gotten to initializing itself with the address of its successor
//
// this becomes significant for threads that are trying to wake up the
// uninitialized thread, if they see that the thread is not yet initialized,
// they can do nothing but spin, and wait for the thread to get initialized
//
// This also plays a role in the functioning of flat combining as implemented
// in DistributedMutex. When a thread owning the lock goes through the
// contention chain to either unlock the mutex or combine critical sections
// from the other end, the presence of kUninitialized means that the
// combining thread is not able to make progress after this point, so we
// transfer the lock.
constexpr auto kUninitialized = std::uint32_t{0b0};
// kWaiting will be set in a waiter's futex struct while it spins waiting for
// the mutex
constexpr auto kWaiting = std::uint32_t{0b1};
// kWake will be set by threads that are waking up waiters that have enqueued
constexpr auto kWake = std::uint32_t{0b10};
// kSkipped will be set by a waker when it sees that a waiter has been
// preempted away by the kernel, in this case the thread that got skipped will
// have to wake up and put itself back on the queue
constexpr auto kSkipped = std::uint32_t{0b11};
// kAboutToWait will be set by a waiter that enqueues itself with the purpose
// of waiting on a futex
constexpr auto kAboutToWait = std::uint32_t{0b100};
// kSleeping will be set by a waiter right before enqueueing on a futex. When
// a thread wants to wake up a waiter that has enqueued on a futex, it should
// set the futex to contain kWake
//
// a thread that is unlocking and wants to skip over a sleeping thread also
// calls futex_.exchange(kSleeping) on the sleeping thread's futex word. It
// does this to 1. detect whether the sleeping thread had actually gone to
// sleep on the futex word so it can skip it, and 2. to synchronize with
// other non atomic writes in the sleeping thread's context (such as the write
// to track the next waiting thread).
//
// We reuse kSleeping instead of say using another constant kEarlyDelivery to
// avoid situations where a thread has to enter kernel mode due to calling
// futexWait() twice because of the presence of a waking thread. This
// situation can arise when an unlocking thread goes to skip over a sleeping
// thread, sees that the thread has slept and moves on, but the sleeping
// thread had not yet entered futex(). This interleaving causes the thread
// calling futex() to return spuriously, as the futex word is not what it
// should be
constexpr auto kSleeping = std::uint32_t{0b101};
// kCombined is set by the lock holder to let the waiter thread know that its
// combine request was successfully completed by the lock holder. A
// successful combine means that the thread requesting the combine operation
// does not need to unlock the mutex; in fact, doing so would be an error.
constexpr auto kCombined = std::uint32_t{0b111};
// kCombineUninitialized is like kUninitialized but is set by a thread when it
// enqueues in hopes of getting its critical section combined with the lock
// holder
constexpr auto kCombineUninitialized = std::uint32_t{0b1000};
// kCombineWaiting is set by a thread when it is ready to have its combine
// record fulfilled by the lock holder. In particular, this signals to the
// lock holder that the thread has set its next_ pointer in the contention
// chain
constexpr auto kCombineWaiting = std::uint32_t{0b1001};
// kExceptionOccurred is set on the waiter futex when the remote task throws
// an exception. It is the caller's responsibility to retrieve the exception
// and rethrow it in their own context. Note that when the caller uses a
// noexcept function as their critical section, they can avoid checking for
// this value
//
// This allows us to avoid all cost of exceptions in the memory layout of the
// fast path (no errors) as exceptions are stored as an std::exception_ptr in
// the same union that stores the return value of the critical section. We
// also avoid all CPU overhead because the combiner uses a try-catch block
// without any additional branching to handle exceptions
constexpr auto kExceptionOccurred = std::uint32_t{0b1010};

// The amount of time a thread is allowed to be scheduled away between spins
// before we resort to marking it as having slept
//
// This is just a magic number from benchmarks
constexpr auto kScheduledAwaySpinThreshold = std::chrono::nanoseconds{200};
// The maximum number of spins before a thread starts yielding its processor
// in hopes of getting skipped
constexpr auto kMaxSpins = 4000;
// The maximum number of contention chains we can resolve with flat combining.
// After this number of contention chains, the mutex falls back to regular
// two-phased mutual exclusion to ensure that we don't starve the combiner
// thread
constexpr auto kMaxCombineIterations = 2;

/**
 * Write only data that is available to the thread that is waking up another.
 * Only the waking thread is allowed to write to this, the thread to be woken
 * is allowed to read from this after a wakeup has been issued
 */
template <template <typename> class Atomic>
class WakerMetadata {
 public:
  // This is the thread that initiated wakeups for the contention chain.
  // There can only ever be one thread that initiates the wakeup for a
  // chain in the spin only version of this mutex. When a thread that just
  // woke up sees this as the next thread to wake up, it knows that it is the
  // terminal node in the contention chain. This means that it was the one
  // that took the thread that had acquired the mutex off the centralized
  // state. Therefore, the current thread is the last in its contention
  // chain. It will fall back to centralized storage to pick up the next
  // waiter or release the mutex
  //
  // When we move to a full sleeping implementation, this might need to change
  // to a small_vector<> to account for failed wakeups, or we can put threads
  // to sleep on the central futex, which is an easier implementation
  // strategy. Although, since this is allocated on the stack, we can set a
  // prohibitively large threshold to avoid heap allocations; this strategy,
  // however, might cause increased cache misses on wakeup signalling
  std::uintptr_t waker_{0};
  // the list of threads that the waker had previously seen to be sleeping on
  // a futex()
  //
  // this is given to the current thread as a means to pass on
  // information. When the current thread goes to unlock the mutex and does
  // not see contention, it should go and wake up the head of this list. If
  // the current thread sees a contention chain on the mutex, it should pass
  // on this list to the next thread that gets woken up
  std::uintptr_t waiters_{0};
  // The futex that this waiter will sleep on
  //
  // how can we reuse futex_ from above for futex management?
  Futex<Atomic> sleeper_{kUninitialized};
};

/**
 * Type of the type-erased callable that is used for combining from the lock
 * holder's end. This has 48 bytes of inline storage that can be used to
 * minimize cache misses when combining
 */
using CombineFunction = detail::InlineFunctionRef<void(), 48>;

/**
 * Waiter encapsulates the state required for waiting on the mutex, this
 * contains potentially heavy state and is intended to be allocated on the
 * stack as part of a lock() function call
 *
 * To ensure that synchronization does not cause unintended side effects on
 * the rest of the thread stack (eg. metadata in lockImplementation(), or any
 * other data in the user's thread), we aggressively pad this struct and use
 * custom alignment internally to ensure that the relevant data fits within a
 * single cacheline. The added alignment here also gives us some room to
 * wiggle in the bottom few bits of the mutex, where we store extra metadata
 */
template <template <typename> class Atomic>
class Waiter {
 public:
  Waiter() {}
  Waiter(Waiter&&) = delete;
  Waiter(const Waiter&) = delete;
  Waiter& operator=(Waiter&&) = delete;
  Waiter& operator=(const Waiter&) = delete;

  void initialize(std::uint64_t futex, CombineFunction task) {
    // we only initialize the function if we were actually given a non-null
    // task, otherwise we initialize the waker metadata
    if (task) {
      DCHECK_EQ(futex, kCombineUninitialized);
      new (&function_) CombineFunction{task};
    } else {
      DCHECK((futex == kUninitialized) || (futex == kAboutToWait));
      new (&metadata_) WakerMetadata<Atomic>{};
    }

    // this pedantic store is needed to ensure that the waking thread
    // synchronizes with the state in the waiter struct when it loads the
    // value of the futex word
    //
    // on x86, this gets optimized away to just a regular store; it might be
    // needed on platforms where explicit acquire-release barriers are
    // required for synchronization
    //
    // note that we release here at the end of initialization because
    // construction is complete here, any thread that acquires this release
    // will see a well constructed wait node
    futex_.store(futex, std::memory_order_release);
  }

  std::array<std::uint8_t, hardware_destructive_interference_size> padding1;
  // the atomic that this thread will spin on while waiting for the mutex to
  // be unlocked
  alignas(hardware_destructive_interference_size) Atomic<std::uint64_t> futex_{
      kUninitialized};
  // The successor of this node. This will be the thread that had its address
  // on the mutex previously
  //
  // We can do without making this atomic since the remote thread synchronizes
  // on the futex variable above. If this were not atomic, the remote thread
  // would only be allowed to read from it after the waiter has moved into the
  // waiting state to avoid risk of a load racing with a write. However, it
  // helps to make this atomic because we can use an unconditional load and make
  // full use of the load buffer to coalesce both reads into a single clock
  // cycle after the line arrives in the combiner core. This is a heavily
  // contended line, so an RFO from the enqueueing thread is highly likely and
  // has the potential to cause an immediate invalidation; blocking the combiner
  // thread from making progress until the line is pulled back to read this
  // value
  //
  // Further, making this atomic prevents the compiler from making an incorrect
  // optimization where it does not load the value as written in the code, but
  // rather dereferences it through a pointer whenever needed (since the value
  // of the pointer to this is readily available on the stack). Doing this
  // causes multiple invalidation requests from the enqueueing thread, blocking
  // remote progress
  //
  // Note that we use relaxed loads and stores, so this should not have any
  // additional overhead compared to a regular load on most architectures
  std::atomic<std::uintptr_t> next_{0};
  // We use an anonymous union for the combined critical section request and
  // the metadata that will be filled in from the leader's end. Only one is
  // active at a time - if a leader decides to combine the requested critical
  // section into its execution, it will not touch the metadata field. If a
  // leader decides to migrate the lock to the waiter, it will not touch the
  // function
  //
  // this allows us to transfer more state when combining a critical section
  // and reduce the cache misses originating from executing an arbitrary
  // lambda
  //
  // note that this is an anonymous union, not an unnamed union, the members
  // leak into the surrounding scope
  union {
    // metadata for the waker
    WakerMetadata<Atomic> metadata_;
    // The critical section that can potentially be combined into the critical
    // section of the locking thread
    //
    // This is kept as a FunctionRef because the original function is preserved
    // until the lock_combine() function returns. A consequence of using
    // FunctionRef here is that we don't need to do any allocations and can
    // allow users to capture unbounded state into the critical section. Flat
    // combining means that the user does not have access to the thread
    // executing the critical section, so assumptions about thread local
    // references can be invalidated. Being able to capture arbitrary state
    // allows the user to do thread local accesses right before the critical
    // section and pass them as state to the callable being referenced here
    CombineFunction function_;
    // The user is allowed to use a combined critical section that returns a
    // value. This buffer is used to implement the value transfer to the
    // waiting thread. We reuse the same union because this helps us combine
    // one synchronization operation with a material value transfer.
    //
    // The waker thread needs to synchronize on this cacheline to issue a
    // wakeup to the waiter, meaning that the entire line needs to be pulled
    // into the remote core in exclusive mode. So we reuse the coherence
    // operation to transfer the return value in addition to the
    // synchronization signal. In the case that the user's data item is
    // small, the data is transferred all inline as part of the same line,
    // which pretty much arrives into the CPU cache in the same clock cycle or
    // two after a read-for-ownership request. This gives us a high chance of
    // coalescing the entire transitive store buffer together into one cache
    // coherence operation from the waker's end. This allows us to make use
    // of the CPU bus bandwidth which would have otherwise gone to waste.
    // Benchmarks prove this theory under a wide range of contention, value
    // sizes, NUMA interactions and processor models
    //
    // The current version of the Intel optimization manual confirms this
    // theory somewhat as well in section 2.3.5.1 (Load and Store Operation
    // Overview)
    //
    //   When an instruction writes data to a memory location [...], the
    //   processor ensures that it has the line containing this memory location
    //   is in its L1d cache [...]. If the cache line is not there, it fetches
    //   from the next levels using a RFO request [...] RFO and storing the
    //   data happens after instruction retirement. Therefore, the store
    //   latency usually does not affect the store instruction itself
    //
    // This gives the user the ability to input up to 48 bytes into the
    // combined critical section through an InlineFunctionRef and output 48
    // bytes from it basically without any cost. The type of the entity
    // stored in the buffer has to be matched by the type erased callable that
    // the caller has used. At this point, the caller is still in the
    // template instantiation leading to the combine request, so it has
    // knowledge of the return type and can apply the appropriate
    // reinterpret_cast and launder operation to safely retrieve the data from
    // this buffer
    std::aligned_storage_t<48, 8> storage_;
  };
  std::array<std::uint8_t, hardware_destructive_interference_size> padding2;
};

/**
 * A template that helps us differentiate between the different ways to return
 * a value from a combined critical section. A return value of type void
 * cannot be stored anywhere, so we use specializations and pick the right one
 * switched through std::conditional_t
 *
 * This is then used by CoalescedTask and its family of functions to implement
 * efficient return value transfers to the waiting threads
 */
template <typename Func>
class RequestWithReturn {
 public:
  using F = Func;
  using ReturnType = folly::invoke_result_t<const Func&>;
  explicit RequestWithReturn(Func func) : func_{std::move(func)} {}

  /**
   * We need to define the destructor here because C++ requires (with good
   * reason) that a union with non-default destructor be explicitly destroyed
   * from the surrounding class, as neither the runtime nor compiler have the
   * knowledge of what to do with a union at the time of destruction
   *
   * Each request that has a valid return value set will have the value
   * retrieved from the get() method, where the value is destroyed. So we
   * don't need to destroy it here
   */
  ~RequestWithReturn() {}

  /**
   * This method can be used to return a value from the request. This returns
   * the underlying value because the return type of the function we were
   * instantiated with is not void
   */
  ReturnType get() && {
    // when the return value has been processed, we destroy the value
    // contained in this request. Using a scope_exit means that we don't have
    // to worry about storing the value somewhere and causing potentially an
    // extra move
    //
    // note that the invariant here is that this function is only called if the
    // requesting thread had its critical section combined, and the value_
    // member constructed through detach()
    SCOPE_EXIT {
      value_.~ReturnType();
    };
    return std::move(value_);
  }

  // this contains a copy of the function the waiter had requested to be
  // executed as a combined critical section
  Func func_;
  // this stores the return value used in the request, we use a union here to
  // avoid laundering and allow return types that are not default
  // constructible to be propagated through the execution of the critical
  // section
  //
  // note that this is an anonymous union, the member leaks into the
  // surrounding scope as a member variable
  union {
    ReturnType value_;
  };
};

template <typename Func>
class RequestWithoutReturn {
 public:
  using F = Func;
  using ReturnType = void;
  explicit RequestWithoutReturn(Func func) : func_{std::move(func)} {}

  /**
   * In this version of the request class, get() returns nothing as there is
   * no stored value
   */
  void get() && {}

  // this contains a copy of the function the waiter had requested to be
  // executed as a combined critical section
  Func func_;
};

// we need to use std::integral_constant::value here as opposed to
// std::integral_constant::operator T() because MSVC errors out with the
// implicit conversion
template <typename Func>
using Request = std::conditional_t<
    std::is_void<folly::invoke_result_t<const Func&>>::value,
    RequestWithoutReturn<Func>,
    RequestWithReturn<Func>>;
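
// An illustrative sketch (not part of the implementation): the alias picks
// RequestWithoutReturn for callables returning void and RequestWithReturn
// for everything else, so for example
//
//   static_assert(
//       std::is_same<Request<void (*)()>, RequestWithoutReturn<void (*)()>>::
//           value,
//       "");
//   static_assert(
//       std::is_same<Request<int (*)()>, RequestWithReturn<int (*)()>>::value,
//       "");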

/**
 * A template that helps us to transform a callable returning a value to one
 * that returns void so it can be type erased and passed on to the waker. If
 * the return value is small enough, it gets coalesced into the wait struct
 * for optimal data transfer. When it's not small enough to fit in the waiter
 * storage buffer, we place it on its own cacheline with isolation to prevent
 * false-sharing with the on-stack metadata of the waiter thread
 *
 * This helps a combined critical section feel more normal in the case where
 * the user wants to return a value, for example
 *
 *   auto value = mutex_.lock_combine([&]() {
 *     return data_.value();
 *   });
 *
 * Without this, the user would typically create a dummy object that they
 * would then assign to from within the lambda. With return value chaining,
 * this pattern feels more natural
 *
 * Note that it is important to copy the entire callable into this class.
 * Storing something like a reference instead is not desirable because it does
 * not allow InlineFunctionRef to use inline storage to represent the user's
 * callable without extra indirections
 *
 * We use std::conditional_t and switch to the right type of task with the
 * CoalescedTask type alias
 */
template <typename Func, typename Waiter>
class TaskWithCoalesce {
 public:
  using ReturnType = folly::invoke_result_t<const Func&>;
  using StorageType = folly::Unit;
  explicit TaskWithCoalesce(Func func, Waiter& waiter)
      : func_{std::move(func)}, waiter_{waiter} {}

  void operator()() const {
    auto value = func_();
    new (&waiter_.storage_) ReturnType{std::move(value)};
  }

 private:
  Func func_;
  Waiter& waiter_;

  static_assert(!std::is_void<ReturnType>{}, "");
  static_assert(alignof(decltype(waiter_.storage_)) >= alignof(ReturnType), "");
  static_assert(sizeof(decltype(waiter_.storage_)) >= sizeof(ReturnType), "");
};

template <typename Func, typename Waiter>
class TaskWithoutCoalesce {
 public:
  using ReturnType = void;
  using StorageType = folly::Unit;
  explicit TaskWithoutCoalesce(Func func, Waiter&) : func_{std::move(func)} {}

  void operator()() const {
    func_();
  }

 private:
  Func func_;
};

template <typename Func, typename Waiter>
class TaskWithBigReturnValue {
 public:
  // Using storage that is aligned on the cacheline boundary helps us avoid a
  // situation where the data ends up being allocated on two separate
  // cachelines. This would require the remote thread to pull in both lines
  // to issue a write.
  //
  // We also isolate the storage by appending some padding to the end to
  // ensure we avoid false-sharing with the metadata used while the waiter
  // waits
  using ReturnType = folly::invoke_result_t<const Func&>;
  static const auto kReturnValueAlignment = folly::constexpr_max(
      alignof(ReturnType),
      folly::hardware_destructive_interference_size);
  using StorageType = std::aligned_storage_t<
      sizeof(std::aligned_storage_t<sizeof(ReturnType), kReturnValueAlignment>),
      kReturnValueAlignment>;

  explicit TaskWithBigReturnValue(Func func, Waiter&)
      : func_{std::move(func)} {}

  void operator()() const {
    DCHECK(storage_);
    auto value = func_();
    new (storage_) ReturnType{std::move(value)};
  }

  void attach(StorageType* storage) {
    DCHECK(!storage_);
    storage_ = storage;
  }

 private:
  Func func_;
  StorageType* storage_{nullptr};

  static_assert(!std::is_void<ReturnType>{}, "");
  static_assert(sizeof(Waiter::storage_) < sizeof(ReturnType), "");
};

template <typename T, bool>
struct Sizeof_;
template <typename T>
struct Sizeof_<T, false> : index_constant<sizeof(T)> {};
template <typename T>
struct Sizeof_<T, true> : index_constant<0> {};
template <typename T>
struct Sizeof : Sizeof_<T, std::is_void<T>::value> {};

// we need to use std::integral_constant::value here as opposed to
// std::integral_constant::operator T() because MSVC errors out with the
// implicit conversion
template <typename Func, typename Waiter>
using CoalescedTask = std::conditional_t<
    std::is_void<folly::invoke_result_t<const Func&>>::value,
    TaskWithoutCoalesce<Func, Waiter>,
    std::conditional_t<
        Sizeof<folly::invoke_result_t<const Func&>>::value <=
            sizeof(Waiter::storage_),
        TaskWithCoalesce<Func, Waiter>,
        TaskWithBigReturnValue<Func, Waiter>>>;
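
// An illustrative sketch (not part of the implementation) of how the alias
// dispatches on the return value: void callables pick TaskWithoutCoalesce,
// values that fit in the 48 byte waiter buffer pick TaskWithCoalesce, and
// anything larger falls back to TaskWithBigReturnValue
//
//   using W = Waiter<std::atomic>;
//   struct Big { std::array<std::uint8_t, 64> bytes; };
//   static_assert(
//       std::is_same<
//           CoalescedTask<void (*)(), W>,
//           TaskWithoutCoalesce<void (*)(), W>>::value,
//       "");
//   static_assert(
//       std::is_same<
//           CoalescedTask<int (*)(), W>,
//           TaskWithCoalesce<int (*)(), W>>::value,
//       "");
//   static_assert(
//       std::is_same<
//           CoalescedTask<Big (*)(), W>,
//           TaskWithBigReturnValue<Big (*)(), W>>::value,
//       "");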

/**
 * Given a request and a wait node, coalesce them into a CoalescedTask that
 * coalesces the return value into the wait node when invoked from a remote
 * thread
 *
 * When given a null request through nullptr_t, coalesce() returns null as well
 */
template <typename Waiter>
std::nullptr_t coalesce(std::nullptr_t&, Waiter&) {
  return nullptr;
}

template <
    typename Request,
    typename Waiter,
    typename Func = typename Request::F>
CoalescedTask<Func, Waiter> coalesce(Request& request, Waiter& waiter) {
  static_assert(!std::is_same<Request, std::nullptr_t>{}, "");
  return CoalescedTask<Func, Waiter>{request.func_, waiter};
}

/**
 * Given a task, create storage for the return value. When we get a type
 * of CoalescedTask, this returns an instance of CoalescedTask::StorageType.
 * std::nullptr_t otherwise
 */
inline std::nullptr_t makeReturnValueStorageFor(std::nullptr_t&) {
  return {};
}

template <
    typename CoalescedTask,
    typename StorageType = typename CoalescedTask::StorageType>
StorageType makeReturnValueStorageFor(CoalescedTask&) {
  return {};
}

/**
 * Given a task and storage, attach them together if needed. This only helps
 * when we have a task that returns a value bigger than can be coalesced. In
 * that case, we need to attach the storage with the task so the return value
 * can be transferred to this thread from the remote thread
 */
template <typename Task, typename Storage>
void attach(Task&, Storage&) {
  static_assert(
      std::is_same<Storage, std::nullptr_t>{} ||
          std::is_same<Storage, folly::Unit>{},
      "");
}

template <
    typename R,
    typename W,
    typename StorageType = typename TaskWithBigReturnValue<R, W>::StorageType>
void attach(TaskWithBigReturnValue<R, W>& task, StorageType& storage) {
  task.attach(&storage);
}

template <typename Request, typename Waiter>
void throwIfExceptionOccurred(Request&, Waiter& waiter, bool exception) {
  using Storage = decltype(waiter.storage_);
  using F = typename Request::F;
  static_assert(sizeof(Storage) >= sizeof(std::exception_ptr), "");
  static_assert(alignof(Storage) >= alignof(std::exception_ptr), "");

  // we only need to check for an exception in the waiter struct if the passed
  // callable is not noexcept
  //
  // we need to make another instance of the exception with automatic storage
  // duration and destroy the exception held in the storage *before throwing* to
  // avoid leaks. If we don't destroy the exception_ptr in storage, the
  // refcount for the internal exception will never hit zero, thereby leaking
  // memory
  if (UNLIKELY(!folly::is_nothrow_invocable_v<const F&> && exception)) {
    auto storage = &waiter.storage_;
    auto exc = folly::launder(reinterpret_cast<std::exception_ptr*>(storage));
    auto copy = std::move(*exc);
    exc->std::exception_ptr::~exception_ptr();
    std::rethrow_exception(std::move(copy));
  }
}

/**
 * Given a CoalescedTask, a wait node and a request, detach the return value
 * into the request from the wait node and task.
 */
template <typename Waiter>
void detach(std::nullptr_t&, Waiter&, bool exception, std::nullptr_t&) {
  DCHECK(!exception);
}

template <typename Waiter, typename F>
void detach(
    RequestWithoutReturn<F>& request,
    Waiter& waiter,
    bool exception,
    folly::Unit&) {
  throwIfExceptionOccurred(request, waiter, exception);
}

template <typename Waiter, typename F>
void detach(
    RequestWithReturn<F>& request,
    Waiter& waiter,
    bool exception,
    folly::Unit&) {
  throwIfExceptionOccurred(request, waiter, exception);

  using ReturnType = typename RequestWithReturn<F>::ReturnType;
  static_assert(!std::is_same<ReturnType, void>{}, "");
  static_assert(sizeof(waiter.storage_) >= sizeof(ReturnType), "");

  auto& val = *folly::launder(reinterpret_cast<ReturnType*>(&waiter.storage_));
  new (&request.value_) ReturnType{std::move(val)};
  val.~ReturnType();
}

template <typename Waiter, typename F, typename Storage>
void detach(
    RequestWithReturn<F>& request,
    Waiter& waiter,
    bool exception,
    Storage& storage) {
  throwIfExceptionOccurred(request, waiter, exception);

  using ReturnType = typename RequestWithReturn<F>::ReturnType;
  static_assert(!std::is_same<ReturnType, void>{}, "");
  static_assert(sizeof(storage) >= sizeof(ReturnType), "");

  auto& val = *folly::launder(reinterpret_cast<ReturnType*>(&storage));
  new (&request.value_) ReturnType{std::move(val)};
  val.~ReturnType();
}

/**
 * Get the time since epoch in nanoseconds
 *
 * This is faster than std::chrono::steady_clock because it avoids a VDSO
 * access to get the timestamp counter
 *
 * Note that the hardware timestamp counter on x86, like std::steady_clock, is
 * guaranteed to be monotonically increasing -
 * https://c9x.me/x86/html/file_module_x86_id_278.html
 */
inline std::chrono::nanoseconds time() {
  return std::chrono::nanoseconds{hardware_timestamp()};
}

/**
 * Zero out the other bits used by the implementation and return just an
 * address from a uintptr_t
 */
template <typename Type>
Type* extractPtr(std::uintptr_t from) {
  // shift one bit off the end, to get all 1s followed by a single 0
  auto mask = std::numeric_limits<std::uintptr_t>::max();
  mask >>= 1;
  mask <<= 1;
  CHECK(!(mask & 0b1));

  return folly::bit_cast<Type*>(from & mask);
}
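
// For illustration only: the mutex state word stores a Waiter address with
// the kLocked bit OR-ed into its lowest bit, and extractPtr() masks that
// metadata back off, so a round trip looks roughly like
//
//   Waiter<std::atomic> node{};
//   auto tagged = folly::bit_cast<std::uintptr_t>(&node) | kLocked;
//   DCHECK_EQ(extractPtr<Waiter<std::atomic>>(tagged), &node);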

/**
 * Strips the given nanoseconds down to its least significant 56 bits by
 * shifting it left by 8, leaving the bottom 8 bits zeroed out to be used as a
 * medium of information transfer for the thread wait nodes
 */
inline std::uint64_t strip(std::chrono::nanoseconds t) {
  auto time = t.count();
  return static_cast<std::uint64_t>(time) << 8;
}

/**
 * Recover the timestamp value from an integer that has the timestamp encoded
 * in it
 */
inline std::uint64_t recover(std::uint64_t from) {
  return from >> 8;
}
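
// For illustration only: a waiter's futex word carries a 56 bit timestamp in
// its upper bits and an 8 bit wait mode in its bottom bits, so a round trip
// looks roughly like
//
//   auto stamp = std::chrono::nanoseconds{1234};
//   auto word = strip(stamp) | kCombineWaiting;
//   DCHECK_EQ(word & 0xff, kCombineWaiting);
//   DCHECK_EQ(recover(word), 1234);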

template <template <typename> class Atomic, bool TimePublishing>
class DistributedMutex<Atomic, TimePublishing>::DistributedMutexStateProxy {
 public:
  // DistributedMutexStateProxy is move constructible and assignable for
  // convenience
  DistributedMutexStateProxy(DistributedMutexStateProxy&& other) {
    *this = std::move(other);
  }

  DistributedMutexStateProxy& operator=(DistributedMutexStateProxy&& other) {
    DCHECK(!(*this)) << "Cannot move into a valid DistributedMutexStateProxy";

    next_ = std::exchange(other.next_, nullptr);
    expected_ = std::exchange(other.expected_, 0);
    timedWaiters_ = std::exchange(other.timedWaiters_, false);
    combined_ = std::exchange(other.combined_, false);
    waker_ = std::exchange(other.waker_, 0);
    waiters_ = std::exchange(other.waiters_, nullptr);
    ready_ = std::exchange(other.ready_, nullptr);

    return *this;
  }

  // The proxy is valid when a mutex acquisition attempt was successful,
  // lock() is guaranteed to return a valid proxy, try_lock() is not
  explicit operator bool() const {
    return expected_;
  }

  // private:
  // friend the mutex class, since that will be accessing state private to
  // this class
  friend class DistributedMutex<Atomic, TimePublishing>;

  DistributedMutexStateProxy(
      Waiter<Atomic>* next,
      std::uintptr_t expected,
      bool timedWaiter = false,
      bool combined = false,
      std::uintptr_t waker = 0,
      Waiter<Atomic>* waiters = nullptr,
      Waiter<Atomic>* ready = nullptr)
      : next_{next},
        expected_{expected},
        timedWaiters_{timedWaiter},
        combined_{combined},
        waker_{waker},
        waiters_{waiters},
        ready_{ready} {}

  // the next thread that is to be woken up, this being null at the time of
  // unlock() shows that the current thread acquired the mutex without
  // contention or it was the terminal thread in the queue of threads waking up
  Waiter<Atomic>* next_{nullptr};
  // this is the value that the current thread should expect to find on
  // unlock, and if this value is not there on unlock, the current thread
  // should assume that other threads are enqueued waiting for the mutex
  //
  // note that if the mutex has the same state set at unlock time, and this is
  // set to an address (and not say kLocked in the case of a terminal waker)
  // then it must have been the case that no other thread had enqueued itself,
  // since threads in the domain of this mutex do not share stack space
  //
  // if we want to support stack sharing, we can solve the problem by looping
  // at lock time, and setting a variable that says whether we have acquired
  // the lock or not perhaps
  std::uintptr_t expected_{0};
  // a boolean that will be set when the mutex has timed waiters that the
  // current thread is responsible for waking, in such a case, the current
  // thread will issue an atomic_notify_one() call after unlocking the mutex
  //
  // note that a timed waiter will itself always have this flag set. This is
  // done so we can avoid having to issue an atomic_notify_all() call (and
  // subsequently a thundering herd) when waking up timed-wait threads
  bool timedWaiters_{false};
  // a boolean that contains true if the state proxy is not meant to be passed
  // to the unlock() function. This is set only when there is contention and
  // a thread had asked for its critical section to be combined
  bool combined_{false};
  // metadata passed along from the thread that woke this thread up
  std::uintptr_t waker_{0};
  // the list of threads that are waiting on a futex
  //
  // the current thread is meant to wake up this list of waiters if it is
  // able to commit an unlock() on the mutex without seeing a contention chain
  Waiter<Atomic>* waiters_{nullptr};
  // after a thread has woken up from a futex() call, it will have the rest of
  // the threads that were waiting behind it in this list, a thread that
  // unlocks has to wake up threads from this list if it has any, before it
  // goes to sleep to prevent pathological unfairness
  Waiter<Atomic>* ready_{nullptr};
};
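
// For illustration only, assuming the folly::DistributedMutex alias exposed
// by the public header: lock() hands back a proxy that must be moved into
// unlock(), roughly
//
//   folly::DistributedMutex mutex;
//   auto proxy = mutex.lock();      // lock() always returns a valid proxy
//   DCHECK(proxy);
//   mutex.unlock(std::move(proxy)); // the proxy is consumed by unlock()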

template <template <typename> class Atomic, bool TimePublishing>
DistributedMutex<Atomic, TimePublishing>::DistributedMutex()
    : state_{kUnlocked} {}

template <typename Waiter>
std::uint64_t publish(
    std::uint64_t spins,
    bool& shouldPublish,
    std::chrono::nanoseconds& previous,
    Waiter& waiter,
    std::uint32_t waitMode) {
  // time publishing has some overhead because it executes an atomic exchange on
  // the futex word. If this line is in a remote thread (eg. the combiner),
  // then each time we publish a timestamp, this thread has to submit an RFO to
  // the remote core for the cacheline, blocking progress for both threads.
  //
  // the remote core uses a store in the fast path - why then does an RFO make a
  // difference? The only educated guess we have here is that the added
  // roundtrip delays draining of the store buffer, which essentially exerts
  // backpressure on future stores, preventing parallelization
  //
  // if we have requested a combine, time publishing is less important as it
  // only comes into play when the combiner has exhausted their max combine
  // passes. So we defer time publishing to the point when the current thread
  // gets preempted
  auto current = time();
  if ((current - previous) >= kScheduledAwaySpinThreshold) {
    shouldPublish = true;
  }
  previous = current;

  // if we have requested a combine, and this is the first iteration of the
  // wait-loop, we publish a max timestamp to optimistically convey that we have
  // not yet been preempted (the remote knows the meaning of max timestamps)
  //
  // then if we are under the maximum number of spins allowed before sleeping,
  // we publish the exact timestamp, otherwise we publish the minimum possible
  // timestamp to force the waking thread to skip us
  auto now = ((waitMode == kCombineWaiting) && !spins)
      ? decltype(time())::max()
      : (spins < kMaxSpins) ? previous : decltype(time())::zero();

  // the wait mode information is published in the bottom 8 bits of the futex
  // word, the rest contains time information as computed above. Overflows are
  // not really a correctness concern because time publishing is only a
  // heuristic. This leaves us 56 bits of nanoseconds (2 years) before we hit
  // two consecutive wraparounds, so the lack of bits to represent time is
  // neither a performance nor correctness concern
  auto data = strip(now) | waitMode;
  auto signal = (shouldPublish || !spins || (waitMode != kCombineWaiting))
      ? waiter.futex_.exchange(data, std::memory_order_acq_rel)
      : waiter.futex_.load(std::memory_order_acquire);
  return signal & std::numeric_limits<std::uint8_t>::max();
}

template <typename Waiter>
bool spin(Waiter& waiter, std::uint32_t& sig, std::uint32_t mode) {
  auto spins = std::uint64_t{0};
  auto waitMode = (mode == kCombineUninitialized) ? kCombineWaiting : kWaiting;
  auto previous = time();
  auto shouldPublish = false;
  while (true) {
    auto signal = publish(spins++, shouldPublish, previous, waiter, waitMode);

    // if we got skipped, make a note of it and return if we got a skipped
    // signal or a signal to wake up
    auto skipped = (signal == kSkipped);
    auto combined = (signal == kCombined);
    auto exceptionOccurred = (signal == kExceptionOccurred);
    auto woken = (signal == kWake);
    if (skipped || woken || combined || exceptionOccurred) {
      sig = static_cast<std::uint32_t>(signal);
      return !skipped;
    }

    // if we are under the spin threshold, pause to allow the other
    // hyperthread to run. If not, then sleep
    if (spins < kMaxSpins) {
      asm_volatile_pause();
    } else {
      Sleeper::sleep();
    }
  }
}

template <typename Waiter>
void doFutexWake(Waiter* waiter) {
  if (waiter) {
    // We can use a simple store operation here and not worry about checking
    // to see if the thread had actually started waiting on the futex, that is
    // already done in tryWake() when a sleeping thread is collected
    //
    // We now do not know whether the waiter had already enqueued on the futex
    // or whether it had just stored kSleeping in its futex and was about to
    // call futexWait(). We treat both these scenarios the same
    //
    // the below can theoretically cause a problem if we set the
    // wake signal and the waiter was in between setting kSleeping in its
    // futex and enqueueing on the futex. In this case the waiter will just
    // return from futexWait() immediately. This leaves the address that the
    // waiter was using for futexWait() possibly dangling, and the thread that
    // we woke in the exchange above might have used that address for some
    // other object
    //
    // however, even if the thread had indeed woken up simply because of the
    // above exchange(), the futexWake() below is not incorrect. It is not
    // incorrect because futexWake() does not actually change the memory of
    // the futex word. It just uses the address to do a lookup in the kernel
    // futex table. And even if we call futexWake() on some other address,
    // and that address was being used to wait on futex() that thread will
    // protect itself from spurious wakeups, check the value in the futex word
    // and enqueue itself back on the futex
    //
    // this dangling pointer possibility is why we use a pointer to the futex
    // word, and avoid dereferencing after the store() operation
    auto sleeper = &waiter->metadata_.sleeper_;
    sleeper->store(kWake, std::memory_order_release);
    futexWake(sleeper, 1);
  }
}

template <typename Waiter>
bool doFutexWait(Waiter* waiter, Waiter*& next) {
  // first we get ready to sleep by calling exchange() on the futex with a
  // kSleeping value
  DCHECK(waiter->futex_.load(std::memory_order_relaxed) == kAboutToWait);

  // note the semantics of using a futex here, when we exchange the sleeper_
  // with kSleeping, we are getting ready to sleep, and we return from
  // futexWait() when the value of sleeper_ might have changed. We can also
  // wake up because of a spurious wakeup, so we always check against the
  // value in sleeper_ after returning from futexWait(), if the value is not
  // kWake, then we continue
  auto pre =
      waiter->metadata_.sleeper_.exchange(kSleeping, std::memory_order_acq_rel);

  // Seeing a kSleeping on a futex word before we set it ourselves means only
  // one thing - an unlocking thread caught us before we went to futex(), and
  // we now have the lock, so we abort
  //
  // if we were given an early delivery, we can return from this function with
  // a true, meaning that we now have the lock
  if (pre == kSleeping) {
    return true;
  }

  // if we reach here then we were not given an early delivery, and any
  // thread that goes to wake us up will see a consistent view of the rest of
  // the contention chain (since the next_ variable is set before the
  // kSleeping exchange above)
  while (pre != kWake) {
    // before enqueueing on the futex, we wake any waiters that we were
    // possibly responsible for
    doFutexWake(std::exchange(next, nullptr));

    // then we wait on the futex
    //
    // note that we have to protect ourselves against spurious wakeups here,
    // because the corresponding futexWake() above does not synchronize
    // wakeups around the futex word, as doing so would become inefficient
    futexWait(&waiter->metadata_.sleeper_, kSleeping);
    pre = waiter->metadata_.sleeper_.load(std::memory_order_acquire);
    DCHECK((pre == kSleeping) || (pre == kWake));
  }

  // when coming out of a futex, we might have some other sleeping threads
  // that we were supposed to wake up, assign that to the next pointer
  DCHECK(next == nullptr);
  next = extractPtr<Waiter>(waiter->next_.load(std::memory_order_relaxed));
  return false;
}

template <typename Waiter>
bool wait(Waiter* waiter, std::uint32_t mode, Waiter*& next, uint32_t& signal) {
  if (mode == kAboutToWait) {
    return doFutexWait(waiter, next);
  }

  return spin(*waiter, signal, mode);
}

inline void recordTimedWaiterAndClearTimedBit(
    bool& timedWaiter,
    std::uintptr_t& previous) {
  // the previous value in the mutex can never be kTimedWaiter, timed waiters
  // always set (kTimedWaiter | kLocked) in the mutex word when they try and
  // acquire the mutex
  DCHECK(previous != kTimedWaiter);

  if (UNLIKELY(previous & kTimedWaiter)) {
    // record whether there was a timed waiter in the previous mutex state, and
    // clear the timed bit from the previous state
    timedWaiter = true;
    previous = previous & (~kTimedWaiter);
  }
}

template <typename Atomic>
void wakeTimedWaiters(Atomic* state, bool timedWaiters) {
  if (UNLIKELY(timedWaiters)) {
    atomic_notify_one(state);
  }
}

template <template <typename> class Atomic, bool TimePublishing>
template <typename Func>
auto DistributedMutex<Atomic, TimePublishing>::lock_combine(Func func)
    -> folly::invoke_result_t<const Func&> {
  // invoke the lock implementation function and check whether we came out of
  // it with our task executed as a combined critical section. This usually
  // happens when the mutex is contended.
  //
  // In the absence of contention, we just return from the try_lock() function
  // with the lock acquired. So we need to invoke the task and unlock
  // the mutex before returning
  auto&& task = Request<Func>{func};
  auto&& state = lockImplementation(*this, state_, task);
  if (!state.combined_) {
    // to avoid having to play a return-value dance when the combinable
    // returns void, we use a scope exit to perform the unlock after the
    // function return has been processed
    SCOPE_EXIT {
      unlock(std::move(state));
    };
    return func();
  }

  // if we are here, that means we were able to get our request combined, we
  // can return the value that was transferred to us
  //
  // each thread that enqueues as a part of a contention chain takes up the
  // responsibility of any timed waiter that had come immediately before it,
  // so we wake up timed waiters before exiting the lock function. Another
  // strategy might be to add the timed waiter information to the metadata and
  // let a single leader wake up a timed waiter for better concurrency. But
  // this has proven not to be useful in benchmarks beyond a small 5% delta,
  // so we avoid taking the complexity hit and branch to wake up timed waiters
  // from each thread
  wakeTimedWaiters(&state_, state.timedWaiters_);
  return std::move(task).get();
}
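
// For illustration only, assuming the folly::DistributedMutex alias exposed
// by the public header: the callable passed to lock_combine() runs exactly
// once, either inline under the lock or remotely in the lock holder, and its
// return value (or exception) is transferred back to the caller
//
//   folly::DistributedMutex mutex;
//   auto counter = std::uint64_t{0};
//   auto snapshot = mutex.lock_combine([&] {
//     counter += 1;
//     return counter; // shipped back through the waiter state
//   });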

template <template <typename> class Atomic, bool TimePublishing>
typename DistributedMutex<Atomic, TimePublishing>::DistributedMutexStateProxy
DistributedMutex<Atomic, TimePublishing>::lock() {
  auto null = nullptr;
  return lockImplementation(*this, state_, null);
}

template <template <typename> class Atomic, bool TimePublishing>
template <typename Rep, typename Period, typename Func>
folly::Optional<invoke_result_t<Func&>>
DistributedMutex<Atomic, TimePublishing>::try_lock_combine_for(
    const std::chrono::duration<Rep, Period>& duration,
    Func func) {
  auto state = try_lock_for(duration);
  if (state) {
    SCOPE_EXIT {
      unlock(std::move(state));
    };
    return func();
  }

  return folly::none;
}

template <template <typename> class Atomic, bool TimePublishing>
template <typename Clock, typename Duration, typename Func>
folly::Optional<invoke_result_t<Func&>>
DistributedMutex<Atomic, TimePublishing>::try_lock_combine_until(
    const std::chrono::time_point<Clock, Duration>& deadline,
    Func func) {
  auto state = try_lock_until(deadline);
  if (state) {
    SCOPE_EXIT {
      unlock(std::move(state));
    };
    return func();
  }

  return folly::none;
}
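
// For illustration only, assuming the folly::DistributedMutex alias exposed
// by the public header: the timed combine variants return folly::none when
// the lock cannot be acquired in time, and otherwise wrap the critical
// section's result in a folly::Optional
//
//   folly::DistributedMutex mutex;
//   auto counter = std::uint64_t{0};
//   auto result = mutex.try_lock_combine_for(
//       std::chrono::milliseconds{10}, [&] { return counter; });
//   if (result) {
//     // *result holds the value returned by the critical section
//   }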

template <typename Atomic, template <typename> class A, bool T>
auto tryLockNoLoad(Atomic& atomic, DistributedMutex<A, T>&) {
  // Try and set the least significant bit of the centralized lock state to 1,
  // if this succeeds, it must have been the case that we had a kUnlocked (or
  // 0) in the central storage before, since that is the only case where a 0
  // can be found in the least significant bit
  //
  // If this fails, then it is a no-op
  using Proxy = typename DistributedMutex<A, T>::DistributedMutexStateProxy;
  auto previous = atomic_fetch_set(atomic, 0, std::memory_order_acquire);
  if (!previous) {
    return Proxy{nullptr, kLocked};
  }

  return Proxy{nullptr, 0};
}

template <template <typename> class Atomic, bool TimePublishing>
typename DistributedMutex<Atomic, TimePublishing>::DistributedMutexStateProxy
DistributedMutex<Atomic, TimePublishing>::try_lock() {
  // The lock attempt below requires an expensive atomic fetch-and-mutate or
  // an even more expensive atomic compare-and-swap loop depending on the
  // platform. These operations require pulling the lock cacheline into the
  // current core in exclusive mode and are therefore hard to parallelize
  //
  // This probabilistically avoids the expense by first checking whether the
  // mutex is currently locked
  if (state_.load(std::memory_order_relaxed) != kUnlocked) {
    return DistributedMutexStateProxy{nullptr, 0};
  }

  return tryLockNoLoad(state_, *this);
}
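
// For illustration only, assuming the folly::DistributedMutex alias exposed
// by the public header: try_lock() never blocks, so the returned proxy must
// be tested before use
//
//   folly::DistributedMutex mutex;
//   if (auto proxy = mutex.try_lock()) {
//     // ... critical section ...
//     mutex.unlock(std::move(proxy));
//   }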

template <
    template <typename> class Atomic,
    bool TimePublishing,
    typename State,
    typename Request>
typename DistributedMutex<Atomic, TimePublishing>::DistributedMutexStateProxy
lockImplementation(
    DistributedMutex<Atomic, TimePublishing>& mutex,
    State& atomic,
    Request& request) {
  // first try and acquire the lock as a fast path, the underlying
  // implementation is slightly faster than the std::atomic::exchange() used
  // later in this function, so we get a small perf boost in the uncontended
  // case
  //
  // We only go through this fast path for the lock/unlock usage and avoid this
  // for combined critical sections. This check adds unnecessary overhead in
  // that case as it causes an extra cacheline bounce
  constexpr auto combineRequested = !std::is_same<Request, std::nullptr_t>{};
  if (!combineRequested) {
    if (auto state = tryLockNoLoad(atomic, mutex)) {
      return state;
    }
  }

  auto previous = std::uintptr_t{0};
  auto waitMode = combineRequested ? kCombineUninitialized : kUninitialized;
  auto nextWaitMode = kAboutToWait;
  auto timedWaiter = false;
  Waiter<Atomic>* nextSleeper = nullptr;
  while (true) {
    // construct the state needed to wait
    //
    // We can't use auto here because MSVC errors out due to a missing copy
    // constructor
    Waiter<Atomic> state{};
    auto&& task = coalesce(request, state);
    auto&& storage = makeReturnValueStorageFor(task);
    auto&& address = folly::bit_cast<std::uintptr_t>(&state);
    attach(task, storage);
    state.initialize(waitMode, std::move(task));
    DCHECK(!(address & 0b1));

    // set the locked bit in the address we will be persisting in the mutex
    address |= kLocked;

    // attempt to acquire the mutex, mutex acquisition is successful if the
    // previous value is zeroed out
    //
    // we use memory_order_acq_rel here because we want the read-modify-write
    // operation to be both acquire and release. Acquire because if this is a
    // successful lock acquisition, we want to acquire state any other thread
    // has released from a prior unlock. We want release semantics because
    // other threads that read the address of this value should see the full
    // well-initialized node we are going to wait on if the mutex acquisition
    // was unsuccessful
    previous = atomic.exchange(address, std::memory_order_acq_rel);
    recordTimedWaiterAndClearTimedBit(timedWaiter, previous);
    state.next_.store(previous, std::memory_order_relaxed);
    if (previous == kUnlocked) {
      return {/* next */ nullptr,
              /* expected */ address,
              /* timedWaiter */ timedWaiter,
              /* combined */ false,
              /* waker */ 0,
              /* waiters */ nullptr,
              /* ready */ nextSleeper};
    }
    DCHECK(previous & kLocked);

    // wait until we get a signal from another thread, if this returns false,
    // we got skipped and had probably been scheduled out, so try again
    auto signal = kUninitialized;
    if (!wait(&state, waitMode, nextSleeper, signal)) {
      std::swap(waitMode, nextWaitMode);
      continue;
    }

    // at this point it is safe to access the other fields in the waiter state,
    // since the thread that woke us up is gone and nobody will be touching
    // this state again, note that this requires memory ordering, and this is
    // why we use memory_order_acquire (among other reasons) in the above wait
    //
    // first we see if the value we took off the mutex state was the thread
    // that initiated the wakeups, if so, we are the terminal node of the
    // current contention chain. If we are the terminal node, then we should
    // expect to see a kLocked in the mutex state when we unlock, if we see
    // that, we can commit the unlock to the centralized mutex state. If not,
    // we need to continue wakeups
    //
    // a nice consequence of passing kLocked as the current address if we are
    // the terminal node is that it naturally just works with the algorithm. If
    // we get a contention chain when coming out of a contention chain, the
    // tail of the new contention chain will have kLocked set as the previous,
    // which, as it happens, "just works", since we have now established a
    // recursive relationship until broken
    auto next = previous;
    auto expected = address;
    if (previous == state.metadata_.waker_) {
      next = 0;
      expected = kLocked;
    }

    // if we were given a combine signal, detach the return value from the
    // wait struct into the request, so the current thread can access it
    // outside this function
    auto combined = (signal == kCombined);
    auto exceptionOccurred = (signal == kExceptionOccurred);
    if (combined || exceptionOccurred) {
      detach(request, state, exceptionOccurred, storage);
    }

    // if we are just coming out of a futex call, then it means that the next
    // waiter we are responsible for is also a waiter waiting on a futex, so
    // we return that list in the list of ready threads. We will be waking up
    // the ready threads on unlock no matter what
    return {/* next */ extractPtr<Waiter<Atomic>>(next),
            /* expected */ expected,
            /* timedWaiter */ timedWaiter,
            /* combined */ combineRequested && (combined || exceptionOccurred),
            /* waker */ state.metadata_.waker_,
            /* waiters */ extractPtr<Waiter<Atomic>>(state.metadata_.waiters_),
            /* ready */ nextSleeper};
  }
}
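
// A usage sketch for the combined-critical-section path handled above,
// assuming the public lock_combine() API on folly::DistributedMutex; all
// other names here are illustrative.  The lambda may be executed by whichever
// thread currently holds the mutex, and its return value is transported back
// to the requesting thread:
//
//   #include <cstdint>
//   #include <folly/synchronization/DistributedMutex.h>
//
//   folly::DistributedMutex exampleMutex;
//   std::uint64_t exampleValue = 0;
//
//   std::uint64_t fetchAndAdd(std::uint64_t delta) {
//     return exampleMutex.lock_combine([&] {
//       auto previous = exampleValue;
//       exampleValue += delta;
//       return previous;
//     });
//   }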

inline bool preempted(std::uint64_t value, std::chrono::nanoseconds now) {
  auto currentTime = recover(strip(now));
  auto nodeTime = recover(value);
  auto preempted =
      (currentTime > nodeTime + kScheduledAwaySpinThreshold.count()) &&
      (nodeTime != recover(strip(std::chrono::nanoseconds::max())));

  // we say that the thread has been preempted if its timestamp says so, and
  // also if it is neither uninitialized nor skipped
  DCHECK(value != kSkipped);
  return (preempted) && (value != kUninitialized) &&
      (value != kCombineUninitialized);
}
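
// A worked sketch of the heuristic above: a spinning waiter periodically
// publishes a truncated timestamp, and the waker treats it as preempted when
// that timestamp lags "now" by more than kScheduledAwaySpinThreshold.  The
// concrete numbers and the threshold value below are made up for
// illustration, and exampleLooksPreempted is hypothetical:
//
//   // nodeTime = 1'000'000 ns (last value the waiter published)
//   // nowTime  = 1'350'000 ns (what the waker reads as the current time)
//   // nowTime > nodeTime + threshold  =>  skip the waiter instead of handing
//   // the lock to a thread that is probably scheduled out
//   inline bool exampleLooksPreempted(
//       std::uint64_t nodeTimeNs, std::uint64_t nowTimeNs) {
//     constexpr std::uint64_t kExampleThresholdNs = 200 * 1000;  // assumed
//     return nowTimeNs > nodeTimeNs + kExampleThresholdNs;
//   }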

inline bool isSleeper(std::uintptr_t value) {
  return (value == kAboutToWait);
}

inline bool isInitialized(std::uintptr_t value) {
  return (value != kUninitialized) && (value != kCombineUninitialized);
}

inline bool isCombiner(std::uintptr_t value) {
  auto mode = (value & 0xff);
  return (mode == kCombineWaiting) || (mode == kCombineUninitialized);
}

inline bool isWaitingCombiner(std::uintptr_t value) {
  return (value & 0xff) == kCombineWaiting;
}

template <typename Waiter>
CombineFunction loadTask(Waiter* current, std::uintptr_t value) {
  // if we know that the waiter is a combiner of some sort, it is safe to read
  // and copy the value of the function in the waiter struct, since we know
  // that a waiter would have set it before enqueueing
  if (isCombiner(value)) {
    return current->function_;
  }

  return nullptr;
}

template <typename Waiter>
FOLLY_COLD void transferCurrentException(Waiter* waiter) {
  DCHECK(std::current_exception());
  new (&waiter->storage_) std::exception_ptr{std::current_exception()};
  waiter->futex_.store(kExceptionOccurred, std::memory_order_release);
}
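
// A usage sketch for the exception path above, assuming the public
// lock_combine() API; names are illustrative.  When a combined critical
// section throws, the exception is captured with std::current_exception() in
// the combiner and rethrown in the thread that requested the combine:
//
//   #include <stdexcept>
//   #include <folly/synchronization/DistributedMutex.h>
//
//   folly::DistributedMutex exampleMutex;
//
//   void exampleThrowingCombine() {
//     try {
//       exampleMutex.lock_combine([] {
//         throw std::runtime_error("thrown inside the critical section");
//       });
//     } catch (const std::runtime_error&) {
//       // surfaces here, in the requesting thread, even if the lambda
//       // actually ran on the lock holder's thread
//     }
//   }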

template <template <typename> class Atomic>
FOLLY_ALWAYS_INLINE std::uintptr_t tryCombine(
    Waiter<Atomic>* waiter,
    std::uintptr_t value,
    std::uintptr_t next,
    std::uint64_t iteration,
    std::chrono::nanoseconds now,
    CombineFunction task) {
  // if the waiter has asked for a combine operation, we should combine its
  // critical section and move on to the next waiter
  //
  // the waiter is combinable if the following conditions are satisfied
  //
  // 1) the state in the futex word is not uninitialized (kUninitialized)
  // 2) it has a valid combine function
  // 3) we are not past the limit of the number of combines we can perform,
  //    or the waiter thread has been preempted. If the waiter gets
  //    preempted, it's better to just execute its critical section before
  //    moving on, as it will have to re-queue itself after preemption
  //    anyway, leading to further delays in critical section completion
  //
  // if all the above are satisfied, then we can combine the critical section.
  // Note that if the waiter is in a combinable state, that means that it has
  // finished its writes to both the task and the next_ value. And observing
  // a waiting state also means that we have acquired the writes to the other
  // members of the waiter struct, so it's fine to use those values here
  if (isWaitingCombiner(value) &&
      (iteration <= kMaxCombineIterations || preempted(value, now))) {
    try {
      task();
      waiter->futex_.store(kCombined, std::memory_order_release);
    } catch (...) {
      transferCurrentException(waiter);
    }
    return next;
  }

  return 0;
}
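
// A minimal, self-contained sketch of the combining step above; it mirrors
// the futex_-based signalling only loosely and every name in it is
// hypothetical.  The lock holder runs the waiter's stored closure and then
// publishes completion with release semantics, so the waiter's acquire load
// also observes the closure's side effects:
//
//   #include <atomic>
//   #include <cstdint>
//   #include <folly/Function.h>
//
//   struct ExampleWaiter {
//     folly::Function<void()> task;        // set by the waiter before enqueue
//     std::atomic<std::uint32_t> done{0};  // 0 = waiting, 1 = combined
//   };
//
//   void exampleCombine(ExampleWaiter& waiter) {
//     waiter.task();                                    // run on the holder
//     waiter.done.store(1, std::memory_order_release);  // publish the result
//   }
//
//   void exampleAwaitCombine(ExampleWaiter& waiter) {
//     while (waiter.done.load(std::memory_order_acquire) == 0) {
//     }  // spin until the holder has executed our critical section
//   }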

template <typename Waiter>
FOLLY_ALWAYS_INLINE std::uintptr_t tryWake(
    bool publishing,
    Waiter* waiter,
    std::uintptr_t value,
    std::uintptr_t next,
    std::uintptr_t waker,
    Waiter*& sleepers,
    std::uint64_t iteration,
    CombineFunction task) {
  // try and combine the waiter's request first, if that succeeds it means
  // we have successfully executed their critical section and can move on to
  // the rest of the chain
  auto now = time();
  if (tryCombine(waiter, value, next, iteration, now, task)) {
    return next;
  }

  // first we see if we can wake the current thread that is spinning
  if ((!publishing || !preempted(value, now)) && !isSleeper(value)) {
    // the Metadata class should be trivially destructible as we use placement
    // new to set the relevant metadata without calling any destructor. We
    // need to use placement new because the class contains a futex, which is
    // non-movable and non-copyable
    using Metadata = std::decay_t<decltype(waiter->metadata_)>;
    static_assert(std::is_trivially_destructible<Metadata>{}, "");

    // we need release here because of the write to waker_ and also because we
    // are unlocking the mutex, the thread we do the handoff to here should
    // see the modified data
    new (&waiter->metadata_) Metadata{waker, bit_cast<uintptr_t>(sleepers)};
    waiter->futex_.store(kWake, std::memory_order_release);
    return 0;
  }

  // if the thread is not a sleeper, and we were not able to catch it before
  // preemption, we can just skip it. It is safe to read next_ because the
  // thread was preempted. Preemption signals can only come after the thread
  // has set the next_ pointer, since the timestamp writes only start
  // occurring after that point
  //
  // if a thread was preempted it must have stored next_ in the waiter struct,
  // as the store to futex_ that resets the value from kUninitialized happens
  // after the write to next_
  CHECK(publishing);
  if (!isSleeper(value)) {
    // go on to the next one
    //
    // Also, we need a memory_order_release here to prevent missed wakeups. A
    // missed wakeup here can happen when we see that a thread had been
    // preempted and skip it. Then go on to release the lock, and then when
    // the thread which got skipped does an exchange on the central storage,
    // still sees the locked bit, and never gets woken up
    //
    // Can we relax this?
    DCHECK(preempted(value, now));
    DCHECK(!isCombiner(value));
    next = waiter->next_.load(std::memory_order_relaxed);
    waiter->futex_.store(kSkipped, std::memory_order_release);
    return next;
  }

  // if we are here the thread is a sleeper
  //
  // we attempt to catch the thread before it goes to futex(). If we are able
  // to catch the thread before it sleeps on a futex, we are done, and don't
  // need to go any further
  //
  // if we are not able to catch the thread before it goes to futex, we
  // collect the current thread in the list of sleeping threads represented by
  // sleepers, and return the next thread in the list along with the previous
  // next value
  //
  // it is safe to read the next_ pointer in the waiter struct if we were
  // unable to catch the thread before it went to futex() because we use
  // acquire-release ordering for the exchange operation below. And if we see
  // that the thread was already sleeping, we have synchronized with the write
  // to next_ in the context of the sleeping thread
  //
  // Also we need to set the value of waiters_ and waker_ in the thread before
  // doing the exchange because we need to pass on the list of sleepers in the
  // event that we were able to catch the thread before it went to futex().
  // If we were unable to catch the thread before it slept, these fields will
  // be ignored when the thread wakes up anyway
  DCHECK(isSleeper(value));
  waiter->metadata_.waker_ = waker;
  waiter->metadata_.waiters_ = folly::bit_cast<std::uintptr_t>(sleepers);
  auto pre =
      waiter->metadata_.sleeper_.exchange(kSleeping, std::memory_order_acq_rel);

  // if we were able to catch the thread before it went to sleep, we are done
  if (pre != kSleeping) {
    return 0;
  }

  // otherwise return the value of next_; it is safe to read next_ because of
  // the same logic as when a thread was preempted
  //
  // we also need to collect this sleeper in the list of sleepers being built
  // up
  next = waiter->next_.load(std::memory_order_relaxed);
  auto head = folly::bit_cast<std::uintptr_t>(sleepers);
  waiter->next_.store(head, std::memory_order_relaxed);
  sleepers = waiter;
  return next;
}
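
// A simplified sketch of the "catch the sleeper" handshake above; names and
// constants are hypothetical and the real code layers this onto the
// sleeper_/futex_ words.  Both sides exchange() the same sentinel into a
// shared word; whoever observes the sentinel as the previous value knows the
// other side got there first, which lets the waker skip a futex wakeup when
// it wins the race:
//
//   #include <atomic>
//   #include <cstdint>
//
//   constexpr std::uint32_t kExampleIdle = 0;
//   constexpr std::uint32_t kExampleSleeping = 1;
//
//   // waker side: true means we caught the waiter before it committed to
//   // sleeping, so no syscall is needed for it
//   inline bool exampleWakerCaughtWaiter(std::atomic<std::uint32_t>& word) {
//     return word.exchange(kExampleSleeping, std::memory_order_acq_rel) !=
//         kExampleSleeping;
//   }
//
//   // waiter side: true means the waker has not signalled yet, so we really
//   // do have to block; false means a wakeup is already owed to us
//   inline bool exampleWaiterMustSleep(std::atomic<std::uint32_t>& word) {
//     return word.exchange(kExampleSleeping, std::memory_order_acq_rel) !=
//         kExampleSleeping;
//   }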

template <typename Waiter>
bool wake(
    bool publishing,
    Waiter& waiter,
    std::uintptr_t waker,
    Waiter*& sleepers,
    std::uint64_t iter) {
  // loop till we find a node that is either at the end of the list (as
  // specified by waker) or we find a node that is active (as specified by
  // the last published timestamp of the node)
  auto current = &waiter;
  while (current) {
    // it is important that we load the value of function_ and next_ after the
    // initial acquire load. This is required because we need to synchronize
    // with the construction of the waiter struct before reading from it
    //
    // the load from the next_ variable is an optimistic load that assumes
    // that the waiting thread has probably gone to the waiting state. If the
    // waiting thread is in the waiting state (as revealed by the acquire load
    // from the futex word), we will see a well formed next_ value because it
    // happens-before the release store to the futex word. The atomic load from
    // next_ is an optimization to avoid branching before loading and prevent
    // the compiler from eliding the load altogether (and using a pointer
    // dereference when needed)
    auto value = current->futex_.load(std::memory_order_acquire);
    auto next = current->next_.load(std::memory_order_relaxed);
    auto task = loadTask(current, value);
    next =
        tryWake(publishing, current, value, next, waker, sleepers, iter, task);

    // if there is no next node, we have managed to wake someone up and have
    // successfully migrated the lock to another thread
    if (!next) {
      return true;
    }

    // we need to read the value of the next node in the list before skipping
    // it, this is because after we skip it the node might wake up and enqueue
    // itself, and thereby gain a new next node
    CHECK(publishing);
    current = (next == waker) ? nullptr : extractPtr<Waiter>(next);
  }

  return false;
}
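
// A small sketch of the pointer packing used throughout this chain: waiter
// nodes are suitably aligned, so bit 0 of their address can carry the kLocked
// flag and the original pointer is recovered by masking (extractPtr plays
// that role above, with its own mask).  The helper names here are
// hypothetical:
//
//   #include <cstdint>
//
//   template <typename Node>
//   std::uintptr_t examplePack(Node* node, std::uintptr_t lockedBit) {
//     return reinterpret_cast<std::uintptr_t>(node) | lockedBit;  // tag bit 0
//   }
//
//   template <typename Node>
//   Node* exampleUnpack(std::uintptr_t value) {
//     return reinterpret_cast<Node*>(value & ~std::uintptr_t{0b1});  // untag
//   }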

template <typename Atomic, typename Proxy, typename Sleepers>
bool tryUnlockClean(Atomic& state, Proxy& proxy, Sleepers sleepers) {
  auto expected = proxy.expected_;
  while (true) {
    if (state.compare_exchange_strong(
            expected,
            kUnlocked,
            std::memory_order_release,
            std::memory_order_relaxed)) {
      // if we were able to commit an unlock, we need to wake up the futex
      // waiters, if any
      doFutexWake(sleepers);
      return true;
    }

    // if we failed the compare_exchange_strong() above, we check to see if
    // the failure was because of the presence of a timed waiter. If that
    // was the case then we try one more time with the kTimedWaiter bit set
    if (UNLIKELY(expected == (proxy.expected_ | kTimedWaiter))) {
      proxy.timedWaiters_ = true;
      continue;
    }

    // otherwise break, we have a contention chain
    return false;
  }
}
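
// A note in code form on the retry above: on failure,
// compare_exchange_strong() writes the observed value back into `expected`,
// which is what lets the loop detect that the only difference was an extra
// flag bit.  A generic illustration with assumed names:
//
//   #include <atomic>
//   #include <cstdint>
//
//   // returns true if `word` was released from `from` to 0; if the only
//   // difference was an extra `flag` bit, retries once with it included
//   inline bool exampleReleaseWord(
//       std::atomic<std::uintptr_t>& word,
//       std::uintptr_t from,
//       std::uintptr_t flag) {
//     auto expected = from;
//     if (word.compare_exchange_strong(expected, 0)) {
//       return true;
//     }
//     if (expected == (from | flag)) {
//       return word.compare_exchange_strong(expected, 0);
//     }
//     return false;
//   }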

template <template <typename> class Atomic, bool Publish>
void DistributedMutex<Atomic, Publish>::unlock(
    DistributedMutex::DistributedMutexStateProxy proxy) {
  // we always wake up ready threads and timed waiters if we saw either
  DCHECK(proxy) << "Invalid proxy passed to DistributedMutex::unlock()";
  DCHECK(!proxy.combined_) << "Cannot unlock mutex after a successful combine";
  SCOPE_EXIT {
    doFutexWake(proxy.ready_);
    wakeTimedWaiters(&state_, proxy.timedWaiters_);
  };

  // if there is a wait queue we are responsible for, try and start wakeups,
  // don't bother with the mutex state
  auto sleepers = proxy.waiters_;
  if (proxy.next_) {
    if (wake(Publish, *proxy.next_, proxy.waker_, sleepers, 0)) {
      return;
    }

    // At this point, if we are still inside the if statement, we were not the
    // terminal node of the wakeup chain. Terminal nodes have the next_
    // pointer set to null in lock()
    //
    // So we need to pretend we were the end of the contention chain. Coming
    // out of a contention chain always has the kLocked state set in the
    // mutex. Unless there is another contention chain lined up, which does
    // not matter since we are the terminal node anyway
    proxy.expected_ = kLocked;
  }

  for (std::uint64_t i = 0; true; ++i) {
    // otherwise, since we don't have anyone we need to wake up, we try and
    // release the mutex just as is
    //
    // if this succeeds, we can return: the unlock has committed a clean
    // kUnlocked to the central storage
    if (tryUnlockClean(state_, proxy, sleepers)) {
      return;
    }

    // here we have a contention chain built up on the mutex. We grab the
    // wait queue and start executing wakeups. We leave a locked bit on the
    // centralized storage and hand off control to the head of the queue
    //
    // we use memory_order_acq_rel here because we want to see the
    // full well-initialized node that the other thread is waiting on
    //
    // If we are unable to wake the contention chain, it is possible that when
    // we come back to looping here, a new contention chain will form. In
    // that case we need to use kLocked as the waker_ value because the
    // terminal node of the new chain will see kLocked in the central storage
    auto head = state_.exchange(kLocked, std::memory_order_acq_rel);
    recordTimedWaiterAndClearTimedBit(proxy.timedWaiters_, head);
    auto next = extractPtr<Waiter<Atomic>>(head);
    auto expected = std::exchange(proxy.expected_, kLocked);
    DCHECK((head & kLocked) && (head != kLocked)) << "incorrect state " << head;
    if (wake(Publish, *next, expected, sleepers, i)) {
      break;
    }
  }
}
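
// A usage sketch for the blocking interface whose unlock path is implemented
// above, assuming the public folly::DistributedMutex alias; names are
// illustrative.  lock() returns a proxy token that must be handed back to
// unlock():
//
//   #include <utility>
//   #include <folly/synchronization/DistributedMutex.h>
//
//   folly::DistributedMutex exampleMutex;
//   int exampleState = 0;
//
//   void exampleUpdate(int value) {
//     auto token = exampleMutex.lock();  // returns a state proxy
//     exampleState = value;              // critical section
//     exampleMutex.unlock(std::move(token));
//   }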

template <typename Atomic, typename Deadline, typename MakeProxy>
auto timedLock(Atomic& state, Deadline deadline, MakeProxy proxy) {
  while (true) {
    // we put a bit on the central state to show that there is a timed waiter
    // and go to sleep on the central state
    //
    // when this thread goes to unlock the mutex, it will expect a 0b1 in the
    // mutex state (0b1, not 0b11), but then it will see that the value in the
    // mutex state is 0b11 and not 0b1, meaning that there might have been
    // another timed waiter, even though there might not actually have been
    // another timed waiter in the interim. This sort of missed wakeup is
    // desirable for timed waiters; it helps avoid thundering herds of timed
    // waiters. Because the mutex is packed in 8 bytes, and we need an
    // address to be stored in those 8 bytes, we don't have much room to play
    // with. The only other solution is to issue a futexWake(INT_MAX) to wake
    // up all waiters when a clean unlock is committed, when a thread saw a
    // timed waiter in the mutex previously
    //
    // putting a 0b11 here works for a set of reasons that is a superset of
    // the set of reasons that make it okay to put a kLocked (0b1) in the
    // mutex state. Now that the thread has put (kTimedWaiter | kLocked)
    // (0b11) in the mutex state and it expects a kLocked (0b1), there are two
    // possible scenarios. The first is when there is no contention chain
    // formation in the mutex from the time a timed waiter got the lock to
    // unlock. In this case, the unlocker sees a 0b11 in the mutex state,
    // adjusts to the presence of a timed waiter and cleanly unlocks with a
    // kUnlocked (0b0). The second is when there is a contention chain.
    // When a thread puts its address in the mutex and sees the timed bit, it
    // records the presence of a timed waiter, and then pretends as if it
    // hadn't seen the timed bit. So future contention chain releases will
    // terminate with a kLocked (0b1) and not a (kLocked | kTimedWaiter)
    // (0b11). This just works naturally with the rest of the algorithm
    // without incurring a perf hit for the regular non-timed case
    //
    // this strategy does, however, mean that when threads try to acquire the
    // mutex and all time out, there will be a wasteful syscall to issue
    // wakeups to waiting threads. We don't do anything to try and minimize
    // this
    //
    // we need to use a fetch_or() here because we need to convey two bits of
    // information - 1, whether the mutex is locked or not, and 2, whether
    // there is a timed waiter. The alternative here is to use the second bit
    // to convey information only; we could use a fetch_set() on the second
    // bit to make this faster, but that comes at the expense of the regular
    // fast path lock attempts, which use a single bit read-modify-write for
    // better performance
    auto data = kTimedWaiter | kLocked;
    auto previous = state.fetch_or(data, std::memory_order_acquire);
    if (!(previous & 0b1)) {
      DCHECK(!previous);
      return proxy(nullptr, kLocked, true);
    }

    // wait on the futex until signalled, if we get a timeout, the try_lock
    // fails
    auto result = atomic_wait_until(&state, previous | data, deadline);
    if (result == std::cv_status::timeout) {
      return proxy(nullptr, std::uintptr_t{0}, false);
    }
  }
}

template <template <typename> class Atomic, bool TimePublishing>
template <typename Clock, typename Duration>
typename DistributedMutex<Atomic, TimePublishing>::DistributedMutexStateProxy
DistributedMutex<Atomic, TimePublishing>::try_lock_until(
    const std::chrono::time_point<Clock, Duration>& deadline) {
  // fast path for the uncontended case
  //
  // we get the time after trying to acquire the mutex because in the
  // uncontended case, the price of getting the time is about 1/3 of the
  // actual mutex acquisition. So we only pay the price of that extra bit of
  // latency when needed
  //
  // this cost is even higher when the VDSO is involved on architectures that
  // do not offer a direct interface to the timestamp counter
  if (auto state = try_lock()) {
    return state;
  }

  // fall back to the timed locking algorithm
  using Proxy = DistributedMutexStateProxy;
  return timedLock(state_, deadline, [](auto... as) { return Proxy{as...}; });
}

template <template <typename> class Atomic, bool TimePublishing>
template <typename Rep, typename Period>
typename DistributedMutex<Atomic, TimePublishing>::DistributedMutexStateProxy
DistributedMutex<Atomic, TimePublishing>::try_lock_for(
    const std::chrono::duration<Rep, Period>& duration) {
  // fast path for the uncontended case. Reasoning for doing this here is the
  // same as in try_lock_until()
  if (auto state = try_lock()) {
    return state;
  }

  // fall back to the timed locking algorithm
  using Proxy = DistributedMutexStateProxy;
  auto deadline = std::chrono::steady_clock::now() + duration;
  return timedLock(state_, deadline, [](auto... as) { return Proxy{as...}; });
}
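
// A usage sketch for the timed interface above, assuming the public
// folly::DistributedMutex alias; names and the timeout are illustrative.  A
// timed-out attempt yields a falsy proxy that must not be passed to unlock():
//
//   #include <chrono>
//   #include <utility>
//   #include <folly/synchronization/DistributedMutex.h>
//
//   folly::DistributedMutex exampleMutex;
//
//   bool exampleTimedUpdate() {
//     auto proxy = exampleMutex.try_lock_for(std::chrono::milliseconds{10});
//     if (proxy) {
//       // critical section
//       exampleMutex.unlock(std::move(proxy));
//       return true;
//     }
//     return false;  // gave up after roughly 10ms without the mutex
//   }
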
} // namespace distributed_mutex
} // namespace detail
} // namespace folly