/*
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <algorithm>
#include <atomic>
#include <cassert>
#include <chrono>
#include <cstdint>
#include <cstring>
#include <memory>
#include <system_error>
#include <thread>

#include <folly/CPortability.h>
#include <folly/IndexedMemPool.h>
#include <folly/Likely.h>
#include <folly/Portability.h>
#include <folly/Traits.h>
#include <folly/Utility.h>
#include <folly/detail/StaticSingletonManager.h>
#include <folly/lang/Aligned.h>
#include <folly/lang/SafeAssert.h>
#include <folly/synchronization/AtomicStruct.h>
#include <folly/synchronization/SaturatingSemaphore.h>

namespace folly {

template <
    template <typename> class Atom = std::atomic,
    class BatonType = SaturatingSemaphore<true, Atom>>
struct LifoSemImpl;

/// LifoSem is a semaphore that wakes its waiters in a manner intended to
/// maximize performance rather than fairness. It should be preferred
/// to a mutex+condvar or POSIX sem_t solution when all of the waiters
/// are equivalent. It is faster than a condvar or sem_t, and it has a
/// shutdown state that might save you a lot of complexity when it comes
/// time to shut down your work pipelines. LifoSem is larger than sem_t,
/// but that is only because it uses padding and alignment to avoid
/// false sharing.
///
/// LifoSem allows multi-post and multi-tryWait, and provides a shutdown
/// state that awakens all waiters. LifoSem is faster than sem_t because
/// it performs exact wakeups, so it often requires fewer system calls.
/// It provides all of the functionality of sem_t, including timed waiting
/// (see the caveat on the timed wait variants below).
/// It is called LifoSem because its wakeup policy is approximately LIFO,
/// rather than the usual FIFO.
///
/// The core semaphore operations provided are:
///
/// -- post() -- if there is a pending waiter, wake it up, otherwise
/// increment the value of the semaphore. If the value of the semaphore
/// is already 2^32-1, does nothing. Compare to sem_post().
///
/// -- post(n) -- equivalent to n calls to post(), but much more efficient.
/// sem_t has no equivalent to this method.
///
/// -- bool tryWait() -- if the semaphore's value is positive, decrements it
/// and returns true, otherwise returns false. Compare to sem_trywait().
///
/// -- uint32_t tryWait(uint32_t n) -- attempts to decrement the semaphore's
/// value by n, returning the amount by which it actually was decremented
/// (a value from 0 to n inclusive). Not atomic. Equivalent to n calls
/// to tryWait(). sem_t has no equivalent to this method.
///
/// -- wait() -- waits until tryWait() can succeed. Compare to sem_wait().
///
/// -- timed wait variants -- will wait until timeout. Note when these
/// timeout, the current implementation takes a lock, blocking
/// concurrent pushes and pops. (If timed wait calls are
/// substantial, consider re-working this code to be lock-free).
///
/// LifoSem also has the notion of a shutdown state, in which any calls
/// that would block (or are already blocked) throw ShutdownSemError.
/// Note the difference between a call to wait() and a call to wait()
/// that might block. In the former case tryWait() would succeed, and no
/// isShutdown() check is performed. In the latter case an exception is
/// thrown. This behavior allows a LifoSem controlling work distribution
/// to drain. If you want to immediately stop all waiting on shutdown,
/// you can just check isShutdown() yourself (preferably wrapped in
/// an UNLIKELY). This fast-stop behavior is easy to add, but difficult
/// to remove if you want the draining behavior, which is why we have
/// chosen the former.
///
/// All LifoSem operations except valueGuess() are guaranteed to be
/// linearizable.
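///
/// A minimal usage sketch (illustrative only; `queue`, `makeWork()`, and
/// `process()` are hypothetical helpers, not part of this header):
///
///     folly::LifoSem sem;
///
///     // producer
///     queue.enqueue(makeWork());
///     sem.post();
///
///     // consumer loop; after shutdown() the remaining posts drain
///     // normally, then the next blocking wait() throws
///     try {
///       while (true) {
///         sem.wait();
///         process(queue.dequeue());
///       }
///     } catch (const folly::ShutdownSemError&) {
///       // semaphore was shut down and fully drained
///     }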
typedef LifoSemImpl<> LifoSem;

/// The exception thrown when wait()ing on an isShutdown() LifoSem
class FOLLY_EXPORT ShutdownSemError : public std::runtime_error {
 public:
  using std::runtime_error::runtime_error;
};

namespace detail {

// Internally, a LifoSem is either a value or a linked list of wait nodes.
// This union is captured in the LifoSemHead type, which holds either a
// value or an indexed pointer to the list. LifoSemHead itself is a value
// type, the head is a mutable atomic box containing a LifoSemHead value.
// Each wait node corresponds to exactly one waiter. Values can flow
// through the semaphore either by going into and out of the head's value,
// or by direct communication from a poster to a waiter. The former path
// is taken when there are no pending waiters, the latter otherwise. The
// general flow of a post is to try to increment the value or pop-and-post
// a wait node. Either of those has the effect of conveying one semaphore
// unit. Waiting is the opposite, either a decrement of the value or
// push-and-wait of a wait node. The generic LifoSemBase abstracts the
// actual mechanism by which a wait node's post->wait communication is
// performed, which is why we have LifoSemRawNode and LifoSemNode.

/// LifoSemRawNode is the actual pooled storage that backs LifoSemNode
/// for user-specified Handoff types. This is done so that we can have
/// a large static IndexedMemPool of nodes, instead of per-type pools
template <template <typename> class Atom>
struct LifoSemRawNode {
  aligned_storage_for_t<void*> raw;

  /// The IndexedMemPool index of the next node in this chain, or 0
  /// if none. This will be set to uint32_t(-1) if the node is being
  /// posted due to a shutdown-induced wakeup
  Atom<uint32_t> next{0};

  bool isShutdownNotice() const {
    return next.load(std::memory_order_relaxed) == uint32_t(-1);
  }
  void clearShutdownNotice() { next.store(0, std::memory_order_relaxed); }
  void setShutdownNotice() {
    next.store(uint32_t(-1), std::memory_order_relaxed);
  }

  typedef folly::IndexedMemPool<
      LifoSemRawNode<Atom>,
      32,
      200,
      Atom,
      IndexedMemPoolTraitsLazyRecycle<LifoSemRawNode<Atom>>>
      Pool;

  /// Storage for all of the waiter nodes for LifoSem-s that use Atom
  static Pool& pool() { return detail::createGlobal<PoolImpl, void>(); }

 private:
  struct PoolImpl : Pool {
    /// Raw node storage is preallocated in a contiguous memory segment,
    /// but we use an anonymous mmap so the physical memory used (RSS) will
    /// only reflect the maximum number of waiters that actually existed
    /// concurrently. For blocked threads the max node count is limited by
    /// the number of threads, so we can conservatively estimate that this
    /// will be < 10k. For LifoEventSem, however, we could potentially have
    /// many more.
    ///
    /// On a 64-bit architecture each LifoSemRawNode takes 16 bytes. We make
    /// the pool 1 million entries.
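    /// (Illustrative arithmetic: 2^20 entries * 16 bytes/entry = 16 MiB of
    /// reserved virtual address space; RSS grows only with actual use.)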
    static constexpr size_t capacity = 1 << 20;

    PoolImpl() : Pool(static_cast<uint32_t>(capacity)) {}
  };
};

/// Handoff is a type not bigger than a void* that knows how to perform a
/// single post() -> wait() communication. It must have a post() method.
/// If it has a wait() method then LifoSemBase's wait() implementation
/// will work out of the box, otherwise you will need to specialize
/// LifoSemBase::wait accordingly.
template <typename Handoff, template <typename> class Atom>
struct LifoSemNode : public LifoSemRawNode<Atom> {
  static_assert(
      sizeof(Handoff) <= sizeof(LifoSemRawNode<Atom>::raw),
      "Handoff too big for small-object optimization, use indirection");
  static_assert(
      alignof(Handoff) <= alignof(decltype(LifoSemRawNode<Atom>::raw)),
      "Handoff alignment constraint not satisfied");

  template <typename... Args>
  void init(Args&&... args) {
    new (&this->raw) Handoff(std::forward<Args>(args)...);
  }

  void destroy() {
    handoff().~Handoff();
    if (kIsDebug) {
      memset(&this->raw, 'F', sizeof(this->raw));
    }
  }

  Handoff& handoff() {
    return *static_cast<Handoff*>(static_cast<void*>(&this->raw));
  }

  const Handoff& handoff() const {
    return *static_cast<const Handoff*>(static_cast<const void*>(&this->raw));
  }
};

template <typename Handoff, template <typename> class Atom>
struct LifoSemNodeRecycler {
  void operator()(LifoSemNode<Handoff, Atom>* elem) const {
    elem->destroy();
    auto idx = LifoSemRawNode<Atom>::pool().locateElem(elem);
    LifoSemRawNode<Atom>::pool().recycleIndex(idx);
  }
};

/// LifoSemHead is a 64-bit struct that holds a 32-bit value, some state
/// bits, and a sequence number used to avoid ABA problems in the lock-free
/// management of the LifoSem's wait lists. The value can either hold
/// an integral semaphore value (if there are no waiters) or a node index
/// (see IndexedMemPool) for the head of a list of wait nodes
class LifoSemHead {
  // What we really want are bitfields:
  //  uint64_t data : 32; uint64_t isNodeIdx : 1; uint64_t seq : 31;
  // Unfortunately g++ generates pretty bad code for this sometimes (I saw
  // -O3 code from gcc 4.7.1 copying the bitfields one at a time instead of
  // in bulk, for example). We can generate better code anyway by assuming
  // that setters won't be given values that cause under/overflow, and
  // putting the sequence at the end where its planned overflow doesn't
  // need any masking.
  //
  // data == 0 (empty list) with isNodeIdx is conceptually the same
  // as data == 0 (no unclaimed increments) with !isNodeIdx, we always
  // convert the former into the latter to make the logic simpler.
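  //
  // Actual layout implied by the shifts and masks below (low bits first):
  //   bits [31:0]  data       semaphore value, or IndexedMemPool node index
  //   bit  32      isNodeIdx  set when data holds a node index
  //   bit  33      isShutdown set once shutdown() has been called
  //   bit  34      isLocked   set while a timed-wait removal holds the head
  //   bits [63:35] seq        sequence number to avoid ABA problems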
  enum {
    IsNodeIdxShift = 32,
    IsShutdownShift = 33,
    IsLockedShift = 34,
    SeqShift = 35,
  };
  enum : uint64_t {
    IsNodeIdxMask = uint64_t(1) << IsNodeIdxShift,
    IsShutdownMask = uint64_t(1) << IsShutdownShift,
    IsLockedMask = uint64_t(1) << IsLockedShift,
    SeqIncr = uint64_t(1) << SeqShift,
    SeqMask = ~(SeqIncr - 1),
  };

 public:
  uint64_t bits;

  //////// getters

  inline uint32_t idx() const {
    assert(isNodeIdx());
    assert(uint32_t(bits) != 0);
    return uint32_t(bits);
  }
  inline uint32_t value() const {
    assert(!isNodeIdx());
    return uint32_t(bits);
  }
  inline constexpr bool isNodeIdx() const {
    return (bits & IsNodeIdxMask) != 0;
  }
  inline constexpr bool isShutdown() const {
    return (bits & IsShutdownMask) != 0;
  }
  inline constexpr bool isLocked() const { return (bits & IsLockedMask) != 0; }
  inline constexpr uint32_t seq() const { return uint32_t(bits >> SeqShift); }

  //////// setter-like things return a new struct

  /// This should only be used for initial construction, not for setting
  /// the value, because it clears the sequence number
  static inline constexpr LifoSemHead fresh(uint32_t value) {
    return LifoSemHead{value};
  }

  /// Returns the LifoSemHead that results from popping a waiter node,
  /// given the current waiter node's next ptr
  inline LifoSemHead withPop(uint32_t idxNext) const {
    assert(!isLocked());
    assert(isNodeIdx());
    if (idxNext == 0) {
      // no isNodeIdx bit or data bits. Wraparound of seq bits is okay
      return LifoSemHead{(bits & (SeqMask | IsShutdownMask)) + SeqIncr};
    } else {
      // preserve sequence bits (incremented with wraparound okay) and
      // isNodeIdx bit, replace all data bits
      return LifoSemHead{
          (bits & (SeqMask | IsShutdownMask | IsNodeIdxMask)) + SeqIncr +
          idxNext};
    }
  }

  /// Returns the LifoSemHead that results from pushing a new waiter node
  inline LifoSemHead withPush(uint32_t _idx) const {
    assert(!isLocked());
    assert(isNodeIdx() || value() == 0);
    assert(!isShutdown());
    assert(_idx != 0);
    return LifoSemHead{(bits & SeqMask) | IsNodeIdxMask | _idx};
  }

  /// Returns the LifoSemHead with value increased by delta, with
  /// saturation if the maximum value is reached
  inline LifoSemHead withValueIncr(uint32_t delta) const {
    assert(!isLocked());
    assert(!isNodeIdx());
    auto rv = LifoSemHead{bits + SeqIncr + delta};
    if (UNLIKELY(rv.isNodeIdx())) {
      // value has overflowed into the isNodeIdx bit
      rv = LifoSemHead{(rv.bits & ~IsNodeIdxMask) | (IsNodeIdxMask - 1)};
    }
    return rv;
  }

  /// Returns the LifoSemHead that results from decrementing the value
  inline LifoSemHead withValueDecr(uint32_t delta) const {
    assert(!isLocked());
    assert(delta > 0 && delta <= value());
    return LifoSemHead{bits + SeqIncr - delta};
  }

  /// Returns the LifoSemHead with the same state as the current node,
  /// but with the shutdown bit set
  inline LifoSemHead withShutdown() const {
    return LifoSemHead{bits | IsShutdownMask};
  }

  // Returns LifoSemHead with lock bit set, but rest of bits unchanged.
  inline LifoSemHead withLock() const {
    assert(!isLocked());
    return LifoSemHead{bits | IsLockedMask};
  }

  // Returns LifoSemHead with lock bit unset, and updated seqno based
  // on idx.
  inline LifoSemHead withoutLock(uint32_t idxNext) const {
    assert(isLocked());
    // We need to treat this as a pop, as we may change the list head.
    return LifoSemHead{bits & ~IsLockedMask}.withPop(idxNext);
  }

  inline constexpr bool operator==(const LifoSemHead& rhs) const {
    return bits == rhs.bits;
  }
  inline constexpr bool operator!=(const LifoSemHead& rhs) const {
    return !(*this == rhs);
  }
};

/// LifoSemBase is the engine for several different types of LIFO
/// semaphore. LifoSemBase handles storage of positive semaphore values
/// and wait nodes, but the actual waiting and notification mechanism is
/// up to the client.
///
/// The Handoff type is responsible for arranging one wakeup notification.
/// See LifoSemNode for more information on how to make your own.
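///
/// A hedged sketch of the interface a custom Handoff needs; the names are
/// taken from the calls LifoSemBase actually makes, but `MyHandoff` itself
/// is hypothetical:
///
///     struct MyHandoff {
///       void post();  // wake the single waiter paired with this node
///       void wait();  // block until the matching post()
///       template <typename Clock, typename Duration>
///       bool try_wait_until(
///           const std::chrono::time_point<Clock, Duration>& deadline);
///     };
///
/// It must also be default-constructible (allocateNode() requires that)
/// and satisfy LifoSemNode's size and alignment static_asserts.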
template <typename Handoff, template <typename> class Atom = std::atomic>
struct LifoSemBase {
  /// Constructor
  constexpr explicit LifoSemBase(uint32_t initialValue = 0)
      : head_(in_place, LifoSemHead::fresh(initialValue)) {}

  LifoSemBase(LifoSemBase const&) = delete;
  LifoSemBase& operator=(LifoSemBase const&) = delete;

  /// Silently saturates if value is already 2^32-1. Returns true iff a
  /// pending waiter was woken (rather than the value being incremented
  /// or saturated)
  bool post() {
    auto idx = incrOrPop(1);
    if (idx != 0) {
      idxToNode(idx).handoff().post();
      return true;
    }
    return false;
  }

  /// Equivalent to n calls to post(), except may be much more efficient.
  /// At any point in time at which the semaphore's value would exceed
  /// 2^32-1 if tracked with infinite precision, it may be silently
  /// truncated to 2^32-1. This saturation is not guaranteed to be exact,
  /// although it is guaranteed that overflow won't result in wrap-around.
  /// There would be a substantial performance and complexity cost in
  /// guaranteeing exact saturation (similar to the cost of maintaining
  /// linearizability near the zero value, but without as much of
  /// a benefit).
  void post(uint32_t n) {
    uint32_t idx;
    while (n > 0 && (idx = incrOrPop(n)) != 0) {
      // pop accounts for only 1
      idxToNode(idx).handoff().post();
      --n;
    }
  }

  /// Returns true iff shutdown() has been called
  bool isShutdown() const {
    return UNLIKELY(head_->load(std::memory_order_acquire).isShutdown());
  }

  /// Prevents blocking on this semaphore, causing all blocking wait()
  /// calls to throw ShutdownSemError. Both currently blocked wait() and
  /// future calls to wait() for which tryWait() would return false will
  /// cause an exception. Calls to wait() for which the matching post()
  /// has already occurred will proceed normally.
  void shutdown() {
    // first set the shutdown bit
    auto h = head_->load(std::memory_order_acquire);
    while (!h.isShutdown()) {
      if (h.isLocked()) {
        std::this_thread::yield();
        h = head_->load(std::memory_order_acquire);
        continue;
      }

      if (head_->compare_exchange_strong(h, h.withShutdown())) {
        // success
        h = h.withShutdown();
        break;
      }
      // compare_exchange_strong rereads h, retry
    }

    // now wake up any waiters
    while (h.isNodeIdx()) {
      if (h.isLocked()) {
        std::this_thread::yield();
        h = head_->load(std::memory_order_acquire);
        continue;
      }
      auto& node = idxToNode(h.idx());
      auto repl = h.withPop(node.next.load(std::memory_order_relaxed));
      if (head_->compare_exchange_strong(h, repl)) {
        // successful pop, wake up the waiter and move on. The next
        // field is used to convey that this wakeup didn't consume a value
        node.setShutdownNotice();
        node.handoff().post();
        h = repl;
      }
    }
  }

  /// Returns true iff value was decremented
  bool tryWait() {
    uint32_t n = 1;
    auto rv = decrOrPush(n, 0);
    assert(
        (rv == WaitResult::DECR && n == 0) ||
        (rv != WaitResult::DECR && n == 1));
    // SHUTDOWN is okay here, since we don't actually wait
    return rv == WaitResult::DECR;
  }
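
  // Illustrative note: the batch tryWait below may partially succeed. For
  // example, with a current value of 3, tryWait(5) returns 3 and leaves
  // the value at 0; the decrements are not performed atomically as a unit.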

  /// Equivalent to (but may be much more efficient than) n calls to
  /// tryWait(). Returns the total amount by which the semaphore's value
  /// was decreased
  uint32_t tryWait(uint32_t n) {
    auto const orig = n;
    while (n > 0) {
#ifndef NDEBUG
      auto prev = n;
#endif
      auto rv = decrOrPush(n, 0);
      assert(
          (rv == WaitResult::DECR && n < prev) ||
          (rv != WaitResult::DECR && n == prev));
      if (rv != WaitResult::DECR) {
        break;
      }
    }
    return orig - n;
  }

  /// Blocks the current thread until there is a matching post or the
  /// semaphore is shut down. Throws ShutdownSemError if the semaphore
  /// has been shut down and this method would otherwise be blocking.
  /// Note that wait() doesn't throw during shutdown if tryWait() would
  /// return true
  void wait() {
    auto const deadline = std::chrono::steady_clock::time_point::max();
    auto res = try_wait_until(deadline);
    FOLLY_SAFE_DCHECK(res, "infinity time has passed");
  }

  bool try_wait() { return tryWait(); }

  template <typename Rep, typename Period>
  bool try_wait_for(const std::chrono::duration<Rep, Period>& timeout) {
    return try_wait_until(timeout + std::chrono::steady_clock::now());
  }

  template <typename Clock, typename Duration>
  bool try_wait_until(
      const std::chrono::time_point<Clock, Duration>& deadline) {
    // early check isn't required for correctness, but is an important
    // perf win if we can avoid allocating and deallocating a node
    if (tryWait()) {
      return true;
    }

    // allocateNode() won't compile unless Handoff has a default
    // constructor
    UniquePtr node = allocateNode();

    auto rv = tryWaitOrPush(*node);
    if (UNLIKELY(rv == WaitResult::SHUTDOWN)) {
      assert(isShutdown());
      throw ShutdownSemError("wait() would block but semaphore is shut down");
    }

    if (rv == WaitResult::PUSH) {
      if (!node->handoff().try_wait_until(deadline)) {
        if (tryRemoveNode(*node)) {
          return false;
        } else {
          // We could not remove our node. Return to waiting.
          //
          // This only happens if we lose a removal race with post(),
          // so we are not likely to wait long. This is only
          // necessary to ensure we don't return node's memory back to
          // IndexedMemPool before post() has had a chance to post to
          // handoff(). In a stronger memory reclamation scheme, such
          // as hazptr or rcu, this would not be necessary.
          node->handoff().wait();
        }
      }
      if (UNLIKELY(node->isShutdownNotice())) {
        // this wait() didn't consume a value, it was triggered by shutdown
        throw ShutdownSemError(
            "blocking wait() interrupted by semaphore shutdown");
      }

      // node->handoff().wait() can't return until after the node has
      // been popped and post()ed, so it is okay for the UniquePtr to
      // recycle the node now
    }
    // else node wasn't pushed, so it is safe to recycle
    return true;
  }
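
  // Illustrative usage of the timed variants (sketch; a false return
  // means the deadline passed without a semaphore value being consumed):
  //
  //   if (!sem.try_wait_for(std::chrono::milliseconds(100))) {
  //     // timed out; no value was consumed
  //   }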

  /// Returns a guess at the current value, designed for debugging.
  /// If there are no concurrent posters or waiters then this will
  /// be correct
  uint32_t valueGuess() const {
    // this is actually linearizable, but we don't promise that because
    // we may want to add striping in the future to help under heavy
    // contention
    auto h = head_->load(std::memory_order_acquire);
    return h.isNodeIdx() ? 0 : h.value();
  }

 protected:
  enum class WaitResult {
    PUSH,
    DECR,
    SHUTDOWN,
  };

  /// The type of a std::unique_ptr that will automatically return a
  /// LifoSemNode to the appropriate IndexedMemPool
  typedef std::unique_ptr<
      LifoSemNode<Handoff, Atom>,
      LifoSemNodeRecycler<Handoff, Atom>>
      UniquePtr;

  /// Returns a node that can be passed to tryWaitOrPush (which forwards
  /// to decrOrPush)
  template <typename... Args>
  UniquePtr allocateNode(Args&&... args) {
    auto idx = LifoSemRawNode<Atom>::pool().allocIndex();
    if (idx != 0) {
      auto& node = idxToNode(idx);
      node.clearShutdownNotice();
      try {
        node.init(std::forward<Args>(args)...);
      } catch (...) {
        LifoSemRawNode<Atom>::pool().recycleIndex(idx);
        throw;
      }
      return UniquePtr(&node);
    } else {
      return UniquePtr();
    }
  }

  /// Returns DECR if the semaphore value was decremented (and waiterNode
  /// was untouched), PUSH if a reference to the wait node was pushed,
  /// or SHUTDOWN if decrement was not possible and push wasn't allowed
  /// because isShutdown(). Ownership of the wait node remains the
  /// responsibility of the caller, who must not release it until after
  /// the node's Handoff has been posted.
  WaitResult tryWaitOrPush(LifoSemNode<Handoff, Atom>& waiterNode) {
    uint32_t n = 1;
    return decrOrPush(n, nodeToIdx(waiterNode));
  }

  // Locks the list head (blocking concurrent pushes and pops)
  // and attempts to remove this node. Returns true if node was
  // found and removed, false if not found.
  bool tryRemoveNode(const LifoSemNode<Handoff, Atom>& removenode) {
    auto removeidx = nodeToIdx(removenode);
    auto head = head_->load(std::memory_order_acquire);
    // Try to lock the head.
    while (true) {
      if (head.isLocked()) {
        std::this_thread::yield();
        head = head_->load(std::memory_order_acquire);
        continue;
      }
      if (!head.isNodeIdx()) {
        return false;
      }
      if (head_->compare_exchange_weak(
              head,
              head.withLock(),
              std::memory_order_acquire,
              std::memory_order_relaxed)) {
        break;
      }
    }
    // Update local var to what head_ is, for better assert() checking.
    head = head.withLock();
    bool result = false;
    auto idx = head.idx();
    if (idx == removeidx) {
      // pop from head. Head seqno is updated.
      head_->store(
          head.withoutLock(removenode.next.load(std::memory_order_relaxed)),
          std::memory_order_release);
      return true;
    }
    auto node = &idxToNode(idx);
    idx = node->next.load(std::memory_order_relaxed);
    while (idx) {
      if (idx == removeidx) {
        // Pop from mid-list.
        node->next.store(
            removenode.next.load(std::memory_order_relaxed),
            std::memory_order_relaxed);
        result = true;
        break;
      }
      node = &idxToNode(idx);
      idx = node->next.load(std::memory_order_relaxed);
    }
    // Unlock and return result
    head_->store(head.withoutLock(head.idx()), std::memory_order_release);
    return result;
  }

 private:
  cacheline_aligned<folly::AtomicStruct<LifoSemHead, Atom>> head_;

  static LifoSemNode<Handoff, Atom>& idxToNode(uint32_t idx) {
    auto raw = &LifoSemRawNode<Atom>::pool()[idx];
    return *static_cast<LifoSemNode<Handoff, Atom>*>(raw);
  }

  static uint32_t nodeToIdx(const LifoSemNode<Handoff, Atom>& node) {
    return LifoSemRawNode<Atom>::pool().locateElem(&node);
  }

  /// Either increments by n and returns 0, or pops a node and returns it.
  /// If n + the stripe's value overflows, then the stripe's value
  /// saturates silently at 2^32-1
  uint32_t incrOrPop(uint32_t n) {
    while (true) {
      assert(n > 0);

      auto head = head_->load(std::memory_order_acquire);
      if (head.isLocked()) {
        std::this_thread::yield();
        continue;
      }
      if (head.isNodeIdx()) {
        auto& node = idxToNode(head.idx());
        if (head_->compare_exchange_strong(
                head,
                head.withPop(node.next.load(std::memory_order_relaxed)))) {
          // successful pop
          return head.idx();
        }
      } else {
        auto after = head.withValueIncr(n);
        if (head_->compare_exchange_strong(head, after)) {
          // successful incr
          return 0;
        }
      }
      // retry
    }
  }
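
  // Note (illustrative): incrOrPop above and decrOrPush below both spin
  // with std::this_thread::yield() while the head's lock bit is set by a
  // concurrent tryRemoveNode(), which is why frequent timed-wait timeouts
  // can briefly block concurrent pushes and pops.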

  /// Returns DECR if some amount was decremented, with that amount
  /// subtracted from n. If n is 1 and this function returns DECR then n
  /// must be 0 afterward. Returns PUSH if no value could be decremented
  /// and idx was pushed, or if idx was zero and no push was performed but
  /// a push would have been performed with a valid node. Returns SHUTDOWN
  /// if the caller should have blocked but isShutdown(). If idx == 0,
  /// may return PUSH even after isShutdown() or may return SHUTDOWN
  WaitResult decrOrPush(uint32_t& n, uint32_t idx) {
    assert(n > 0);

    while (true) {
      auto head = head_->load(std::memory_order_acquire);

      if (head.isLocked()) {
        std::this_thread::yield();
        continue;
      }

      if (!head.isNodeIdx() && head.value() > 0) {
        // decr
        auto delta = std::min(n, head.value());
        if (head_->compare_exchange_strong(head, head.withValueDecr(delta))) {
          n -= delta;
          return WaitResult::DECR;
        }
      } else {
        // push
        if (idx == 0) {
          return WaitResult::PUSH;
        }

        if (UNLIKELY(head.isShutdown())) {
          return WaitResult::SHUTDOWN;
        }

        auto& node = idxToNode(idx);
        node.next.store(
            head.isNodeIdx() ? head.idx() : 0, std::memory_order_relaxed);
        if (head_->compare_exchange_strong(head, head.withPush(idx))) {
          // push succeeded
          return WaitResult::PUSH;
        }
      }
    } // retry
  }
};

} // namespace detail

template <template <typename> class Atom, class BatonType>
struct LifoSemImpl : public detail::LifoSemBase<BatonType, Atom> {
  constexpr explicit LifoSemImpl(uint32_t v = 0)
      : detail::LifoSemBase<BatonType, Atom>(v) {}
};

} // namespace folly
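
// For reference (illustrative sketch): the LifoSem alias above is just the
// default instantiation of LifoSemImpl, equivalent to
//
//   using LifoSem = folly::LifoSemImpl<std::atomic,
//                                      folly::SaturatingSemaphore<true>>;
//
// Other baton types that satisfy the Handoff interface sketched above
// LifoSemBase (and its size/alignment static_asserts) can be supplied the
// same way.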