/*
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <cstddef>
#include <cstdint>
#include <cstring>

#include <array>
#include <iterator>
#include <limits>
#include <memory>
#include <new>
#include <type_traits>
#include <utility>
#include <vector>

#if FOLLY_HAS_STRING_VIEW
#include <string_view> // @manual
#endif

#include <folly/Bits.h>
#include <folly/ConstexprMath.h>
#include <folly/Likely.h>
#include <folly/Portability.h>
#include <folly/ScopeGuard.h>
#include <folly/Traits.h>
#include <folly/functional/Invoke.h>
#include <folly/lang/Align.h>
#include <folly/lang/Assume.h>
#include <folly/lang/Exception.h>
#include <folly/lang/Launder.h>
#include <folly/lang/Pretty.h>
#include <folly/lang/SafeAssert.h>
#include <folly/portability/Builtins.h>

#include <folly/container/HeterogeneousAccess.h>
#include <folly/container/detail/F14Defaults.h>
#include <folly/container/detail/F14IntrinsicsAvailability.h>
#include <folly/container/detail/F14Mask.h>

#if FOLLY_LIBRARY_SANITIZE_ADDRESS && defined(FOLLY_TLS)
#define FOLLY_F14_TLS_IF_ASAN FOLLY_TLS
#else
#define FOLLY_F14_TLS_IF_ASAN
#endif

#if FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE

#if FOLLY_F14_CRC_INTRINSIC_AVAILABLE
#if FOLLY_NEON
#include <arm_acle.h> // __crc32cd
#else
#include <nmmintrin.h> // _mm_crc32_u64
#endif
#else
#ifdef _WIN32
#include <intrin.h> // _mul128 in fallback bit mixer
#endif
#endif

#if FOLLY_NEON
#include <arm_neon.h> // uint8x16_t intrinsics
#else // SSE2
#include <emmintrin.h> // _mm_set1_epi8
#include <immintrin.h> // __m128i intrinsics
#include <xmmintrin.h> // _mm_prefetch
#endif

#endif

#ifndef FOLLY_F14_PERTURB_INSERTION_ORDER
#define FOLLY_F14_PERTURB_INSERTION_ORDER folly::kIsDebug
#endif

namespace folly {

struct F14TableStats {
  char const* policy;
  std::size_t size{0};
  std::size_t valueSize{0};
  std::size_t bucketCount{0};
  std::size_t chunkCount{0};
  std::vector<std::size_t> chunkOccupancyHisto;
  std::vector<std::size_t> chunkOutboundOverflowHisto;
  std::vector<std::size_t> chunkHostedOverflowHisto;
  std::vector<std::size_t> keyProbeLengthHisto;
  std::vector<std::size_t> missProbeLengthHisto;
  std::size_t totalBytes{0};
  std::size_t overheadBytes{0};

 private:
  template <typename T>
  static auto computeHelper(T const* m) -> decltype(m->computeStats()) {
    return m->computeStats();
  }

  static F14TableStats computeHelper(...) {
    return {};
  }

 public:
  template <typename T>
  static F14TableStats compute(T const& m) {
    return computeHelper(&m);
  }
};
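
// Usage sketch (hedged; `m` below stands for any F14 container or other
// object, it is not a name defined in this header).  If the argument has
// a computeStats() member, the SFINAE overload above forwards to it;
// otherwise the variadic fallback returns a value-initialized
// F14TableStats:
//
//   auto stats = folly::F14TableStats::compute(m);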

namespace f14 {
namespace detail {

template <F14IntrinsicsMode>
struct F14LinkCheck {};

template <>
struct F14LinkCheck<getF14IntrinsicsMode()> {
  // The purpose of this method is to trigger a link failure if
  // compilation flags vary across compilation units.  The definition
  // is in F14Table.cpp, so only one of F14LinkCheck<None>::check,
  // F14LinkCheck<Simd>::check, or F14LinkCheck<SimdAndCrc>::check will
  // be available at link time.
  //
  // To cause a link failure the function must be invoked in code that
  // is not optimized away, so we call it on a couple of cold paths
  // (exception handling paths in copy construction and rehash).  LTO may
  // remove it entirely, but that's fine.
  static void check() noexcept;
};

bool tlsPendingSafeInserts(std::ptrdiff_t delta = 0);
std::size_t tlsMinstdRand(std::size_t n);

#if defined(_LIBCPP_VERSION)

template <typename K, typename V, typename H>
struct StdNodeReplica {
  void* next;
  std::size_t hash;
  V value;
};

#else

template <typename H>
struct StdIsFastHash : std::true_type {};
template <>
struct StdIsFastHash<std::hash<long double>> : std::false_type {};
template <typename... Args>
struct StdIsFastHash<std::hash<std::basic_string<Args...>>> : std::false_type {
};
#if FOLLY_HAS_STRING_VIEW
template <typename... Args>
struct StdIsFastHash<std::hash<std::basic_string_view<Args...>>>
    : std::false_type {};
#endif

// Mimics the internal node of STL unordered containers to estimate their size.
template <typename K, typename V, typename H, typename Enable = void>
struct StdNodeReplica {
  void* next;
  V value;
};
template <typename K, typename V, typename H>
struct StdNodeReplica<
    K,
    V,
    H,
    std::enable_if_t<
        !StdIsFastHash<H>::value || !is_nothrow_invocable_v<H, K>>> {
  void* next;
  V value;
  std::size_t hash;
};

#endif

template <class Container, class Predicate>
void erase_if_impl(Container& c, Predicate& predicate) {
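  // The iterator is copied and advanced before the possible erase() call,
  // so erasing *prev never touches the iterator used to continue the loop.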
  for (auto i = c.begin(), last = c.end(); i != last;) {
    auto prev = i++;
    if (predicate(*prev)) {
      c.erase(prev);
    }
  }
}

} // namespace detail
} // namespace f14

#if FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE
namespace f14 {
namespace detail {
template <typename Policy>
class F14Table;
} // namespace detail
} // namespace f14

class F14HashToken final {
 public:
  F14HashToken() = default;

 private:
  using HashPair = std::pair<std::size_t, std::size_t>;

  explicit F14HashToken(HashPair hp) : hp_(hp) {}
  explicit operator HashPair() const {
    return hp_;
  }

  HashPair hp_;

  template <typename Policy>
  friend class f14::detail::F14Table;
};

namespace f14 {
namespace detail {

//// Defaults should be selected using void
template <typename Arg, typename Default>
using VoidDefault =
    std::conditional_t<std::is_same<Arg, Default>::value, void, Arg>;

template <typename Arg, typename Default>
using Defaulted =
    std::conditional_t<std::is_same<Arg, void>::value, Default, Arg>;

template <
    typename TableKey,
    typename Hasher,
    typename KeyEqual,
    typename ArgKey>
struct EligibleForHeterogeneousFind
    : Conjunction<
          is_transparent<Hasher>,
          is_transparent<KeyEqual>,
          is_invocable<Hasher, ArgKey const&>,
          is_invocable<KeyEqual, ArgKey const&, TableKey const&>> {};

template <
    typename TableKey,
    typename Hasher,
    typename KeyEqual,
    typename ArgKey>
using EligibleForHeterogeneousInsert = Conjunction<
    EligibleForHeterogeneousFind<TableKey, Hasher, KeyEqual, ArgKey>,
    std::is_constructible<TableKey, ArgKey>>;

////////////////

template <typename T>
FOLLY_ALWAYS_INLINE static void prefetchAddr(T const* ptr) {
#ifndef _WIN32
  __builtin_prefetch(static_cast<void const*>(ptr));
#elif FOLLY_NEON
  __prefetch(static_cast<void const*>(ptr));
#else
  _mm_prefetch(
      static_cast<char const*>(static_cast<void const*>(ptr)), _MM_HINT_T0);
#endif
}

#if FOLLY_NEON
using TagVector = uint8x16_t;
#else // SSE2
using TagVector = __m128i;
#endif

// We could use unaligned loads to relax this requirement, but that
// would both incur a performance penalty and require a bulkier packed
// ItemIter format.
constexpr std::size_t kRequiredVectorAlignment =
    constexpr_max(std::size_t{16}, alignof(max_align_t));

using EmptyTagVectorType = std::aligned_storage_t<
    sizeof(TagVector) + kRequiredVectorAlignment,
    alignof(max_align_t)>;

extern EmptyTagVectorType kEmptyTagVector;

template <typename ItemType>
struct alignas(kRequiredVectorAlignment) F14Chunk {
  using Item = ItemType;

  // For our 16 byte vector alignment (and assuming alignof(Item) >=
  // 4) kCapacity of 14 is the most space efficient.  Slightly smaller
  // or larger capacities can help with cache alignment in a couple of
  // cases without wasting too much space, but once the items get larger
  // we're unlikely to get much benefit anyway.  The only case we
  // optimize is using kCapacity of 12 for 4 byte items, which makes the
  // chunk take exactly 1 cache line, and adding 16 bytes of padding for
  // 16 byte items so that a chunk takes exactly 4 cache lines.
  static constexpr unsigned kCapacity = sizeof(Item) == 4 ? 12 : 14;

  static constexpr unsigned kDesiredCapacity = kCapacity - 2;

  static constexpr unsigned kAllocatedCapacity =
      kCapacity + (sizeof(Item) == 16 ? 1 : 0);
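
  // Worked example of the resulting chunk sizes, assuming 64-byte cache
  // lines (the 16-byte prefix is tags_ + control_ + outboundOverflowCount_,
  // declared below):
  //   sizeof(Item) == 4:  16 + 12 * 4  == 64 bytes  (1 cache line)
  //   sizeof(Item) == 8:  16 + 14 * 8  == 128 bytes (2 cache lines)
  //   sizeof(Item) == 16: 16 + 15 * 16 == 256 bytes (4 cache lines; the
  //                       extra slot is the padding mentioned above)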

  // If kCapacity == 12 then we get 16 bits of capacityScale by reusing
  // tag slots 12 and 13; otherwise we only get the low 4 bits of control_.
  static constexpr std::size_t kCapacityScaleBits = kCapacity == 12 ? 16 : 4;
  static constexpr std::size_t kCapacityScaleShift = kCapacityScaleBits - 4;

  static constexpr MaskType kFullMask = FullMask<kCapacity>::value;

  // Non-empty tags have their top bit set.  The tags_ array may be bigger
  // than kCapacity to keep the first item aligned.
  std::array<uint8_t, 14> tags_;

  // Bits 0..3 of chunk 0 record the scaling factor between the number of
  // chunks and the max size without rehash.  Bits 4..7 in any chunk are a
  // 4-bit counter of the number of values in this chunk that were placed
  // because they overflowed their desired chunk (hostedOverflowCount).
  uint8_t control_;

  // The number of values that would have been placed into this chunk if
  // there had been space, including values that also overflowed previous
  // full chunks.  This value saturates; once it becomes 255 it neither
  // increases nor decreases.
  uint8_t outboundOverflowCount_;

  std::array<aligned_storage_for_t<Item>, kAllocatedCapacity> rawItems_;

  static F14Chunk* emptyInstance() {
    auto raw = reinterpret_cast<char*>(&kEmptyTagVector);
    if (kRequiredVectorAlignment > alignof(max_align_t)) {
      auto delta = kRequiredVectorAlignment -
          (reinterpret_cast<uintptr_t>(raw) % kRequiredVectorAlignment);
      raw += delta;
    }
    auto rv = reinterpret_cast<F14Chunk*>(raw);
    FOLLY_SAFE_DCHECK(
        (reinterpret_cast<uintptr_t>(rv) % kRequiredVectorAlignment) == 0, "");
    return rv;
  }

  void clear() {
    // tags_ = {}; control_ = 0; outboundOverflowCount_ = 0;

    // gcc < 6 doesn't exploit chunk alignment to generate the optimal
    // SSE clear from memset.  This is very hot code, so it is worth
    // handling that case specially.
#if FOLLY_SSE >= 2 && __GNUC__ <= 5 && !__clang__
    // this doesn't violate strict aliasing rules because __m128i is
    // tagged as __may_alias__
    auto* v = static_cast<__m128i*>(static_cast<void*>(&tags_[0]));
    _mm_store_si128(v, _mm_setzero_si128());
#else
    std::memset(&tags_[0], '\0', 16);
#endif
  }

  void copyOverflowInfoFrom(F14Chunk const& rhs) {
    FOLLY_SAFE_DCHECK(hostedOverflowCount() == 0, "");
    control_ += static_cast<uint8_t>(rhs.control_ & 0xf0);
    outboundOverflowCount_ = rhs.outboundOverflowCount_;
  }

  unsigned hostedOverflowCount() const {
    return control_ >> 4;
  }

  static constexpr uint8_t kIncrHostedOverflowCount = 0x10;
  static constexpr uint8_t kDecrHostedOverflowCount =
      static_cast<uint8_t>(-0x10);

  void adjustHostedOverflowCount(uint8_t op) {
    control_ += op;
  }

  bool eof() const {
    return capacityScale() != 0;
  }

  std::size_t capacityScale() const {
    if (kCapacityScaleBits == 4) {
      return control_ & 0xf;
    } else {
      uint16_t v;
      std::memcpy(&v, &tags_[12], 2);
      return v;
    }
  }

  void setCapacityScale(std::size_t scale) {
    FOLLY_SAFE_DCHECK(
        this != emptyInstance() && scale > 0 &&
            scale < (std::size_t{1} << kCapacityScaleBits),
        "");
    if (kCapacityScaleBits == 4) {
      control_ = static_cast<uint8_t>((control_ & ~0xf) | scale);
    } else {
      uint16_t v = static_cast<uint16_t>(scale);
      std::memcpy(&tags_[12], &v, 2);
    }
  }

  void markEof(std::size_t scale) {
    folly::assume(control_ == 0);
    setCapacityScale(scale);
  }

  unsigned outboundOverflowCount() const {
    return outboundOverflowCount_;
  }

  void incrOutboundOverflowCount() {
    if (outboundOverflowCount_ != 255) {
      ++outboundOverflowCount_;
    }
  }

  void decrOutboundOverflowCount() {
    if (outboundOverflowCount_ != 255) {
      --outboundOverflowCount_;
    }
  }

  std::size_t tag(std::size_t index) const {
    return tags_[index];
  }

  void setTag(std::size_t index, std::size_t tag) {
    FOLLY_SAFE_DCHECK(
        this != emptyInstance() && tag >= 0x80 && tag <= 0xff, "");
    FOLLY_SAFE_CHECK(tags_[index] == 0, "");
    tags_[index] = static_cast<uint8_t>(tag);
  }

  void clearTag(std::size_t index) {
    FOLLY_SAFE_CHECK((tags_[index] & 0x80) != 0, "");
    tags_[index] = 0;
  }

#if FOLLY_NEON
  ////////
  // Tag filtering using NEON intrinsics

  SparseMaskIter tagMatchIter(std::size_t needle) const {
    FOLLY_SAFE_DCHECK(needle >= 0x80 && needle < 0x100, "");
    uint8x16_t tagV = vld1q_u8(&tags_[0]);
    auto needleV = vdupq_n_u8(static_cast<uint8_t>(needle));
    auto eqV = vceqq_u8(tagV, needleV);
    // get info from every byte into the bottom half of every uint16_t
    // by shifting right 4, then narrowing to get it into a 64-bit vector
    uint8x8_t maskV = vshrn_n_u16(vreinterpretq_u16_u8(eqV), 4);
    uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(maskV), 0) & kFullMask;
    return SparseMaskIter(mask);
  }

  MaskType occupiedMask() const {
    uint8x16_t tagV = vld1q_u8(&tags_[0]);
    // signed shift extends top bit to all bits
    auto occupiedV =
        vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u8(tagV), 7));
    uint8x8_t maskV = vshrn_n_u16(vreinterpretq_u16_u8(occupiedV), 4);
    return vget_lane_u64(vreinterpret_u64_u8(maskV), 0) & kFullMask;
  }
#else
  ////////
  // Tag filtering using SSE2 intrinsics

  TagVector const* tagVector() const {
    return static_cast<TagVector const*>(static_cast<void const*>(&tags_[0]));
  }

  SparseMaskIter tagMatchIter(std::size_t needle) const {
    FOLLY_SAFE_DCHECK(needle >= 0x80 && needle < 0x100, "");
    auto tagV = _mm_load_si128(tagVector());

    // TRICKY!  It may seem strange to have a std::size_t needle and narrow
    // it at the last moment, rather than making HashPair::second be a
    // uint8_t, but the latter choice sometimes leads to a performance
    // problem.
    //
    // On architectures with SSE2 but not AVX2, _mm_set1_epi8 expands
    // to multiple instructions.  One of those is a MOVD of either 4 or
    // 8 byte width.  Only the bottom byte of that move actually affects
    // the result, but if a 1-byte needle has been spilled then this will
    // be a 4 byte load.  GCC 5.5 has been observed to reload needle
    // (or perhaps fuse a reload and part of a previous static_cast)
    // using a MOVZX with a 1 byte load in parallel with the MOVD.
    // This combination causes a failure of store-to-load forwarding,
    // which has a big performance penalty (60 nanoseconds per find on
    // a microbenchmark).  Keeping needle >= 4 bytes avoids the problem
    // and also happens to result in slightly more compact assembly.
    auto needleV = _mm_set1_epi8(static_cast<uint8_t>(needle));
    auto eqV = _mm_cmpeq_epi8(tagV, needleV);
    auto mask = _mm_movemask_epi8(eqV) & kFullMask;
    return SparseMaskIter{mask};
  }

  MaskType occupiedMask() const {
    auto tagV = _mm_load_si128(tagVector());
    return _mm_movemask_epi8(tagV) & kFullMask;
  }
#endif

  DenseMaskIter occupiedIter() const {
    return DenseMaskIter{&tags_[0], occupiedMask()};
  }

  MaskRangeIter occupiedRangeIter() const {
    return MaskRangeIter{occupiedMask()};
  }

  LastOccupiedInMask lastOccupied() const {
    return LastOccupiedInMask{occupiedMask()};
  }

  FirstEmptyInMask firstEmpty() const {
    return FirstEmptyInMask{occupiedMask() ^ kFullMask};
  }

  bool occupied(std::size_t index) const {
    FOLLY_SAFE_DCHECK(tags_[index] == 0 || (tags_[index] & 0x80) != 0, "");
    return tags_[index] != 0;
  }

  Item* itemAddr(std::size_t i) const {
    return static_cast<Item*>(
        const_cast<void*>(static_cast<void const*>(&rawItems_[i])));
  }

  Item& item(std::size_t i) {
    FOLLY_SAFE_DCHECK(this->occupied(i), "");
    return *launder(itemAddr(i));
  }

  Item const& citem(std::size_t i) const {
    FOLLY_SAFE_DCHECK(this->occupied(i), "");
    return *launder(itemAddr(i));
  }

  static F14Chunk& owner(Item& item, std::size_t index) {
    auto rawAddr =
        static_cast<uint8_t*>(static_cast<void*>(std::addressof(item))) -
        offsetof(F14Chunk, rawItems_) - index * sizeof(Item);
    auto chunkAddr = static_cast<F14Chunk*>(static_cast<void*>(rawAddr));
    FOLLY_SAFE_DCHECK(std::addressof(item) == chunkAddr->itemAddr(index), "");
    return *chunkAddr;
  }
};

////////////////

// PackedChunkItemPtr points to an Item in an F14Chunk, allowing both the
// Item& and its index to be recovered.  It sorts by the address of the
// item, and it only works for items that are in a properly-aligned chunk.

// generic form, not actually packed
template <typename Ptr>
class PackedChunkItemPtr {
 public:
  PackedChunkItemPtr(Ptr p, std::size_t i) noexcept : ptr_{p}, index_{i} {
    FOLLY_SAFE_DCHECK(ptr_ != nullptr || index_ == 0, "");
  }

  Ptr ptr() const {
    return ptr_;
  }

  std::size_t index() const {
    return index_;
  }

  bool operator<(PackedChunkItemPtr const& rhs) const {
    FOLLY_SAFE_DCHECK(ptr_ != rhs.ptr_ || index_ == rhs.index_, "");
    return ptr_ < rhs.ptr_;
  }

  bool operator==(PackedChunkItemPtr const& rhs) const {
    FOLLY_SAFE_DCHECK(ptr_ != rhs.ptr_ || index_ == rhs.index_, "");
    return ptr_ == rhs.ptr_;
  }

  bool operator!=(PackedChunkItemPtr const& rhs) const {
    return !(*this == rhs);
  }

 private:
  Ptr ptr_;
  std::size_t index_;
};

// Bare pointer form, packed into a uintptr_t.  Uses only bits wasted by
// alignment, so it works on 32-bit and 64-bit platforms
template <typename T>
class PackedChunkItemPtr<T*> {
  static_assert((alignof(F14Chunk<T>) % 16) == 0, "");

  // Chunks are 16-byte aligned, so we can maintain a packed pointer to a
  // chunk item by packing the 4-bit item index into the least significant
  // bits of a pointer to the chunk itself.  This makes ItemIter::pack
  // more expensive, however, since it has to compute the chunk address.
  //
  // Chunk items have varying alignment constraints, so it would seem
  // that we can't do a similar trick while using only bit masking
  // operations on the Item* itself.  It turns out, however, that if
  // sizeof(Item) is not a multiple of 16 then we can recover a portion
  // of the index bits from the knowledge that the Item-s are stored in
  // an array that is itself 16-byte aligned.
  //
  // If kAlignBits is the number of trailing zero bits in sizeof(Item)
  // (up to 4), then we can borrow those bits to store kAlignBits of the
  // index directly.  We can recover (4 - kAlignBits) bits of the index
  // from the item pointer itself, by defining/observing that
  //
  // A = kAlignBits                  (A <= 4)
  //
  // S = (sizeof(Item) % 16) >> A    (shifted-away bits are all zero)
  //
  // R = (itemPtr % 16) >> A         (shifted-away bits are all zero)
  //
  // M = 16 >> A
  //
  // itemPtr % 16   = (index * sizeof(Item)) % 16
  //
  // (R * 2^A) % 16 = (index * (sizeof(Item) % 16)) % 16
  //
  // (R * 2^A) % 16 = (index * 2^A * S) % 16
  //
  // R % M          = (index * S) % M
  //
  // S is relatively prime with M, so a multiplicative inverse is easy
  // to compute
  //
  // Sinv = S^(M - 1) % M
  //
  // (R * Sinv) % M = index % M
  //
  // This lets us recover the bottom bits of the index.  When sizeof(T)
  // is a multiple of 8, kSizeInverse will always be 1.  When sizeof(T)
  // is a multiple of 4 but not 8, kSizeInverse will be either 1 or 3.
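  //
  // Worked example (hypothetical Item type with sizeof(T) == 12, i.e.
  // 4-byte but not 8-byte aligned): A = 2, S = (12 % 16) >> 2 = 3,
  // M = 16 >> 2 = 4, and Sinv = 3^3 % 4 = 3.  Packing index 5 stores its
  // top A bits (5 >> 2 == 1) in the low bits of the pointer.  Unpacking:
  // the item sits at byte offset 5 * 12 = 60 within the 16-byte-aligned
  // item array, so R = (60 % 16) >> 2 = 3, the deduced low bits are
  // (R * Sinv) % M = 1, and the index is (1 << 2) | 1 = 5.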

  // returns pow(x, y) % m
  static constexpr uintptr_t powerMod(uintptr_t x, uintptr_t y, uintptr_t m) {
    return y == 0 ? 1 : (x * powerMod(x, y - 1, m)) % m;
  }

  static constexpr uintptr_t kIndexBits = 4;
  static constexpr uintptr_t kIndexMask = (uintptr_t{1} << kIndexBits) - 1;

  static constexpr uintptr_t kAlignBits = constexpr_min(
      uintptr_t{4},
      constexpr_find_first_set(uintptr_t{sizeof(T)}) - 1);

  static constexpr uintptr_t kAlignMask = (uintptr_t{1} << kAlignBits) - 1;

  static constexpr uintptr_t kModulus = uintptr_t{1}
      << (kIndexBits - kAlignBits);
  static constexpr uintptr_t kSizeInverse =
      powerMod(sizeof(T) >> kAlignBits, kModulus - 1, kModulus);

 public:
  PackedChunkItemPtr(T* p, std::size_t i) noexcept {
    uintptr_t encoded = i >> (kIndexBits - kAlignBits);
    assume((encoded & ~kAlignMask) == 0);
    raw_ = reinterpret_cast<uintptr_t>(p) | encoded;
    FOLLY_SAFE_DCHECK(p == ptr(), "");
    FOLLY_SAFE_DCHECK(i == index(), "");
  }

  T* ptr() const {
    return reinterpret_cast<T*>(raw_ & ~kAlignMask);
  }

  std::size_t index() const {
    auto encoded = (raw_ & kAlignMask) << (kIndexBits - kAlignBits);
    auto deduced =
        ((raw_ >> kAlignBits) * kSizeInverse) & (kIndexMask >> kAlignBits);
    return encoded | deduced;
  }

  bool operator<(PackedChunkItemPtr const& rhs) const {
    return raw_ < rhs.raw_;
  }
  bool operator==(PackedChunkItemPtr const& rhs) const {
    return raw_ == rhs.raw_;
  }
  bool operator!=(PackedChunkItemPtr const& rhs) const {
    return !(*this == rhs);
  }

 private:
  uintptr_t raw_;
};

template <typename ChunkPtr>
class F14ItemIter {
 private:
  using Chunk = typename std::pointer_traits<ChunkPtr>::element_type;

 public:
  using Item = typename Chunk::Item;
  using ItemPtr = typename std::pointer_traits<ChunkPtr>::template rebind<Item>;
  using ItemConstPtr =
      typename std::pointer_traits<ChunkPtr>::template rebind<Item const>;

  using Packed = PackedChunkItemPtr<ItemPtr>;

  //// PUBLIC

  F14ItemIter() noexcept : itemPtr_{nullptr}, index_{0} {}

  // default copy and move constructors and assignment operators are correct

  explicit F14ItemIter(Packed const& packed)
      : itemPtr_{packed.ptr()}, index_{packed.index()} {}

  F14ItemIter(ChunkPtr chunk, std::size_t index)
      : itemPtr_{std::pointer_traits<ItemPtr>::pointer_to(chunk->item(index))},
        index_{index} {
    FOLLY_SAFE_DCHECK(index < Chunk::kCapacity, "");
    assume(
        std::pointer_traits<ItemPtr>::pointer_to(chunk->item(index)) !=
        nullptr);
    assume(itemPtr_ != nullptr);
  }

  FOLLY_ALWAYS_INLINE void advanceImpl(bool checkEof, bool likelyDead) {
    auto c = chunk();

    // common case is packed entries
    while (index_ > 0) {
      --index_;
      --itemPtr_;
      if (LIKELY(c->occupied(index_))) {
        return;
      }
    }

    // It's fairly common for an iterator to be advanced and then become
    // dead, for example in the return value from erase(iter) or in
    // the last step of a loop.  We'd like to make sure that the entire
    // advance() method can be eliminated by the compiler's dead code
    // elimination pass.  To do that it must eliminate the loops, which
    // requires it to prove that they have no side effects.  It's easy
    // to show that there are no escaping stores, but at the moment
    // compilers also consider an infinite loop to be a side effect.
    // (There are parts of the standard that would allow them to treat
    // this as undefined behavior, but at the moment they don't exploit
    // those clauses.)
    //
    // The following loop should really be a while loop, which would
    // save a register, some instructions, and a conditional branch,
    // but by writing it as a for loop the compiler can prove to itself
    // that it will eventually terminate.  (No matter that even if the
    // loop executed in a single cycle it would take about 200 years to
    // run all 2^64 iterations.)
    //
    // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82776 is the bug we
    // filed about the issue.  Without that concern, the loop below would
    // simply be:  while (true) {
    for (std::size_t i = 1; !likelyDead || i != 0; ++i) {
      if (checkEof) {
        // exhausted the current chunk
        if (UNLIKELY(c->eof())) {
          FOLLY_SAFE_DCHECK(index_ == 0, "");
          itemPtr_ = nullptr;
          return;
        }
      } else {
        FOLLY_SAFE_DCHECK(!c->eof(), "");
      }
      --c;
      auto last = c->lastOccupied();
      if (checkEof && !likelyDead) {
        prefetchAddr(&*c - 1);
      }
      if (LIKELY(last.hasIndex())) {
        index_ = last.index();
        itemPtr_ = std::pointer_traits<ItemPtr>::pointer_to(c->item(index_));
        return;
      }
    }
  }

  void precheckedAdvance() {
    advanceImpl(false, false);
  }

  FOLLY_ALWAYS_INLINE void advance() {
    advanceImpl(true, false);
  }

  FOLLY_ALWAYS_INLINE void advanceLikelyDead() {
    advanceImpl(true, true);
  }

  ChunkPtr chunk() const {
    return std::pointer_traits<ChunkPtr>::pointer_to(
        Chunk::owner(*itemPtr_, index_));
  }

  std::size_t index() const {
    return index_;
  }

  Item* itemAddr() const {
    return std::addressof(*itemPtr_);
  }
  Item& item() const {
    return *itemPtr_;
  }
  Item const& citem() const {
    return *itemPtr_;
  }

  bool atEnd() const {
    return itemPtr_ == nullptr;
  }

  Packed pack() const {
    return Packed{itemPtr_, static_cast<uint8_t>(index_)};
  }

  bool operator==(F14ItemIter const& rhs) const {
    // this form makes iter == end() into a single null check after inlining
    // and constant propagation
    return itemPtr_ == rhs.itemPtr_;
  }

  bool operator!=(F14ItemIter const& rhs) const {
    return !(*this == rhs);
  }

 private:
  ItemPtr itemPtr_;
  std::size_t index_;
};

////////////////

template <typename SizeType, typename ItemIter, bool EnablePackedItemIter>
struct SizeAndPackedBegin {
  SizeType size_{0};

 private:
  typename ItemIter::Packed packedBegin_{ItemIter{}.pack()};

 public:
  typename ItemIter::Packed& packedBegin() {
    return packedBegin_;
  }

  typename ItemIter::Packed const& packedBegin() const {
    return packedBegin_;
  }
};

template <typename SizeType, typename ItemIter>
struct SizeAndPackedBegin<SizeType, ItemIter, false> {
  SizeType size_{0};

  [[noreturn]] typename ItemIter::Packed& packedBegin() {
    assume_unreachable();
  }

  [[noreturn]] typename ItemIter::Packed const& packedBegin() const {
    assume_unreachable();
  }
};

template <typename Policy>
class F14Table : public Policy {
 public:
  using Item = typename Policy::Item;

  using value_type = typename Policy::Value;
  using allocator_type = typename Policy::Alloc;

 private:
  using Alloc = typename Policy::Alloc;
  using AllocTraits = typename Policy::AllocTraits;
  using Hasher = typename Policy::Hasher;
  using InternalSizeType = typename Policy::InternalSizeType;
  using KeyEqual = typename Policy::KeyEqual;

  using Policy::kAllocIsAlwaysEqual;
  using Policy::kContinuousCapacity;
  using Policy::kDefaultConstructIsNoexcept;
  using Policy::kEnableItemIteration;
  using Policy::kSwapIsNoexcept;

  using Policy::destroyItemOnClear;
  using Policy::isAvalanchingHasher;
  using Policy::prefetchBeforeCopy;
  using Policy::prefetchBeforeDestroy;
  using Policy::prefetchBeforeRehash;

  using ByteAlloc = typename AllocTraits::template rebind_alloc<uint8_t>;
  using BytePtr = typename std::allocator_traits<ByteAlloc>::pointer;

  using Chunk = F14Chunk<Item>;
  using ChunkPtr =
      typename std::pointer_traits<BytePtr>::template rebind<Chunk>;

  using HashPair = typename F14HashToken::HashPair;

 public:
  using ItemIter = F14ItemIter<ChunkPtr>;

 private:
  //////// begin fields

  ChunkPtr chunks_{Chunk::emptyInstance()};
  InternalSizeType chunkMask_{0};
  SizeAndPackedBegin<InternalSizeType, ItemIter, kEnableItemIteration>
      sizeAndPackedBegin_;

  //////// end fields

  void swapContents(F14Table& rhs) noexcept {
    using std::swap;
    swap(chunks_, rhs.chunks_);
    swap(chunkMask_, rhs.chunkMask_);
    swap(sizeAndPackedBegin_.size_, rhs.sizeAndPackedBegin_.size_);
    if (kEnableItemIteration) {
      swap(
          sizeAndPackedBegin_.packedBegin(),
          rhs.sizeAndPackedBegin_.packedBegin());
    }
  }

 public:
  F14Table(
      std::size_t initialCapacity,
      Hasher const& hasher,
      KeyEqual const& keyEqual,
      Alloc const& alloc)
      : Policy{hasher, keyEqual, alloc} {
    if (initialCapacity > 0) {
      reserve(initialCapacity);
    }
  }

  F14Table(F14Table const& rhs) : Policy{rhs} {
    buildFromF14Table(rhs);
  }

  F14Table(F14Table const& rhs, Alloc const& alloc) : Policy{rhs, alloc} {
    buildFromF14Table(rhs);
  }

  F14Table(F14Table&& rhs) noexcept(
      std::is_nothrow_move_constructible<Hasher>::value&&
          std::is_nothrow_move_constructible<KeyEqual>::value&&
              std::is_nothrow_move_constructible<Alloc>::value)
      : Policy{std::move(rhs)} {
    swapContents(rhs);
  }

  F14Table(F14Table&& rhs, Alloc const& alloc) noexcept(kAllocIsAlwaysEqual)
      : Policy{std::move(rhs), alloc} {
    if (kAllocIsAlwaysEqual || this->alloc() == rhs.alloc()) {
      // move storage (common case)
      swapContents(rhs);
    } else {
      // new storage because allocators unequal, move values (rare case)
      buildFromF14Table(std::move(rhs));
    }
  }

  F14Table& operator=(F14Table const& rhs) {
    if (this != &rhs) {
      reset();
      static_cast<Policy&>(*this) = rhs;
      buildFromF14Table(rhs);
    }
    return *this;
  }

  F14Table& operator=(F14Table&& rhs) noexcept(
      std::is_nothrow_move_assignable<Hasher>::value&&
          std::is_nothrow_move_assignable<KeyEqual>::value &&
      (kAllocIsAlwaysEqual ||
       (AllocTraits::propagate_on_container_move_assignment::value &&
        std::is_nothrow_move_assignable<Alloc>::value))) {
    if (this != &rhs) {
      reset();
      static_cast<Policy&>(*this) = std::move(rhs);
      if (AllocTraits::propagate_on_container_move_assignment::value ||
          kAllocIsAlwaysEqual || this->alloc() == rhs.alloc()) {
        // move storage (common case)
        swapContents(rhs);
      } else {
        // new storage because allocators unequal, move values (rare case)
        buildFromF14Table(std::move(rhs));
      }
    }
    return *this;
  }

  ~F14Table() {
    reset();
  }

  void swap(F14Table& rhs) noexcept(kSwapIsNoexcept) {
    // If propagate_on_container_swap is false and allocators are
    // not equal, the only way to accomplish a swap would be to do
    // dynamic allocation and then move (or swap) each contained value.
    // AllocatorAwareContainer-s are not supposed to attempt this, but
    // rather are supposed to have undefined behavior in that case.
    FOLLY_SAFE_CHECK(
        AllocTraits::propagate_on_container_swap::value ||
            kAllocIsAlwaysEqual || this->alloc() == rhs.alloc(),
        "swap is undefined for unequal non-propagating allocators");
    this->swapPolicy(rhs);
    swapContents(rhs);
  }

 private:
  //////// hash helpers

  // Hash values are used to compute the desired position, which is the
  // chunk index at which we would like to place a value (if there is no
  // overflow), and the tag, which is an additional 7 bits of entropy.
  //
  // The standard's definition of hash function quality only refers to
  // the probability of collisions of the entire hash value, not to the
  // probability of collisions of the results of shifting or masking the
  // hash value.  Some hash functions, however, provide this stronger
  // guarantee (not quite the same as the definition of avalanching,
  // but similar).
  //
  // If the user-supplied hasher is an avalanching one (each bit of the
  // hash value has a 50% chance of being the same for differing hash
  // inputs), then we can just take 7 bits of the hash value for the tag
  // and the rest for the desired position.  Avalanching hashers also
  // let us map hash value to array index position with just a bitmask
  // without risking clumping.  (Many hash tables just accept the risk
  // and do it regardless.)
  //
  // std::hash<std::string> avalanches in all implementations we've
  // examined: libstdc++-v3 uses MurmurHash2, and libc++ uses CityHash
  // or MurmurHash2.  The other std::hash specializations, however, do not
  // have this property.  std::hash for integral and pointer values is the
  // identity function on libstdc++-v3 and libc++, in particular.  In our
  // experience it is also fairly common for user-defined specializations
  // of std::hash to combine fields in an ad-hoc way that does not evenly
  // distribute entropy among the bits of the result (a + 37 * b, for
  // example, where a and b are integer fields).
  //
  // For hash functions we don't trust to avalanche, we repair things by
  // applying a bit mixer to the user-supplied hash.
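  //
  // Sketch of the trusted (avalanching) 64-bit case implemented below:
  // the full hash doubles as the probe start, so the home chunk index is
  // just hash & chunkMask_, and the tag is the top byte of the hash with
  // its high bit forced on so it can never equal the 0 of an empty slot:
  //
  //   auto hp = splitHash(hash);
  //   // hp.first  == hash
  //   // hp.second == (hash >> 56) | 0x80, always in [0x80, 0xff]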

#if FOLLY_X64 || FOLLY_AARCH64
  // 64-bit
  static HashPair splitHash(std::size_t hash) {
    static_assert(sizeof(std::size_t) == sizeof(uint64_t), "");
    std::size_t tag;
    if (!isAvalanchingHasher()) {
#if FOLLY_F14_CRC_INTRINSIC_AVAILABLE
#if FOLLY_SSE_PREREQ(4, 2)
      // SSE4.2 CRC
      std::size_t c = _mm_crc32_u64(0, hash);
      tag = (c >> 24) | 0x80;
      hash += c;
#else
      // CRC is optional on armv8 (-march=armv8-a+crc), standard on armv8.1
      std::size_t c = __crc32cd(0, hash);
      tag = (c >> 24) | 0x80;
      hash += c;
#endif
#else
      // The mixer below is not fully avalanching for all 64 bits of
      // output, but looks quite good for bits 18..63 and puts plenty
      // of entropy even lower when considering multiple bits together
      // (like the tag).  Importantly, when under register pressure it
      // uses fewer registers, instructions, and immediate constants
      // than the alternatives, resulting in compact code that is more
      // easily inlinable.  In one instantiation a modified Murmur mixer
      // was 48 bytes of assembly (even after using the same multiplicand
      // for both steps) and this one was 27 bytes, for example.
      auto const kMul = 0xc4ceb9fe1a85ec53ULL;
#ifdef _WIN32
      __int64 signedHi;
      __int64 signedLo = _mul128(
          static_cast<__int64>(hash), static_cast<__int64>(kMul), &signedHi);
      auto hi = static_cast<uint64_t>(signedHi);
      auto lo = static_cast<uint64_t>(signedLo);
#else
      auto hi = static_cast<uint64_t>(
          (static_cast<unsigned __int128>(hash) * kMul) >> 64);
      auto lo = hash * kMul;
#endif
      hash = hi ^ lo;
      hash *= kMul;
      tag = ((hash >> 15) & 0x7f) | 0x80;
      hash >>= 22;
#endif
    } else {
      // we don't trust the top bit
      tag = (hash >> 56) | 0x80;
    }
    return std::make_pair(hash, tag);
  }
#else
  // 32-bit
  static HashPair splitHash(std::size_t hash) {
    static_assert(sizeof(std::size_t) == sizeof(uint32_t), "");
    uint8_t tag;
    if (!isAvalanchingHasher()) {
#if FOLLY_F14_CRC_INTRINSIC_AVAILABLE
#if FOLLY_SSE_PREREQ(4, 2)
      // SSE4.2 CRC
      auto c = _mm_crc32_u32(0, hash);
      tag = static_cast<uint8_t>(~(c >> 25));
      hash += c;
#else
      auto c = __crc32cw(0, hash);
      tag = static_cast<uint8_t>(~(c >> 25));
      hash += c;
#endif
#else
      // finalizer for 32-bit murmur2
      hash ^= hash >> 13;
      hash *= 0x5bd1e995;
      hash ^= hash >> 15;
      tag = static_cast<uint8_t>(~(hash >> 25));
#endif
    } else {
      // we don't trust the top bit
      tag = (hash >> 24) | 0x80;
    }
    return std::make_pair(hash, tag);
  }
#endif

  //////// memory management helpers

  static std::size_t computeCapacity(
      std::size_t chunkCount,
      std::size_t scale) {
    FOLLY_SAFE_DCHECK(!(chunkCount > 1 && scale == 0), "");
    FOLLY_SAFE_DCHECK(
        scale < (std::size_t{1} << Chunk::kCapacityScaleBits), "");
    FOLLY_SAFE_DCHECK((chunkCount & (chunkCount - 1)) == 0, "");
    return (((chunkCount - 1) >> Chunk::kCapacityScaleShift) + 1) * scale;
  }
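
  // Worked example: with kCapacity == 14, kCapacityScaleShift is 0 and
  // the formula reduces to chunkCount * scale (e.g. 8 chunks at the
  // default scale of kDesiredCapacity == 12 give a capacity of 96).
  // With kCapacity == 12, kCapacityScaleShift is 12, so for up to 4096
  // chunks the capacity is simply the 16-bit scale stored in chunk 0.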

  std::pair<std::size_t, std::size_t> computeChunkCountAndScale(
      std::size_t desiredCapacity,
      bool continuousSingleChunkCapacity,
      bool continuousMultiChunkCapacity) const {
    if (desiredCapacity <= Chunk::kCapacity) {
      // we can go to 100% capacity in a single chunk with no problem
      if (!continuousSingleChunkCapacity) {
        if (desiredCapacity <= 2) {
          desiredCapacity = 2;
        } else if (desiredCapacity <= 6) {
          desiredCapacity = 6;
        } else {
          desiredCapacity = Chunk::kCapacity;
        }
      }
      auto rv = std::make_pair(std::size_t{1}, desiredCapacity);
      FOLLY_SAFE_DCHECK(
          computeCapacity(rv.first, rv.second) == desiredCapacity, "");
      return rv;
    } else {
      std::size_t minChunks =
          (desiredCapacity - 1) / Chunk::kDesiredCapacity + 1;
      std::size_t chunkPow = findLastSet(minChunks - 1);
      if (chunkPow == 8 * sizeof(std::size_t)) {
        throw_exception<std::bad_alloc>();
      }

      std::size_t chunkCount = std::size_t{1} << chunkPow;

      // Let cc * scale be the actual capacity.
      // cc = ((chunkCount - 1) >> kCapacityScaleShift) + 1.
      // If chunkPow >= kCapacityScaleShift, then cc = chunkCount >>
      // kCapacityScaleShift = 1 << (chunkPow - kCapacityScaleShift),
      // otherwise it equals 1 = 1 << 0.  Let cc = 1 << ss.
      std::size_t ss = chunkPow >= Chunk::kCapacityScaleShift
          ? chunkPow - Chunk::kCapacityScaleShift
          : 0;

      std::size_t scale;
      if (continuousMultiChunkCapacity) {
        // (1 << ss) * scale >= desiredCapacity
        scale = ((desiredCapacity - 1) >> ss) + 1;
      } else {
        // (1 << ss) * scale == chunkCount * kDesiredCapacity
        scale = Chunk::kDesiredCapacity << (chunkPow - ss);
      }

      std::size_t actualCapacity = computeCapacity(chunkCount, scale);
      FOLLY_SAFE_DCHECK(actualCapacity >= desiredCapacity, "");
      if (actualCapacity > max_size()) {
        throw_exception<std::bad_alloc>();
      }

      return std::make_pair(chunkCount, scale);
    }
  }

  static std::size_t chunkAllocSize(
      std::size_t chunkCount,
      std::size_t capacityScale) {
    FOLLY_SAFE_DCHECK(chunkCount > 0, "");
    FOLLY_SAFE_DCHECK(!(chunkCount > 1 && capacityScale == 0), "");
    if (chunkCount == 1) {
      static_assert(offsetof(Chunk, rawItems_) == 16, "");
      return 16 + sizeof(Item) * computeCapacity(1, capacityScale);
    } else {
      return sizeof(Chunk) * chunkCount;
    }
  }

  ChunkPtr initializeChunks(
      BytePtr raw,
      std::size_t chunkCount,
      std::size_t capacityScale) {
    static_assert(std::is_trivial<Chunk>::value, "F14Chunk should be POD");
    auto chunks = static_cast<Chunk*>(static_cast<void*>(&*raw));
    for (std::size_t i = 0; i < chunkCount; ++i) {
      chunks[i].clear();
    }
    chunks[0].markEof(capacityScale);
    return std::pointer_traits<ChunkPtr>::pointer_to(*chunks);
  }

  std::size_t itemCount() const noexcept {
    if (chunkMask_ == 0) {
      return computeCapacity(1, chunks_->capacityScale());
    } else {
      return (chunkMask_ + 1) * Chunk::kCapacity;
    }
  }

 public:
  ItemIter begin() const noexcept {
    FOLLY_SAFE_DCHECK(kEnableItemIteration, "");
    return ItemIter{sizeAndPackedBegin_.packedBegin()};
  }

  ItemIter end() const noexcept {
    return ItemIter{};
  }

  bool empty() const noexcept {
    return size() == 0;
  }

  InternalSizeType size() const noexcept {
    return sizeAndPackedBegin_.size_;
  }

  std::size_t max_size() const noexcept {
    auto& a = this->alloc();
    return std::min<std::size_t>(
        (std::numeric_limits<InternalSizeType>::max)(),
        AllocTraits::max_size(a));
  }

  std::size_t bucket_count() const noexcept {
    return computeCapacity(chunkMask_ + 1, chunks_->capacityScale());
  }

  std::size_t max_bucket_count() const noexcept {
    return max_size();
  }

  float load_factor() const noexcept {
    return empty()
        ? 0.0f
        : static_cast<float>(size()) / static_cast<float>(bucket_count());
  }

  float max_load_factor() const noexcept {
    return 1.0f;
  }

  void max_load_factor(float) noexcept {
    // Probing hash tables can't run load factors >= 1 (unlike chaining
    // tables).  In addition, we have measured that there is little or
    // no performance advantage to running a smaller load factor (cache
    // locality losses outweigh the small reduction in probe lengths,
    // often making it slower).  Therefore, we've decided to just fix
    // max_load_factor at 1.0f regardless of what the user requests.
    // This has an additional advantage that we don't have to store it.
    // Taking alignment into consideration this makes every F14 table
    // 8 bytes smaller, and is part of the reason an empty F14NodeMap
    // is almost half the size of an empty std::unordered_map (32 vs
    // 56 bytes).
    //
    // I don't have a strong opinion on whether we should remove this
    // method or leave a stub; let ngbronson or xshi know if you have a
    // compelling argument either way.
  }

 private:
  // Our probe strategy is to advance through additional chunks with
  // a stride that is key-specific.  This is called double hashing,
  // and is a well known and high quality probing strategy.  So long as
  // the stride and the chunk count are relatively prime, we will visit
  // every chunk once and then return to the original chunk, letting us
  // detect and end the cycle.  The chunk count is a power of two, so
  // we can satisfy the relatively prime part by choosing an odd stride.
  // We've already computed a high quality secondary hash value for the
  // tag, so we just use it for the second probe hash as well.
  //
  // At the maximum load factor of 12/14, expected probe length for a
  // find hit is 1.041, with 99% of keys found in the first three chunks.
  // Expected probe length for a find miss (or insert) is 1.275, with a
  // p99 probe length of 4 (fewer than 1% of failing finds look at 5 or
  // more chunks).
  //
  // This code is structured so you can try various ways of encoding
  // the current probe state.  For example, at the moment the probe's
  // state is the position in the cycle and the resulting chunk index is
  // computed from that inside probeCurrentIndex.  We could also make the
  // probe state the chunk index, and then increment it by hp.second *
  // 2 + 1 in probeAdvance.  Wrapping can be applied early or late as
  // well.  This particular code seems to be easier for the optimizer
  // to understand.
  //
  // We could also implement probing strategies that resulted in the same
  // tour for every key initially assigned to a chunk (linear probing or
  // quadratic), but that results in longer probe lengths.  In particular,
  // the cache locality wins of linear probing are not worth the increase
  // in probe lengths (extra work and less branch predictability) in
  // our experiments.

  std::size_t probeDelta(HashPair hp) const {
    return 2 * hp.second + 1;
  }
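
  // Example probe sequence: with 8 chunks (chunkMask_ == 7) and
  // hp == {3, 0x9a}, probeDelta(hp) == 2 * 0x9a + 1 == 309, which is
  // congruent to 5 mod 8, so the probe visits chunks 3, 0, 5, 2, 7, 4,
  // 1, 6 -- every chunk exactly once, because the stride is odd and the
  // chunk count is a power of two.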

  template <typename K>
  FOLLY_ALWAYS_INLINE ItemIter findImpl(HashPair hp, K const& key) const {
    std::size_t index = hp.first;
    std::size_t step = probeDelta(hp);
    for (std::size_t tries = 0; tries <= chunkMask_; ++tries) {
      ChunkPtr chunk = chunks_ + (index & chunkMask_);
      if (sizeof(Chunk) > 64) {
        prefetchAddr(chunk->itemAddr(8));
      }
      auto hits = chunk->tagMatchIter(hp.second);
      while (hits.hasNext()) {
        auto i = hits.next();
        if (LIKELY(this->keyMatchesItem(key, chunk->item(i)))) {
          // Tag match and key match were both successful.  The chance
          // of a false tag match is 1/128 for each key in the chunk
          // (with a proper hash function).
          return ItemIter{chunk, i};
        }
      }
      if (LIKELY(chunk->outboundOverflowCount() == 0)) {
        // No keys that wanted to be placed in this chunk were denied
        // entry, so our search is over.  This is the common case.
        break;
      }
      index += step;
    }
    // Exiting the loop because tries is exhausted is rare, but possible.
    // That means that for every chunk there is currently a key present
    // in the map that visited that chunk on its probe search but ended
    // up somewhere else, and we have searched every chunk.
    return ItemIter{};
  }

 public:
  // Prehashing splits the work of find(key) into two calls, enabling you
  // to manually implement loop pipelining for hot bulk lookups.  prehash
  // computes the hash and prefetches the first computed memory location,
  // and the two-arg find(F14HashToken,K) performs the rest of the search.
  template <typename K>
  F14HashToken prehash(K const& key) const {
    FOLLY_SAFE_DCHECK(chunks_ != nullptr, "");
    auto hp = splitHash(this->computeKeyHash(key));
    ChunkPtr firstChunk = chunks_ + (hp.first & chunkMask_);
    prefetchAddr(firstChunk);
    return F14HashToken(std::move(hp));
  }
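
  // Hedged usage sketch of the two-phase lookup (the container `m` and
  // the keys are hypothetical; the public F14 map/set wrappers forward
  // to prehash and the two-argument find):
  //
  //   auto token = m.prehash(key);  // hash + prefetch of the home chunk
  //   /* ... independent work that hides the memory latency ... */
  //   auto iter = m.find(token, key);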

  template <typename K>
  FOLLY_ALWAYS_INLINE ItemIter find(K const& key) const {
    auto hp = splitHash(this->computeKeyHash(key));
    return findImpl(hp, key);
  }

  template <typename K>
  FOLLY_ALWAYS_INLINE ItemIter
  find(F14HashToken const& token, K const& key) const {
    FOLLY_SAFE_DCHECK(
        splitHash(this->computeKeyHash(key)) == static_cast<HashPair>(token),
        "");
    return findImpl(static_cast<HashPair>(token), key);
  }

  // Searches for a key using a key predicate that is a refinement
  // of key equality.  func(k) should return true only if k is equal
  // to key according to key_eq(), but is allowed to apply additional
  // constraints.
  template <typename K, typename F>
  FOLLY_ALWAYS_INLINE ItemIter findMatching(K const& key, F&& func) const {
    auto hp = splitHash(this->computeKeyHash(key));
    std::size_t index = hp.first;
    std::size_t step = probeDelta(hp);
    for (std::size_t tries = 0; tries <= chunkMask_; ++tries) {
      ChunkPtr chunk = chunks_ + (index & chunkMask_);
      if (sizeof(Chunk) > 64) {
        prefetchAddr(chunk->itemAddr(8));
      }
      auto hits = chunk->tagMatchIter(hp.second);
      while (hits.hasNext()) {
        auto i = hits.next();
        if (LIKELY(
                func(this->keyForValue(this->valueAtItem(chunk->item(i)))))) {
          return ItemIter{chunk, i};
        }
      }
      if (LIKELY(chunk->outboundOverflowCount() == 0)) {
        break;
      }
      index += step;
    }
    return ItemIter{};
  }

 private:
  void adjustSizeAndBeginAfterInsert(ItemIter iter) {
    if (kEnableItemIteration) {
      // packedBegin is the max of all valid ItemIter::pack()
      auto packed = iter.pack();
      if (sizeAndPackedBegin_.packedBegin() < packed) {
        sizeAndPackedBegin_.packedBegin() = packed;
      }
    }

    ++sizeAndPackedBegin_.size_;
  }

  // Ignores hp if iter.chunk()->hostedOverflowCount() == 0
  void eraseBlank(ItemIter iter, HashPair hp) {
    iter.chunk()->clearTag(iter.index());

    if (iter.chunk()->hostedOverflowCount() != 0) {
      // clean up
      std::size_t index = hp.first;
      std::size_t delta = probeDelta(hp);
      uint8_t hostedOp = 0;
      while (true) {
        ChunkPtr chunk = chunks_ + (index & chunkMask_);
        if (chunk == iter.chunk()) {
          chunk->adjustHostedOverflowCount(hostedOp);
          break;
        }
        chunk->decrOutboundOverflowCount();
        hostedOp = Chunk::kDecrHostedOverflowCount;
        index += delta;
      }
    }
  }

  void adjustSizeAndBeginBeforeErase(ItemIter iter) {
    --sizeAndPackedBegin_.size_;
    if (kEnableItemIteration) {
      if (iter.pack() == sizeAndPackedBegin_.packedBegin()) {
        if (size() == 0) {
          iter = ItemIter{};
        } else {
          iter.precheckedAdvance();
        }
        sizeAndPackedBegin_.packedBegin() = iter.pack();
      }
    }
  }

  template <typename... Args>
  void insertAtBlank(ItemIter pos, HashPair hp, Args&&... args) {
    try {
      auto dst = pos.itemAddr();
      this->constructValueAtItem(*this, dst, std::forward<Args>(args)...);
    } catch (...) {
      eraseBlank(pos, hp);
      throw;
    }
    adjustSizeAndBeginAfterInsert(pos);
  }

  ItemIter allocateTag(uint8_t* fullness, HashPair hp) {
    ChunkPtr chunk;
    std::size_t index = hp.first;
    std::size_t delta = probeDelta(hp);
    uint8_t hostedOp = 0;
    while (true) {
      index &= chunkMask_;
      chunk = chunks_ + index;
      if (LIKELY(fullness[index] < Chunk::kCapacity)) {
        break;
      }
      chunk->incrOutboundOverflowCount();
      hostedOp = Chunk::kIncrHostedOverflowCount;
      index += delta;
    }
    unsigned itemIndex = fullness[index]++;
    FOLLY_SAFE_DCHECK(!chunk->occupied(itemIndex), "");
    chunk->setTag(itemIndex, hp.second);
    chunk->adjustHostedOverflowCount(hostedOp);
    return ItemIter{chunk, itemIndex};
  }

  ChunkPtr lastOccupiedChunk() const {
    FOLLY_SAFE_DCHECK(size() > 0, "");
    if (kEnableItemIteration) {
      return begin().chunk();
    } else {
      return chunks_ + chunkMask_;
    }
  }

  template <typename T>
  void directBuildFrom(T&& src) {
    FOLLY_SAFE_DCHECK(src.size() > 0 && chunkMask_ == src.chunkMask_, "");

    // We use std::forward<T> to allow portions of src to be moved out by
    // either beforeBuild or afterBuild, but we are just relying on good
    // behavior of our Policy superclass to ensure that any particular
    // field of this is a donor at most once.

    auto undoState =
        this->beforeBuild(src.size(), bucket_count(), std::forward<T>(src));
    bool success = false;
    SCOPE_EXIT {
      this->afterBuild(
          undoState, success, src.size(), bucket_count(), std::forward<T>(src));
    };

    // Copy can fail part-way through if a Value copy constructor throws.
    // Failing afterBuild is limited in its cleanup power in this case,
    // because it can't enumerate the items that were actually copied.
    // Fortunately we can divide the situation into cases where all of
    // the state is owned by the table itself (F14Node and F14Value),
    // for which clearImpl() can do partial cleanup, and cases where all
    // of the values are owned by the policy (F14Vector), in which case
    // partial failure should not occur.  Sorry for the subtle invariants
    // in the Policy API.

    if (is_trivially_copyable<Item>::value && !this->destroyItemOnClear() &&
        itemCount() == src.itemCount()) {
      FOLLY_SAFE_DCHECK(chunkMask_ == src.chunkMask_, "");

      auto scale = chunks_->capacityScale();

      // happiest path: a single memcpy of the whole chunk array
      auto n = chunkAllocSize(chunkMask_ + 1, scale);
      std::memcpy(&chunks_[0], &src.chunks_[0], n);
      sizeAndPackedBegin_.size_ = src.size();
      if (kEnableItemIteration) {
        auto srcBegin = src.begin();
        sizeAndPackedBegin_.packedBegin() =
            ItemIter{chunks_ + (srcBegin.chunk() - src.chunks_),
                     srcBegin.index()}
                .pack();
      }
      if (kContinuousCapacity) {
        // capacityScale might not match even if itemCount matches
        chunks_->setCapacityScale(scale);
      }
    } else {
      // Happy path, no rehash but pack items toward bottom of chunk
      // and use copy constructor.  Don't try to optimize by using
      // lastOccupiedChunk() because there may be higher unoccupied chunks
      // with the overflow bit set.
      auto srcChunk = &src.chunks_[chunkMask_];
      Chunk* dstChunk = &chunks_[chunkMask_];
      do {
        dstChunk->copyOverflowInfoFrom(*srcChunk);

        auto iter = srcChunk->occupiedIter();
        if (prefetchBeforeCopy()) {
          for (auto piter = iter; piter.hasNext();) {
            this->prefetchValue(srcChunk->citem(piter.next()));
          }
        }

        std::size_t dstI = 0;
        for (; iter.hasNext(); ++dstI) {
          auto srcI = iter.next();
          auto&& srcArg =
              std::forward<T>(src).buildArgForItem(srcChunk->item(srcI));
          auto dst = dstChunk->itemAddr(dstI);
          this->constructValueAtItem(
              0, dst, std::forward<decltype(srcArg)>(srcArg));
          dstChunk->setTag(dstI, srcChunk->tag(srcI));
          ++sizeAndPackedBegin_.size_;
        }

        --srcChunk;
        --dstChunk;
      } while (size() != src.size());

      // reset doesn't care about packedBegin, so we don't fix it until the end
      if (kEnableItemIteration) {
        std::size_t maxChunkIndex = src.lastOccupiedChunk() - src.chunks_;
        sizeAndPackedBegin_.packedBegin() =
            ItemIter{chunks_ + maxChunkIndex,
                     chunks_[maxChunkIndex].lastOccupied().index()}
                .pack();
      }
    }

    success = true;
  }

  template <typename T>
  void rehashBuildFrom(T&& src) {
    FOLLY_SAFE_DCHECK(src.chunkMask_ > chunkMask_, "");

    // 1 byte per chunk means < 1 bit per value temporary overhead
    std::array<uint8_t, 256> stackBuf;
    uint8_t* fullness;
    auto cc = chunkMask_ + 1;
    if (cc <= stackBuf.size()) {
      fullness = stackBuf.data();
    } else {
      ByteAlloc a{this->alloc()};
      fullness = &*std::allocator_traits<ByteAlloc>::allocate(a, cc);
    }
    SCOPE_EXIT {
      if (cc > stackBuf.size()) {
        ByteAlloc a{this->alloc()};
        std::allocator_traits<ByteAlloc>::deallocate(
            a,
            std::pointer_traits<typename std::allocator_traits<
                ByteAlloc>::pointer>::pointer_to(*fullness),
            cc);
      }
    };
    std::memset(fullness, '\0', cc);

    // We use std::forward<T> to allow portions of src to be moved out by
    // either beforeBuild or afterBuild, but we are just relying on good
    // behavior of our Policy superclass to ensure that any particular
    // field of this is a donor at most once.

    // Exception safety requires beforeBuild to happen after all of the
    // allocate() calls.
    auto undoState =
        this->beforeBuild(src.size(), bucket_count(), std::forward<T>(src));
    bool success = false;
    SCOPE_EXIT {
      this->afterBuild(
          undoState, success, src.size(), bucket_count(), std::forward<T>(src));
    };

    // The current table is at a valid state at all points for policies
    // in which non-trivial values are owned by the main table (F14Node
    // and F14Value), so reset() will clean things up properly if we
    // fail partway through.  For the case that the policy manages value
    // lifecycle (F14Vector) then nothing after beforeBuild can throw and
    // we don't have to worry about partial failure.

    std::size_t srcChunkIndex = src.lastOccupiedChunk() - src.chunks_;
    while (true) {
      auto srcChunk = &src.chunks_[srcChunkIndex];
      auto iter = srcChunk->occupiedIter();
      if (prefetchBeforeRehash()) {
        for (auto piter = iter; piter.hasNext();) {
          this->prefetchValue(srcChunk->item(piter.next()));
        }
      }
      if (srcChunk->hostedOverflowCount() == 0) {
        // all items are in their preferred chunk (no probing), so we
        // don't need to compute any hash values
        while (iter.hasNext()) {
          auto i = iter.next();
          auto& srcItem = srcChunk->item(i);
          auto&& srcArg = std::forward<T>(src).buildArgForItem(srcItem);
          HashPair hp{srcChunkIndex, srcChunk->tag(i)};
          insertAtBlank(
              allocateTag(fullness, hp),
              hp,
              std::forward<decltype(srcArg)>(srcArg));
        }
      } else {
        // any chunk's items might be in here
        while (iter.hasNext()) {
          auto i = iter.next();
          auto& srcItem = srcChunk->item(i);
          auto&& srcArg = std::forward<T>(src).buildArgForItem(srcItem);
          auto const& srcKey = src.keyForValue(srcArg);
          auto hp = splitHash(this->computeKeyHash(srcKey));
          FOLLY_SAFE_CHECK(hp.second == srcChunk->tag(i), "");
          insertAtBlank(
              allocateTag(fullness, hp),
              hp,
              std::forward<decltype(srcArg)>(srcArg));
        }
      }
      if (srcChunkIndex == 0) {
        break;
      }
      --srcChunkIndex;
    }

    success = true;
  }

  template <typename T>
  FOLLY_NOINLINE void buildFromF14Table(T&& src) {
    FOLLY_SAFE_DCHECK(bucket_count() == 0, "");
    if (src.size() == 0) {
      return;
    }

    // Use the source's capacity, unless it is oversized.
    auto upperLimit = computeChunkCountAndScale(src.size(), false, false);
    auto ccas = std::make_pair(
        std::size_t{src.chunkMask_} + 1, src.chunks_->capacityScale());
    FOLLY_SAFE_DCHECK(
        ccas.first >= upperLimit.first,
        "rounded chunk count can't be bigger than actual");
    if (ccas.first > upperLimit.first || ccas.second > upperLimit.second) {
      ccas = upperLimit;
    }
    rehashImpl(0, 1, 0, ccas.first, ccas.second);

    try {
      if (chunkMask_ == src.chunkMask_) {
        directBuildFrom(std::forward<T>(src));
      } else {
        rehashBuildFrom(std::forward<T>(src));
      }
    } catch (...) {
      reset();
      F14LinkCheck<getF14IntrinsicsMode()>::check();
      throw;
    }
  }

  void reserveImpl(std::size_t desiredCapacity) {
    desiredCapacity = std::max<std::size_t>(desiredCapacity, size());
    if (desiredCapacity == 0) {
      reset();
      return;
    }

    auto origChunkCount = chunkMask_ + 1;
    auto origCapacityScale = chunks_->capacityScale();
    auto origCapacity = computeCapacity(origChunkCount, origCapacityScale);

    // This came from an explicit reserve() or rehash() call, so there's
    // a good chance the capacity is exactly right.  To avoid O(n^2)
    // behavior, we don't do rehashes that decrease the size by less
    // than 1/8, and if we have a requested increase of less than 1/8 we
    // instead go to the next power of two.
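    //
    // For example, with origCapacity == 96 any desiredCapacity in
    // [84, 96] is a no-op here, while a desiredCapacity of 97..107
    // clears attemptExact so the capacity is rounded up rather than
    // matched exactly.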

    if (desiredCapacity <= origCapacity &&
        desiredCapacity >= origCapacity - origCapacity / 8) {
      return;
    }
    bool attemptExact =
        !(desiredCapacity > origCapacity &&
          desiredCapacity < origCapacity + origCapacity / 8);

    std::size_t newChunkCount;
    std::size_t newCapacityScale;
    std::tie(newChunkCount, newCapacityScale) = computeChunkCountAndScale(
        desiredCapacity, attemptExact, kContinuousCapacity && attemptExact);
    auto newCapacity = computeCapacity(newChunkCount, newCapacityScale);

    if (origCapacity != newCapacity) {
      rehashImpl(
          size(),
          origChunkCount,
          origCapacityScale,
          newChunkCount,
          newCapacityScale);
    }
  }

  FOLLY_NOINLINE void reserveForInsertImpl(
      std::size_t capacityMinusOne,
      std::size_t origChunkCount,
      std::size_t origCapacityScale,
      std::size_t origCapacity) {
    FOLLY_SAFE_DCHECK(capacityMinusOne >= size(), "");
    std::size_t capacity = capacityMinusOne + 1;

    // we want to grow by between 2^0.5 and 2^1.5 ending at a "good"
    // size, so we grow by 2^0.5 and then round up

    // 1.01101_2 = 1.40625
    std::size_t minGrowth = origCapacity + (origCapacity >> 2) +
        (origCapacity >> 3) + (origCapacity >> 5);
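    // (e.g. origCapacity == 64 gives minGrowth == 64 + 16 + 8 + 2 == 90,
    // roughly 1.41x, before rounding up to a representable capacity)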
    capacity = std::max<std::size_t>(capacity, minGrowth);

    std::size_t newChunkCount;
    std::size_t newCapacityScale;
    std::tie(newChunkCount, newCapacityScale) =
        computeChunkCountAndScale(capacity, false, false);

    FOLLY_SAFE_DCHECK(
        computeCapacity(newChunkCount, newCapacityScale) > origCapacity, "");

    rehashImpl(
        size(),
        origChunkCount,
        origCapacityScale,
        newChunkCount,
        newCapacityScale);
  }

  void rehashImpl(
      std::size_t origSize,
      std::size_t origChunkCount,
      std::size_t origCapacityScale,
      std::size_t newChunkCount,
      std::size_t newCapacityScale) {
    auto origChunks = chunks_;
    auto origCapacity = computeCapacity(origChunkCount, origCapacityScale);
    auto origAllocSize = chunkAllocSize(origChunkCount, origCapacityScale);
    auto newCapacity = computeCapacity(newChunkCount, newCapacityScale);
    auto newAllocSize = chunkAllocSize(newChunkCount, newCapacityScale);

    BytePtr rawAllocation;
    auto undoState = this->beforeRehash(
        origSize, origCapacity, newCapacity, newAllocSize, rawAllocation);
    chunks_ = initializeChunks(rawAllocation, newChunkCount, newCapacityScale);

    FOLLY_SAFE_DCHECK(
        newChunkCount < std::numeric_limits<InternalSizeType>::max(), "");
    chunkMask_ = static_cast<InternalSizeType>(newChunkCount - 1);

    bool success = false;
    SCOPE_EXIT {
      // this SCOPE_EXIT reverts chunks_ and chunkMask_ if necessary
      BytePtr finishedRawAllocation = nullptr;
      std::size_t finishedAllocSize = 0;
      if (LIKELY(success)) {
        if (origCapacity > 0) {
          finishedRawAllocation = std::pointer_traits<BytePtr>::pointer_to(
              *static_cast<uint8_t*>(static_cast<void*>(&*origChunks)));
          finishedAllocSize = origAllocSize;
        }
      } else {
        finishedRawAllocation = rawAllocation;
        finishedAllocSize = newAllocSize;
        chunks_ = origChunks;
        FOLLY_SAFE_DCHECK(
            origChunkCount < std::numeric_limits<InternalSizeType>::max(), "");
        chunkMask_ = static_cast<InternalSizeType>(origChunkCount - 1);
        F14LinkCheck<getF14IntrinsicsMode()>::check();
      }

      this->afterRehash(
          std::move(undoState),
          success,
          origSize,
          origCapacity,
          newCapacity,
          finishedRawAllocation,
          finishedAllocSize);
    };

    if (origSize == 0) {
      // nothing to do
    } else if (origChunkCount == 1 && newChunkCount == 1) {
      // no mask, no chunk scan, no hash computation, no probing
      auto srcChunk = origChunks;
      auto dstChunk = chunks_;
      std::size_t srcI = 0;
      std::size_t dstI = 0;
      while (dstI < origSize) {
        if (LIKELY(srcChunk->occupied(srcI))) {
          dstChunk->setTag(dstI, srcChunk->tag(srcI));
          this->moveItemDuringRehash(
              dstChunk->itemAddr(dstI), srcChunk->item(srcI));
          ++dstI;
        }
        ++srcI;
      }
      if (kEnableItemIteration) {
        sizeAndPackedBegin_.packedBegin() = ItemIter{dstChunk, dstI - 1}.pack();
      }
    } else {
      // 1 byte per chunk means < 1 bit per value temporary overhead
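      // fullness[i] counts how many slots of new chunk i have been
      // handed out so far; allocateTag uses it to pick the next free
      // slot as items are reinserted.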
      std::array<uint8_t, 256> stackBuf;
      uint8_t* fullness;
      if (newChunkCount <= stackBuf.size()) {
        fullness = stackBuf.data();
      } else {
        ByteAlloc a{this->alloc()};
        // may throw
        fullness =
            &*std::allocator_traits<ByteAlloc>::allocate(a, newChunkCount);
      }
      std::memset(fullness, '\0', newChunkCount);
      SCOPE_EXIT {
        if (newChunkCount > stackBuf.size()) {
          ByteAlloc a{this->alloc()};
          std::allocator_traits<ByteAlloc>::deallocate(
              a,
              std::pointer_traits<typename std::allocator_traits<
                  ByteAlloc>::pointer>::pointer_to(*fullness),
              newChunkCount);
        }
      };

      auto srcChunk = origChunks + origChunkCount - 1;
      std::size_t remaining = origSize;
      while (remaining > 0) {
        auto iter = srcChunk->occupiedIter();
        if (prefetchBeforeRehash()) {
          for (auto piter = iter; piter.hasNext();) {
            this->prefetchValue(srcChunk->item(piter.next()));
          }
        }
        while (iter.hasNext()) {
          --remaining;
          auto srcI = iter.next();
          Item& srcItem = srcChunk->item(srcI);
          auto hp = splitHash(
              this->computeItemHash(const_cast<Item const&>(srcItem)));
          FOLLY_SAFE_CHECK(hp.second == srcChunk->tag(srcI), "");

          auto dstIter = allocateTag(fullness, hp);
          this->moveItemDuringRehash(dstIter.itemAddr(), srcItem);
        }
        --srcChunk;
      }

      if (kEnableItemIteration) {
        // this code replaces what would otherwise be size() separate
        // invocations of adjustSizeAndBeginAfterInsert
        std::size_t i = chunkMask_;
        while (fullness[i] == 0) {
          --i;
        }
        sizeAndPackedBegin_.packedBegin() =
            ItemIter{chunks_ + i, std::size_t{fullness[i]} - 1}.pack();
      }
    }

    success = true;
  }

  // Randomization to help expose bugs when running tests in debug or
  // sanitizer builds

  FOLLY_ALWAYS_INLINE void debugModeOnReserve(std::size_t capacity) {
    if (kIsLibrarySanitizeAddress || kIsDebug) {
      if (capacity > size()) {
        tlsPendingSafeInserts(static_cast<std::ptrdiff_t>(capacity - size()));
      }
    }
  }

  void debugModeSpuriousRehash() {
    auto cc = chunkMask_ + 1;
    auto ss = chunks_->capacityScale();
    rehashImpl(size(), cc, ss, cc, ss);
  }

  FOLLY_ALWAYS_INLINE void debugModeBeforeInsert() {
    // When running under ASAN, we add a spurious rehash with 1/size()
    // probability before every insert.  This means that finding reference
    // stability problems for F14Value and F14Vector is much more likely.
    // The most common pattern that causes this is
    //
    //   auto& ref = map[k1]; map[k2] = foo(ref);
    //
    // One way to fix this is to call map.reserve(N) before such a
    // sequence, where N is the number of keys that might be inserted
    // within the section that retains references plus the existing size.
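    //
    // For the pattern above that would be
    //
    //   map.reserve(map.size() + 2); auto& ref = map[k1]; map[k2] = foo(ref);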
    if (kIsLibrarySanitizeAddress && !tlsPendingSafeInserts() && size() > 0 &&
        tlsMinstdRand(size()) == 0) {
      debugModeSpuriousRehash();
    }
  }

  FOLLY_ALWAYS_INLINE void debugModeAfterInsert() {
    if (kIsLibrarySanitizeAddress || kIsDebug) {
      tlsPendingSafeInserts(-1);
    }
  }

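  // Controlled by FOLLY_F14_PERTURB_INSERTION_ORDER: occasionally moves a
  // new item to a random unoccupied slot later in its chunk, so nothing
  // can accidentally depend on a deterministic slot order within a chunk.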
  FOLLY_ALWAYS_INLINE void debugModePerturbSlotInsertOrder(
      ChunkPtr chunk,
      std::size_t& itemIndex) {
    FOLLY_SAFE_DCHECK(!chunk->occupied(itemIndex), "");
    constexpr bool perturbSlot = FOLLY_F14_PERTURB_INSERTION_ORDER;
    if (perturbSlot && !tlsPendingSafeInserts()) {
      std::size_t e = chunkMask_ == 0 ? bucket_count() : Chunk::kCapacity;
      std::size_t i = itemIndex + tlsMinstdRand(e - itemIndex);
      if (!chunk->occupied(i)) {
        itemIndex = i;
      }
    }
  }

 public:
  // users have no control over max_load_factor, so rehash() simply
  // forwards to reserve()

  void rehash(std::size_t capacity) {
    reserve(capacity);
  }

  void reserve(std::size_t capacity) {
    // We want to support the pattern
    //   map.reserve(map.size() + 2); auto& r1 = map[k1]; auto& r2 = map[k2];
    debugModeOnReserve(capacity);
    reserveImpl(capacity);
  }

  void reserveForInsert(size_t incoming = 1) {
    FOLLY_SAFE_DCHECK(incoming > 0, "");

    auto needed = size() + incoming;
    auto chunkCount = chunkMask_ + 1;
    auto scale = chunks_->capacityScale();
    auto existing = computeCapacity(chunkCount, scale);
    if (needed - 1 >= existing) {
      reserveForInsertImpl(needed - 1, chunkCount, scale, existing);
    }
  }

  // Returns {pos, true} if a value was constructed, or {pos, false} if an
  // existing value was found.  key is only used during the search; all
  // constructor args for an inserted value come from args...  key won't
  // be accessed after args are touched.
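  //
  // A map-like wrapper would typically pass the key for the search plus
  // piecewise-construction arguments for the stored pair, along these
  // lines (a sketch only, not the exact F14 map code):
  //
  //   auto rv = table_.tryEmplaceValue(
  //       key,
  //       std::piecewise_construct,
  //       std::forward_as_tuple(key),
  //       std::forward_as_tuple(std::forward<Args>(args)...));
  //   // rv.second is true iff a new value was constructed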
  template <typename K, typename... Args>
  std::pair<ItemIter, bool> tryEmplaceValue(K const& key, Args&&... args) {
    const auto hp = splitHash(this->computeKeyHash(key));

    if (size() > 0) {
      auto existing = findImpl(hp, key);
      if (!existing.atEnd()) {
        return std::make_pair(existing, false);
      }
    }

    debugModeBeforeInsert();

    reserveForInsert();

    std::size_t index = hp.first;
    ChunkPtr chunk = chunks_ + (index & chunkMask_);
    auto firstEmpty = chunk->firstEmpty();

    if (!firstEmpty.hasIndex()) {
      std::size_t delta = probeDelta(hp);
      do {
        chunk->incrOutboundOverflowCount();
        index += delta;
        chunk = chunks_ + (index & chunkMask_);
        firstEmpty = chunk->firstEmpty();
      } while (!firstEmpty.hasIndex());
      chunk->adjustHostedOverflowCount(Chunk::kIncrHostedOverflowCount);
    }
    std::size_t itemIndex = firstEmpty.index();

    debugModePerturbSlotInsertOrder(chunk, itemIndex);

    chunk->setTag(itemIndex, hp.second);
    ItemIter iter{chunk, itemIndex};

    // insertAtBlank will clear the tag if the constructor throws
    insertAtBlank(iter, hp, std::forward<Args>(args)...);

    debugModeAfterInsert();

    return std::make_pair(iter, true);
  }

 private:
  template <bool Reset>
  void clearImpl() noexcept {
    if (chunks_ == Chunk::emptyInstance()) {
      FOLLY_SAFE_DCHECK(empty() && bucket_count() == 0, "");
      return;
    }

    // turn clear into reset if the table is >= 16 chunks so that
    // we don't get too low a load factor
    bool willReset = Reset || chunkMask_ + 1 >= 16;

    auto origSize = size();
    auto origCapacity = bucket_count();
    if (willReset) {
      this->beforeReset(origSize, origCapacity);
    } else {
      this->beforeClear(origSize, origCapacity);
    }

    if (!empty()) {
      if (destroyItemOnClear()) {
        for (std::size_t ci = 0; ci <= chunkMask_; ++ci) {
          ChunkPtr chunk = chunks_ + ci;
          auto iter = chunk->occupiedIter();
          if (prefetchBeforeDestroy()) {
            for (auto piter = iter; piter.hasNext();) {
              this->prefetchValue(chunk->item(piter.next()));
            }
          }
          while (iter.hasNext()) {
            this->destroyItem(chunk->item(iter.next()));
          }
        }
      }
      if (!willReset) {
        // It's okay to do this in a separate loop because we only do it
        // when the chunk count is small.  That avoids a branch when we
        // are promoting a clear to a reset for a large table.
        auto scale = chunks_[0].capacityScale();
        for (std::size_t ci = 0; ci <= chunkMask_; ++ci) {
          chunks_[ci].clear();
        }
        chunks_[0].markEof(scale);
      }
      if (kEnableItemIteration) {
        sizeAndPackedBegin_.packedBegin() = ItemIter{}.pack();
      }
      sizeAndPackedBegin_.size_ = 0;
    }

    if (willReset) {
      BytePtr rawAllocation = std::pointer_traits<BytePtr>::pointer_to(
          *static_cast<uint8_t*>(static_cast<void*>(&*chunks_)));
      std::size_t rawSize =
          chunkAllocSize(chunkMask_ + 1, chunks_->capacityScale());

      chunks_ = Chunk::emptyInstance();
      chunkMask_ = 0;

      this->afterReset(origSize, origCapacity, rawAllocation, rawSize);
    } else {
      this->afterClear(origSize, origCapacity);
    }
  }

  void eraseImpl(ItemIter pos, HashPair hp) {
    this->destroyItem(pos.item());
    adjustSizeAndBeginBeforeErase(pos);
    eraseBlank(pos, hp);
  }

 public:
  // The item needs to still be hashable during this call.  If you want
  // to intercept the value before it is destroyed (to extract it, for
  // example), do so in the beforeDestroy callback.
  template <typename BeforeDestroy>
  void eraseIterInto(ItemIter pos, BeforeDestroy&& beforeDestroy) {
    HashPair hp{};
    if (pos.chunk()->hostedOverflowCount() != 0) {
      hp = splitHash(this->computeItemHash(pos.citem()));
    }
    beforeDestroy(this->valueAtItemForExtract(pos.item()));
    eraseImpl(pos, hp);
  }

  template <typename K, typename BeforeDestroy>
  std::size_t eraseKeyInto(K const& key, BeforeDestroy&& beforeDestroy) {
    if (UNLIKELY(size() == 0)) {
      return 0;
    }
    auto hp = splitHash(this->computeKeyHash(key));
    auto iter = findImpl(hp, key);
    if (!iter.atEnd()) {
      beforeDestroy(this->valueAtItemForExtract(iter.item()));
      eraseImpl(iter, hp);
      return 1;
    } else {
      return 0;
    }
  }

  void clear() noexcept {
    if (kIsLibrarySanitizeAddress) {
      // force recycling of heap memory
      auto bc = bucket_count();
      reset();
      try {
        reserveImpl(bc);
      } catch (std::bad_alloc const&) {
        // ASAN mode only, keep going
      }
    } else {
      clearImpl<false>();
    }
  }

  // Like clear(), but always frees all dynamic storage allocated
  // by the table.
  void reset() noexcept {
    clearImpl<true>();
  }

  // Get memory footprint, not including sizeof(*this).
  std::size_t getAllocatedMemorySize() const {
    std::size_t sum = 0;
    visitAllocationClasses(
        [&sum](std::size_t bytes, std::size_t n) { sum += bytes * n; });
    return sum;
  }

  // Enumerates classes of allocated memory blocks currently owned
  // by this table, calling visitor(allocationSize, allocationCount).
  // This can be used to get a more accurate indication of memory footprint
  // than getAllocatedMemorySize() if you have some way of computing the
  // internal fragmentation of the allocator, such as JEMalloc's nallocx.
  // The visitor might be called twice with the same allocationSize. The
  // visitor's computation should produce the same result for visitor(8,
  // 2) as for two calls to visitor(8, 1), for example.  The visitor may
  // be called with a zero allocationCount.
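  //
  // For example, with jemalloc one could approximate the real heap
  // footprint roughly like this (a sketch; assumes nallocx is available):
  //
  //   std::size_t heapBytes = 0;
  //   table.visitAllocationClasses([&](std::size_t bytes, std::size_t n) {
  //     heapBytes += (bytes == 0 ? 0 : nallocx(bytes, 0)) * n;
  //   });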
  template <typename V>
  void visitAllocationClasses(V&& visitor) const {
    auto scale = chunks_->capacityScale();
    this->visitPolicyAllocationClasses(
        scale == 0 ? 0 : chunkAllocSize(chunkMask_ + 1, scale),
        size(),
        bucket_count(),
        visitor);
  }

  // visitor should take an Item const&
  template <typename V>
  void visitItems(V&& visitor) const {
    if (empty()) {
      return;
    }
    std::size_t maxChunkIndex = lastOccupiedChunk() - chunks_;
    auto chunk = &chunks_[0];
    for (std::size_t i = 0; i <= maxChunkIndex; ++i, ++chunk) {
      auto iter = chunk->occupiedIter();
      if (prefetchBeforeCopy()) {
        for (auto piter = iter; piter.hasNext();) {
          this->prefetchValue(chunk->citem(piter.next()));
        }
      }
      while (iter.hasNext()) {
        visitor(chunk->citem(iter.next()));
      }
    }
  }

  // visitor should take two Item const*, denoting the half-open range
  // [first, last) of a contiguous run of occupied items
  template <typename V>
  void visitContiguousItemRanges(V&& visitor) const {
    if (empty()) {
      return;
    }
    std::size_t maxChunkIndex = lastOccupiedChunk() - chunks_;
    auto chunk = &chunks_[0];
    for (std::size_t i = 0; i <= maxChunkIndex; ++i, ++chunk) {
      for (auto iter = chunk->occupiedRangeIter(); iter.hasNext();) {
        auto be = iter.next();
        FOLLY_SAFE_DCHECK(
            chunk->occupied(be.first) && chunk->occupied(be.second - 1), "");
        Item const* b = chunk->itemAddr(be.first);
        visitor(b, b + (be.second - be.first));
      }
    }
  }

 private:
  static std::size_t& histoAt(
      std::vector<std::size_t>& histo,
      std::size_t index) {
    if (histo.size() <= index) {
      histo.resize(index + 1);
    }
    return histo.at(index);
  }

 public:
  // Expensive
  F14TableStats computeStats() const {
    F14TableStats stats;

    if (kIsDebug && kEnableItemIteration) {
      // validate iteration
      std::size_t n = 0;
      ItemIter prev;
      for (auto iter = begin(); iter != end(); iter.advance()) {
        FOLLY_SAFE_DCHECK(n == 0 || iter.pack() < prev.pack(), "");
        ++n;
        prev = iter;
      }
      FOLLY_SAFE_DCHECK(n == size(), "");
    }

    FOLLY_SAFE_DCHECK(
        (chunks_ == Chunk::emptyInstance()) == (bucket_count() == 0), "");

    std::size_t n1 = 0;
    std::size_t n2 = 0;
    auto cc = bucket_count() == 0 ? 0 : chunkMask_ + 1;
    for (std::size_t ci = 0; ci < cc; ++ci) {
      ChunkPtr chunk = chunks_ + ci;
      FOLLY_SAFE_DCHECK(chunk->eof() == (ci == 0), "");

      auto iter = chunk->occupiedIter();

      std::size_t chunkOccupied = 0;
      for (auto piter = iter; piter.hasNext(); piter.next()) {
        ++chunkOccupied;
      }
      n1 += chunkOccupied;

      histoAt(stats.chunkOccupancyHisto, chunkOccupied)++;
      histoAt(
          stats.chunkOutboundOverflowHisto, chunk->outboundOverflowCount())++;
      histoAt(stats.chunkHostedOverflowHisto, chunk->hostedOverflowCount())++;

      while (iter.hasNext()) {
        auto ii = iter.next();
        ++n2;

        {
          auto& item = chunk->citem(ii);
          auto hp = splitHash(this->computeItemHash(item));
          FOLLY_SAFE_DCHECK(chunk->tag(ii) == hp.second, "");

          std::size_t dist = 1;
          std::size_t index = hp.first;
          std::size_t delta = probeDelta(hp);
          while ((index & chunkMask_) != ci) {
            index += delta;
            ++dist;
          }

          histoAt(stats.keyProbeLengthHisto, dist)++;
        }

        // misses could have any tag, so we do the dumb but accurate
        // thing and just try them all
        for (std::size_t ti = 0; ti < 256; ++ti) {
          uint8_t tag = static_cast<uint8_t>(ti == 0 ? 1 : 0);
          HashPair hp{ci, tag};

          std::size_t dist = 1;
          std::size_t index = hp.first;
          std::size_t delta = probeDelta(hp);
          for (std::size_t tries = 0; tries <= chunkMask_ &&
               chunks_[index & chunkMask_].outboundOverflowCount() != 0;
               ++tries) {
            index += delta;
            ++dist;
          }

          histoAt(stats.missProbeLengthHisto, dist)++;
        }
      }
    }

    FOLLY_SAFE_DCHECK(n1 == size(), "");
    FOLLY_SAFE_DCHECK(n2 == size(), "");

    stats.policy = pretty_name<Policy>();
    stats.size = size();
    stats.valueSize = sizeof(value_type);
    stats.bucketCount = bucket_count();
    stats.chunkCount = cc;

    stats.totalBytes = sizeof(*this) + getAllocatedMemorySize();
    stats.overheadBytes = stats.totalBytes - size() * sizeof(value_type);

    return stats;
  }
};
} // namespace detail
} // namespace f14

#endif // FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE

namespace f14 {
namespace test {
inline void disableInsertOrderRandomization() {
  if (kIsLibrarySanitizeAddress || kIsDebug) {
    detail::tlsPendingSafeInserts(static_cast<std::ptrdiff_t>(
        (std::numeric_limits<std::size_t>::max)() / 2));
  }
}
} // namespace test
} // namespace f14
} // namespace folly