/* * Copyright (c) Facebook, Inc. and its affiliates. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #pragma once #include #include #include #include #include #include #include #include #include #include #include #if FOLLY_HAS_STRING_VIEW #include // @manual #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if FOLLY_LIBRARY_SANITIZE_ADDRESS && defined(FOLLY_TLS) #define FOLLY_F14_TLS_IF_ASAN FOLLY_TLS #else #define FOLLY_F14_TLS_IF_ASAN #endif #if FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE #if FOLLY_F14_CRC_INTRINSIC_AVAILABLE #if FOLLY_NEON #include // __crc32cd #else #include // _mm_crc32_u64 #endif #else #ifdef _WIN32 #include // _mul128 in fallback bit mixer #endif #endif #if FOLLY_NEON #include // uint8x16t intrinsics #else // SSE2 #include // _mm_set1_epi8 #include // __m128i intrinsics #include // _mm_prefetch #endif #endif #ifndef FOLLY_F14_PERTURB_INSERTION_ORDER #define FOLLY_F14_PERTURB_INSERTION_ORDER folly::kIsDebug #endif namespace folly { struct F14TableStats { char const* policy; std::size_t size{0}; std::size_t valueSize{0}; std::size_t bucketCount{0}; std::size_t chunkCount{0}; std::vector chunkOccupancyHisto; std::vector chunkOutboundOverflowHisto; std::vector chunkHostedOverflowHisto; std::vector keyProbeLengthHisto; std::vector missProbeLengthHisto; std::size_t totalBytes{0}; std::size_t overheadBytes{0}; private: template static auto computeHelper(T const* m) -> decltype(m->computeStats()) { return m->computeStats(); } static F14TableStats computeHelper(...) { return {}; } public: template static F14TableStats compute(T const& m) { return computeHelper(&m); } }; namespace f14 { namespace detail { template struct F14LinkCheck {}; template <> struct F14LinkCheck { // The purpose of this method is to trigger a link failure if // compilation flags vary across compilation units. The definition // is in F14Table.cpp, so only one of F14LinkCheck::check, // F14LinkCheck::check, or F14LinkCheck::check will // be available at link time. // // To cause a link failure the function must be invoked in code that // is not optimized away, so we call it on a couple of cold paths // (exception handling paths in copy construction and rehash). LTO may // remove it entirely, but that's fine. 
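  //
  // A minimal sketch of the underlying idiom (illustrative declarations
  // only, not this file's exact ones): the build configuration is baked
  // into the template argument, and therefore into the mangled name of
  // check(), so a translation unit compiled with a different configuration
  // references a symbol that F14Table.cpp never defines:
  //
  //   enum class Mode { kA, kB };
  //   template <Mode M> struct LinkCheck {
  //     static void check() noexcept;  // declared for every Mode...
  //   };
  //   // ...but defined in a single .cpp only for the Mode that library
  //   // was built with, so a TU built with a different Mode references an
  //   // undefined LinkCheck<M>::check() and fails at link time.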
static void check() noexcept; }; bool tlsPendingSafeInserts(std::ptrdiff_t delta = 0); std::size_t tlsMinstdRand(std::size_t n); #if defined(_LIBCPP_VERSION) template struct StdNodeReplica { void* next; std::size_t hash; V value; }; #else template struct StdIsFastHash : std::true_type {}; template <> struct StdIsFastHash> : std::false_type {}; template struct StdIsFastHash>> : std::false_type { }; #if FOLLY_HAS_STRING_VIEW template struct StdIsFastHash>> : std::false_type {}; #endif // mimic internal node of unordered containers in STL to estimate the size template struct StdNodeReplica { void* next; V value; }; template struct StdNodeReplica< K, V, H, std::enable_if_t< !StdIsFastHash::value || !is_nothrow_invocable_v>> { void* next; V value; std::size_t hash; }; #endif template void erase_if_impl(Container& c, Predicate& predicate) { for (auto i = c.begin(), last = c.end(); i != last;) { auto prev = i++; if (predicate(*prev)) { c.erase(prev); } } } } // namespace detail } // namespace f14 #if FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE namespace f14 { namespace detail { template class F14Table; } // namespace detail } // namespace f14 class F14HashToken final { public: F14HashToken() = default; private: using HashPair = std::pair; explicit F14HashToken(HashPair hp) : hp_(hp) {} explicit operator HashPair() const { return hp_; } HashPair hp_; template friend class f14::detail::F14Table; }; namespace f14 { namespace detail { //// Defaults should be selected using void template using VoidDefault = std::conditional_t::value, void, Arg>; template using Defaulted = std::conditional_t::value, Default, Arg>; template < typename TableKey, typename Hasher, typename KeyEqual, typename ArgKey> struct EligibleForHeterogeneousFind : Conjunction< is_transparent, is_transparent, is_invocable, is_invocable> {}; template < typename TableKey, typename Hasher, typename KeyEqual, typename ArgKey> using EligibleForHeterogeneousInsert = Conjunction< EligibleForHeterogeneousFind, std::is_constructible>; //////////////// template FOLLY_ALWAYS_INLINE static void prefetchAddr(T const* ptr) { #ifndef _WIN32 __builtin_prefetch(static_cast(ptr)); #elif FOLLY_NEON __prefetch(static_cast(ptr)); #else _mm_prefetch( static_cast(static_cast(ptr)), _MM_HINT_T0); #endif } #if FOLLY_NEON using TagVector = uint8x16_t; #else // SSE2 using TagVector = __m128i; #endif // We could use unaligned loads to relax this requirement, but that // would be both a performance penalty and require a bulkier packed // ItemIter format constexpr std::size_t kRequiredVectorAlignment = constexpr_max(std::size_t{16}, alignof(max_align_t)); using EmptyTagVectorType = std::aligned_storage_t< sizeof(TagVector) + kRequiredVectorAlignment, alignof(max_align_t)>; extern EmptyTagVectorType kEmptyTagVector; template struct alignas(kRequiredVectorAlignment) F14Chunk { using Item = ItemType; // For our 16 byte vector alignment (and assuming alignof(Item) >= // 4) kCapacity of 14 is the most space efficient. Slightly smaller // or larger capacities can help with cache alignment in a couple of // cases without wasting too much space, but once the items are larger // then we're unlikely to get much benefit anyway. The only case we // optimize is using kCapacity of 12 for 4 byte items, which makes the // chunk take exactly 1 cache line, and adding 16 bytes of padding for // 16 byte items so that a chunk takes exactly 4 cache lines. static constexpr unsigned kCapacity = sizeof(Item) == 4 ? 
12 : 14; static constexpr unsigned kDesiredCapacity = kCapacity - 2; static constexpr unsigned kAllocatedCapacity = kCapacity + (sizeof(Item) == 16 ? 1 : 0); // If kCapacity == 12 then we get 16 bits of capacityScale by using // tag 12 and 13, otherwise we only get 4 bits of control_ static constexpr std::size_t kCapacityScaleBits = kCapacity == 12 ? 16 : 4; static constexpr std::size_t kCapacityScaleShift = kCapacityScaleBits - 4; static constexpr MaskType kFullMask = FullMask::value; // Non-empty tags have their top bit set. tags_ array might be bigger // than kCapacity to keep alignment of first item. std::array tags_; // Bits 0..3 of chunk 0 record the scaling factor between the number of // chunks and the max size without rehash. Bits 4-7 in any chunk are a // 4-bit counter of the number of values in this chunk that were placed // because they overflowed their desired chunk (hostedOverflowCount). uint8_t control_; // The number of values that would have been placed into this chunk if // there had been space, including values that also overflowed previous // full chunks. This value saturates; once it becomes 255 it no longer // increases nor decreases. uint8_t outboundOverflowCount_; std::array, kAllocatedCapacity> rawItems_; static F14Chunk* emptyInstance() { auto raw = reinterpret_cast(&kEmptyTagVector); if (kRequiredVectorAlignment > alignof(max_align_t)) { auto delta = kRequiredVectorAlignment - (reinterpret_cast(raw) % kRequiredVectorAlignment); raw += delta; } auto rv = reinterpret_cast(raw); FOLLY_SAFE_DCHECK( (reinterpret_cast(rv) % kRequiredVectorAlignment) == 0, ""); return rv; } void clear() { // tags_ = {}; control_ = 0; outboundOverflowCount_ = 0; // gcc < 6 doesn't exploit chunk alignment to generate the optimal // SSE clear from memset. This is very hot code, so it is worth // handling that case specially. 
#if FOLLY_SSE >= 2 && __GNUC__ <= 5 && !__clang__ // this doesn't violate strict aliasing rules because __m128i is // tagged as __may_alias__ auto* v = static_cast<__m128i*>(static_cast(&tags_[0])); _mm_store_si128(v, _mm_setzero_si128()); #else std::memset(&tags_[0], '\0', 16); #endif } void copyOverflowInfoFrom(F14Chunk const& rhs) { FOLLY_SAFE_DCHECK(hostedOverflowCount() == 0, ""); control_ += static_cast(rhs.control_ & 0xf0); outboundOverflowCount_ = rhs.outboundOverflowCount_; } unsigned hostedOverflowCount() const { return control_ >> 4; } static constexpr uint8_t kIncrHostedOverflowCount = 0x10; static constexpr uint8_t kDecrHostedOverflowCount = static_cast(-0x10); void adjustHostedOverflowCount(uint8_t op) { control_ += op; } bool eof() const { return capacityScale() != 0; } std::size_t capacityScale() const { if (kCapacityScaleBits == 4) { return control_ & 0xf; } else { uint16_t v; std::memcpy(&v, &tags_[12], 2); return v; } } void setCapacityScale(std::size_t scale) { FOLLY_SAFE_DCHECK( this != emptyInstance() && scale > 0 && scale < (std::size_t{1} << kCapacityScaleBits), ""); if (kCapacityScaleBits == 4) { control_ = static_cast((control_ & ~0xf) | scale); } else { uint16_t v = static_cast(scale); std::memcpy(&tags_[12], &v, 2); } } void markEof(std::size_t scale) { folly::assume(control_ == 0); setCapacityScale(scale); } unsigned outboundOverflowCount() const { return outboundOverflowCount_; } void incrOutboundOverflowCount() { if (outboundOverflowCount_ != 255) { ++outboundOverflowCount_; } } void decrOutboundOverflowCount() { if (outboundOverflowCount_ != 255) { --outboundOverflowCount_; } } std::size_t tag(std::size_t index) const { return tags_[index]; } void setTag(std::size_t index, std::size_t tag) { FOLLY_SAFE_DCHECK( this != emptyInstance() && tag >= 0x80 && tag <= 0xff, ""); FOLLY_SAFE_CHECK(tags_[index] == 0, ""); tags_[index] = static_cast(tag); } void clearTag(std::size_t index) { FOLLY_SAFE_CHECK((tags_[index] & 0x80) != 0, ""); tags_[index] = 0; } #if FOLLY_NEON //////// // Tag filtering using NEON intrinsics SparseMaskIter tagMatchIter(std::size_t needle) const { FOLLY_SAFE_DCHECK(needle >= 0x80 && needle < 0x100, ""); uint8x16_t tagV = vld1q_u8(&tags_[0]); auto needleV = vdupq_n_u8(static_cast(needle)); auto eqV = vceqq_u8(tagV, needleV); // get info from every byte into the bottom half of every uint16_t // by shifting right 4, then round to get it into a 64-bit vector uint8x8_t maskV = vshrn_n_u16(vreinterpretq_u16_u8(eqV), 4); uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(maskV), 0) & kFullMask; return SparseMaskIter(mask); } MaskType occupiedMask() const { uint8x16_t tagV = vld1q_u8(&tags_[0]); // signed shift extends top bit to all bits auto occupiedV = vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u8(tagV), 7)); uint8x8_t maskV = vshrn_n_u16(vreinterpretq_u16_u8(occupiedV), 4); return vget_lane_u64(vreinterpret_u64_u8(maskV), 0) & kFullMask; } #else //////// // Tag filtering using SSE2 intrinsics TagVector const* tagVector() const { return static_cast(static_cast(&tags_[0])); } SparseMaskIter tagMatchIter(std::size_t needle) const { FOLLY_SAFE_DCHECK(needle >= 0x80 && needle < 0x100, ""); auto tagV = _mm_load_si128(tagVector()); // TRICKY! It may seem strange to have a std::size_t needle and narrow // it at the last moment, rather than making HashPair::second be a // uint8_t, but the latter choice sometimes leads to a performance // problem. // // On architectures with SSE2 but not AVX2, _mm_set1_epi8 expands // to multiple instructions. 
One of those is a MOVD of either 4 or // 8 byte width. Only the bottom byte of that move actually affects // the result, but if a 1-byte needle has been spilled then this will // be a 4 byte load. GCC 5.5 has been observed to reload needle // (or perhaps fuse a reload and part of a previous static_cast) // needle using a MOVZX with a 1 byte load in parallel with the MOVD. // This combination causes a failure of store-to-load forwarding, // which has a big performance penalty (60 nanoseconds per find on // a microbenchmark). Keeping needle >= 4 bytes avoids the problem // and also happens to result in slightly more compact assembly. auto needleV = _mm_set1_epi8(static_cast(needle)); auto eqV = _mm_cmpeq_epi8(tagV, needleV); auto mask = _mm_movemask_epi8(eqV) & kFullMask; return SparseMaskIter{mask}; } MaskType occupiedMask() const { auto tagV = _mm_load_si128(tagVector()); return _mm_movemask_epi8(tagV) & kFullMask; } #endif DenseMaskIter occupiedIter() const { return DenseMaskIter{&tags_[0], occupiedMask()}; } MaskRangeIter occupiedRangeIter() const { return MaskRangeIter{occupiedMask()}; } LastOccupiedInMask lastOccupied() const { return LastOccupiedInMask{occupiedMask()}; } FirstEmptyInMask firstEmpty() const { return FirstEmptyInMask{occupiedMask() ^ kFullMask}; } bool occupied(std::size_t index) const { FOLLY_SAFE_DCHECK(tags_[index] == 0 || (tags_[index] & 0x80) != 0, ""); return tags_[index] != 0; } Item* itemAddr(std::size_t i) const { return static_cast( const_cast(static_cast(&rawItems_[i]))); } Item& item(std::size_t i) { FOLLY_SAFE_DCHECK(this->occupied(i), ""); return *launder(itemAddr(i)); } Item const& citem(std::size_t i) const { FOLLY_SAFE_DCHECK(this->occupied(i), ""); return *launder(itemAddr(i)); } static F14Chunk& owner(Item& item, std::size_t index) { auto rawAddr = static_cast(static_cast(std::addressof(item))) - offsetof(F14Chunk, rawItems_) - index * sizeof(Item); auto chunkAddr = static_cast(static_cast(rawAddr)); FOLLY_SAFE_DCHECK(std::addressof(item) == chunkAddr->itemAddr(index), ""); return *chunkAddr; } }; //////////////// // PackedChunkItemPtr points to an Item in an F14Chunk, allowing both the // Item& and its index to be recovered. It sorts by the address of the // item, and it only works for items that are in a properly-aligned chunk. // generic form, not actually packed template class PackedChunkItemPtr { public: PackedChunkItemPtr(Ptr p, std::size_t i) noexcept : ptr_{p}, index_{i} { FOLLY_SAFE_DCHECK(ptr_ != nullptr || index_ == 0, ""); } Ptr ptr() const { return ptr_; } std::size_t index() const { return index_; } bool operator<(PackedChunkItemPtr const& rhs) const { FOLLY_SAFE_DCHECK(ptr_ != rhs.ptr_ || index_ == rhs.index_, ""); return ptr_ < rhs.ptr_; } bool operator==(PackedChunkItemPtr const& rhs) const { FOLLY_SAFE_DCHECK(ptr_ != rhs.ptr_ || index_ == rhs.index_, ""); return ptr_ == rhs.ptr_; } bool operator!=(PackedChunkItemPtr const& rhs) const { return !(*this == rhs); } private: Ptr ptr_; std::size_t index_; }; // Bare pointer form, packed into a uintptr_t. Uses only bits wasted by // alignment, so it works on 32-bit and 64-bit platforms template class PackedChunkItemPtr { static_assert((alignof(F14Chunk) % 16) == 0, ""); // Chunks are 16-byte aligned, so we can maintain a packed pointer to a // chunk item by packing the 4-bit item index into the least significant // bits of a pointer to the chunk itself. This makes ItemIter::pack // more expensive, however, since it has to compute the chunk address. 
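  //
  // As a rough sketch of that chunk-pointer scheme (illustrative only,
  // this is not the code below): a 16-byte-aligned chunk address has four
  // known-zero low bits, so
  //
  //   raw   = reinterpret_cast<uintptr_t>(chunkAddr) | index;  // index < 16
  //   chunk = reinterpret_cast<Chunk*>(raw & ~uintptr_t{0xf});
  //   index = raw & uintptr_t{0xf};
  //
  // The specialization below packs the Item* itself instead, and recovers
  // part of the index arithmetically as derived in the following comment.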
// // Chunk items have varying alignment constraints, so it would seem // to be that we can't do a similar trick while using only bit masking // operations on the Item* itself. It happens to be, however, that if // sizeof(Item) is not a multiple of 16 then we can recover a portion // of the index bits from the knowledge that the Item-s are stored in // an array that is itself 16-byte aligned. // // If kAlignBits is the number of trailing zero bits in sizeof(Item) // (up to 4), then we can borrow those bits to store kAlignBits of the // index directly. We can recover (4 - kAlignBits) bits of the index // from the item pointer itself, by defining/observing that // // A = kAlignBits (A <= 4) // // S = (sizeof(Item) % 16) >> A (shifted-away bits are all zero) // // R = (itemPtr % 16) >> A (shifted-away bits are all zero) // // M = 16 >> A // // itemPtr % 16 = (index * sizeof(Item)) % 16 // // (R * 2^A) % 16 = (index * (sizeof(Item) % 16)) % 16 // // (R * 2^A) % 16 = (index * 2^A * S) % 16 // // R % M = (index * S) % M // // S is relatively prime with M, so a multiplicative inverse is easy // to compute // // Sinv = S^(M - 1) % M // // (R * Sinv) % M = index % M // // This lets us recover the bottom bits of the index. When sizeof(T) // is 8-byte aligned kSizeInverse will always be 1. When sizeof(T) // is 4-byte aligned kSizeInverse will be either 1 or 3. // returns pow(x, y) % m static constexpr uintptr_t powerMod(uintptr_t x, uintptr_t y, uintptr_t m) { return y == 0 ? 1 : (x * powerMod(x, y - 1, m)) % m; } static constexpr uintptr_t kIndexBits = 4; static constexpr uintptr_t kIndexMask = (uintptr_t{1} << kIndexBits) - 1; static constexpr uintptr_t kAlignBits = constexpr_min( uintptr_t{4}, constexpr_find_first_set(uintptr_t{sizeof(T)}) - 1); static constexpr uintptr_t kAlignMask = (uintptr_t{1} << kAlignBits) - 1; static constexpr uintptr_t kModulus = uintptr_t{1} << (kIndexBits - kAlignBits); static constexpr uintptr_t kSizeInverse = powerMod(sizeof(T) >> kAlignBits, kModulus - 1, kModulus); public: PackedChunkItemPtr(T* p, std::size_t i) noexcept { uintptr_t encoded = i >> (kIndexBits - kAlignBits); assume((encoded & ~kAlignMask) == 0); raw_ = reinterpret_cast(p) | encoded; FOLLY_SAFE_DCHECK(p == ptr(), ""); FOLLY_SAFE_DCHECK(i == index(), ""); } T* ptr() const { return reinterpret_cast(raw_ & ~kAlignMask); } std::size_t index() const { auto encoded = (raw_ & kAlignMask) << (kIndexBits - kAlignBits); auto deduced = ((raw_ >> kAlignBits) * kSizeInverse) & (kIndexMask >> kAlignBits); return encoded | deduced; } bool operator<(PackedChunkItemPtr const& rhs) const { return raw_ < rhs.raw_; } bool operator==(PackedChunkItemPtr const& rhs) const { return raw_ == rhs.raw_; } bool operator!=(PackedChunkItemPtr const& rhs) const { return !(*this == rhs); } private: uintptr_t raw_; }; template class F14ItemIter { private: using Chunk = typename std::pointer_traits::element_type; public: using Item = typename Chunk::Item; using ItemPtr = typename std::pointer_traits::template rebind; using ItemConstPtr = typename std::pointer_traits::template rebind; using Packed = PackedChunkItemPtr; //// PUBLIC F14ItemIter() noexcept : itemPtr_{nullptr}, index_{0} {} // default copy and move constructors and assignment operators are correct explicit F14ItemIter(Packed const& packed) : itemPtr_{packed.ptr()}, index_{packed.index()} {} F14ItemIter(ChunkPtr chunk, std::size_t index) : itemPtr_{std::pointer_traits::pointer_to(chunk->item(index))}, index_{index} { FOLLY_SAFE_DCHECK(index < Chunk::kCapacity, ""); assume( 
std::pointer_traits::pointer_to(chunk->item(index)) != nullptr); assume(itemPtr_ != nullptr); } FOLLY_ALWAYS_INLINE void advanceImpl(bool checkEof, bool likelyDead) { auto c = chunk(); // common case is packed entries while (index_ > 0) { --index_; --itemPtr_; if (LIKELY(c->occupied(index_))) { return; } } // It's fairly common for an iterator to be advanced and then become // dead, for example in the return value from erase(iter) or in // the last step of a loop. We'd like to make sure that the entire // advance() method can be eliminated by the compiler's dead code // elimination pass. To do that it must eliminate the loops, which // requires it to prove that they have no side effects. It's easy // to show that there are no escaping stores, but at the moment // compilers also consider an infinite loop to be a side effect. // (There are parts of the standard that would allow them to treat // this as undefined behavior, but at the moment they don't exploit // those clauses.) // // The following loop should really be a while loop, which would // save a register, some instructions, and a conditional branch, // but by writing it as a for loop the compiler can prove to itself // that it will eventually terminate. (No matter that even if the // loop executed in a single cycle it would take about 200 years to // run all 2^64 iterations.) // // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82776 has the bug we // filed about the issue. while (true) { for (std::size_t i = 1; !likelyDead || i != 0; ++i) { if (checkEof) { // exhausted the current chunk if (UNLIKELY(c->eof())) { FOLLY_SAFE_DCHECK(index_ == 0, ""); itemPtr_ = nullptr; return; } } else { FOLLY_SAFE_DCHECK(!c->eof(), ""); } --c; auto last = c->lastOccupied(); if (checkEof && !likelyDead) { prefetchAddr(&*c - 1); } if (LIKELY(last.hasIndex())) { index_ = last.index(); itemPtr_ = std::pointer_traits::pointer_to(c->item(index_)); return; } } } void precheckedAdvance() { advanceImpl(false, false); } FOLLY_ALWAYS_INLINE void advance() { advanceImpl(true, false); } FOLLY_ALWAYS_INLINE void advanceLikelyDead() { advanceImpl(true, true); } ChunkPtr chunk() const { return std::pointer_traits::pointer_to( Chunk::owner(*itemPtr_, index_)); } std::size_t index() const { return index_; } Item* itemAddr() const { return std::addressof(*itemPtr_); } Item& item() const { return *itemPtr_; } Item const& citem() const { return *itemPtr_; } bool atEnd() const { return itemPtr_ == nullptr; } Packed pack() const { return Packed{itemPtr_, static_cast(index_)}; } bool operator==(F14ItemIter const& rhs) const { // this form makes iter == end() into a single null check after inlining // and constant propagation return itemPtr_ == rhs.itemPtr_; } bool operator!=(F14ItemIter const& rhs) const { return !(*this == rhs); } private: ItemPtr itemPtr_; std::size_t index_; }; //////////////// template struct SizeAndPackedBegin { SizeType size_{0}; private: typename ItemIter::Packed packedBegin_{ItemIter{}.pack()}; public: typename ItemIter::Packed& packedBegin() { return packedBegin_; } typename ItemIter::Packed const& packedBegin() const { return packedBegin_; } }; template struct SizeAndPackedBegin { SizeType size_{0}; [[noreturn]] typename ItemIter::Packed& packedBegin() { assume_unreachable(); } [[noreturn]] typename ItemIter::Packed const& packedBegin() const { assume_unreachable(); } }; template class F14Table : public Policy { public: using Item = typename Policy::Item; using value_type = typename Policy::Value; using allocator_type = typename Policy::Alloc; 
private: using Alloc = typename Policy::Alloc; using AllocTraits = typename Policy::AllocTraits; using Hasher = typename Policy::Hasher; using InternalSizeType = typename Policy::InternalSizeType; using KeyEqual = typename Policy::KeyEqual; using Policy::kAllocIsAlwaysEqual; using Policy::kContinuousCapacity; using Policy::kDefaultConstructIsNoexcept; using Policy::kEnableItemIteration; using Policy::kSwapIsNoexcept; using Policy::destroyItemOnClear; using Policy::isAvalanchingHasher; using Policy::prefetchBeforeCopy; using Policy::prefetchBeforeDestroy; using Policy::prefetchBeforeRehash; using ByteAlloc = typename AllocTraits::template rebind_alloc; using BytePtr = typename std::allocator_traits::pointer; using Chunk = F14Chunk; using ChunkPtr = typename std::pointer_traits::template rebind; using HashPair = typename F14HashToken::HashPair; public: using ItemIter = F14ItemIter; private: //////// begin fields ChunkPtr chunks_{Chunk::emptyInstance()}; InternalSizeType chunkMask_{0}; SizeAndPackedBegin sizeAndPackedBegin_; //////// end fields void swapContents(F14Table& rhs) noexcept { using std::swap; swap(chunks_, rhs.chunks_); swap(chunkMask_, rhs.chunkMask_); swap(sizeAndPackedBegin_.size_, rhs.sizeAndPackedBegin_.size_); if (kEnableItemIteration) { swap( sizeAndPackedBegin_.packedBegin(), rhs.sizeAndPackedBegin_.packedBegin()); } } public: F14Table( std::size_t initialCapacity, Hasher const& hasher, KeyEqual const& keyEqual, Alloc const& alloc) : Policy{hasher, keyEqual, alloc} { if (initialCapacity > 0) { reserve(initialCapacity); } } F14Table(F14Table const& rhs) : Policy{rhs} { buildFromF14Table(rhs); } F14Table(F14Table const& rhs, Alloc const& alloc) : Policy{rhs, alloc} { buildFromF14Table(rhs); } F14Table(F14Table&& rhs) noexcept( std::is_nothrow_move_constructible::value&& std::is_nothrow_move_constructible::value&& std::is_nothrow_move_constructible::value) : Policy{std::move(rhs)} { swapContents(rhs); } F14Table(F14Table&& rhs, Alloc const& alloc) noexcept(kAllocIsAlwaysEqual) : Policy{std::move(rhs), alloc} { if (kAllocIsAlwaysEqual || this->alloc() == rhs.alloc()) { // move storage (common case) swapContents(rhs); } else { // new storage because allocators unequal, move values (rare case) buildFromF14Table(std::move(rhs)); } } F14Table& operator=(F14Table const& rhs) { if (this != &rhs) { reset(); static_cast(*this) = rhs; buildFromF14Table(rhs); } return *this; } F14Table& operator=(F14Table&& rhs) noexcept( std::is_nothrow_move_assignable::value&& std::is_nothrow_move_assignable::value && (kAllocIsAlwaysEqual || (AllocTraits::propagate_on_container_move_assignment::value && std::is_nothrow_move_assignable::value))) { if (this != &rhs) { reset(); static_cast(*this) = std::move(rhs); if (AllocTraits::propagate_on_container_move_assignment::value || kAllocIsAlwaysEqual || this->alloc() == rhs.alloc()) { // move storage (common case) swapContents(rhs); } else { // new storage because allocators unequal, move values (rare case) buildFromF14Table(std::move(rhs)); } } return *this; } ~F14Table() { reset(); } void swap(F14Table& rhs) noexcept(kSwapIsNoexcept) { // If propagate_on_container_swap is false and allocators are // not equal, the only way to accomplish a swap would be to do // dynamic allocation and then move (or swap) each contained value. // AllocatorAwareContainer-s are not supposed to attempt this, but // rather are supposed to have undefined behavior in that case. 
FOLLY_SAFE_CHECK( AllocTraits::propagate_on_container_swap::value || kAllocIsAlwaysEqual || this->alloc() == rhs.alloc(), "swap is undefined for unequal non-propagating allocators"); this->swapPolicy(rhs); swapContents(rhs); } private: //////// hash helpers // Hash values are used to compute the desired position, which is the // chunk index at which we would like to place a value (if there is no // overflow), and the tag, which is an additional 7 bits of entropy. // // The standard's definition of hash function quality only refers to // the probability of collisions of the entire hash value, not to the // probability of collisions of the results of shifting or masking the // hash value. Some hash functions, however, provide this stronger // guarantee (not quite the same as the definition of avalanching, // but similar). // // If the user-supplied hasher is an avalanching one (each bit of the // hash value has a 50% chance of being the same for differing hash // inputs), then we can just take 7 bits of the hash value for the tag // and the rest for the desired position. Avalanching hashers also // let us map hash value to array index position with just a bitmask // without risking clumping. (Many hash tables just accept the risk // and do it regardless.) // // std::hash avalanches in all implementations we've // examined: libstdc++-v3 uses MurmurHash2, and libc++ uses CityHash // or MurmurHash2. The other std::hash specializations, however, do not // have this property. std::hash for integral and pointer values is the // identity function on libstdc++-v3 and libc++, in particular. In our // experience it is also fairly common for user-defined specializations // of std::hash to combine fields in an ad-hoc way that does not evenly // distribute entropy among the bits of the result (a + 37 * b, for // example, where a and b are integer fields). // // For hash functions we don't trust to avalanche, we repair things by // applying a bit mixer to the user-supplied hash. #if FOLLY_X64 || FOLLY_AARCH64 // 64-bit static HashPair splitHash(std::size_t hash) { static_assert(sizeof(std::size_t) == sizeof(uint64_t), ""); std::size_t tag; if (!isAvalanchingHasher()) { #if FOLLY_F14_CRC_INTRINSIC_AVAILABLE #if FOLLY_SSE_PREREQ(4, 2) // SSE4.2 CRC std::size_t c = _mm_crc32_u64(0, hash); tag = (c >> 24) | 0x80; hash += c; #else // CRC is optional on armv8 (-march=armv8-a+crc), standard on armv8.1 std::size_t c = __crc32cd(0, hash); tag = (c >> 24) | 0x80; hash += c; #endif #else // The mixer below is not fully avalanching for all 64 bits of // output, but looks quite good for bits 18..63 and puts plenty // of entropy even lower when considering multiple bits together // (like the tag). Importantly, when under register pressure it // uses fewer registers, instructions, and immediate constants // than the alternatives, resulting in compact code that is more // easily inlinable. In one instantiation a modified Murmur mixer // was 48 bytes of assembly (even after using the same multiplicand // for both steps) and this one was 27 bytes, for example. 
auto const kMul = 0xc4ceb9fe1a85ec53ULL; #ifdef _WIN32 __int64 signedHi; __int64 signedLo = _mul128( static_cast<__int64>(hash), static_cast<__int64>(kMul), &signedHi); auto hi = static_cast(signedHi); auto lo = static_cast(signedLo); #else auto hi = static_cast( (static_cast(hash) * kMul) >> 64); auto lo = hash * kMul; #endif hash = hi ^ lo; hash *= kMul; tag = ((hash >> 15) & 0x7f) | 0x80; hash >>= 22; #endif } else { // we don't trust the top bit tag = (hash >> 56) | 0x80; } return std::make_pair(hash, tag); } #else // 32-bit static HashPair splitHash(std::size_t hash) { static_assert(sizeof(std::size_t) == sizeof(uint32_t), ""); uint8_t tag; if (!isAvalanchingHasher()) { #if FOLLY_F14_CRC_INTRINSIC_AVAILABLE #if FOLLY_SSE_PREREQ(4, 2) // SSE4.2 CRC auto c = _mm_crc32_u32(0, hash); tag = static_cast(~(c >> 25)); hash += c; #else auto c = __crc32cw(0, hash); tag = static_cast(~(c >> 25)); hash += c; #endif #else // finalizer for 32-bit murmur2 hash ^= hash >> 13; hash *= 0x5bd1e995; hash ^= hash >> 15; tag = static_cast(~(hash >> 25)); #endif } else { // we don't trust the top bit tag = (hash >> 24) | 0x80; } return std::make_pair(hash, tag); } #endif //////// memory management helpers static std::size_t computeCapacity( std::size_t chunkCount, std::size_t scale) { FOLLY_SAFE_DCHECK(!(chunkCount > 1 && scale == 0), ""); FOLLY_SAFE_DCHECK( scale < (std::size_t{1} << Chunk::kCapacityScaleBits), ""); FOLLY_SAFE_DCHECK((chunkCount & (chunkCount - 1)) == 0, ""); return (((chunkCount - 1) >> Chunk::kCapacityScaleShift) + 1) * scale; } std::pair computeChunkCountAndScale( std::size_t desiredCapacity, bool continuousSingleChunkCapacity, bool continuousMultiChunkCapacity) const { if (desiredCapacity <= Chunk::kCapacity) { // we can go to 100% capacity in a single chunk with no problem if (!continuousSingleChunkCapacity) { if (desiredCapacity <= 2) { desiredCapacity = 2; } else if (desiredCapacity <= 6) { desiredCapacity = 6; } else { desiredCapacity = Chunk::kCapacity; } } auto rv = std::make_pair(std::size_t{1}, desiredCapacity); FOLLY_SAFE_DCHECK( computeCapacity(rv.first, rv.second) == desiredCapacity, ""); return rv; } else { std::size_t minChunks = (desiredCapacity - 1) / Chunk::kDesiredCapacity + 1; std::size_t chunkPow = findLastSet(minChunks - 1); if (chunkPow == 8 * sizeof(std::size_t)) { throw_exception(); } std::size_t chunkCount = std::size_t{1} << chunkPow; // Let cc * scale be the actual capacity. // cc = ((chunkCount - 1) >> kCapacityScaleShift) + 1. // If chunkPow >= kCapacityScaleShift, then cc = chunkCount >> // kCapacityScaleShift = 1 << (chunkPow - kCapacityScaleShift), // otherwise it equals 1 = 1 << 0. Let cc = 1 << ss. std::size_t ss = chunkPow >= Chunk::kCapacityScaleShift ? 
chunkPow - Chunk::kCapacityScaleShift : 0; std::size_t scale; if (continuousMultiChunkCapacity) { // (1 << ss) * scale >= desiredCapacity scale = ((desiredCapacity - 1) >> ss) + 1; } else { // (1 << ss) * scale == chunkCount * kDesiredCapacity scale = Chunk::kDesiredCapacity << (chunkPow - ss); } std::size_t actualCapacity = computeCapacity(chunkCount, scale); FOLLY_SAFE_DCHECK(actualCapacity >= desiredCapacity, ""); if (actualCapacity > max_size()) { throw_exception(); } return std::make_pair(chunkCount, scale); } } static std::size_t chunkAllocSize( std::size_t chunkCount, std::size_t capacityScale) { FOLLY_SAFE_DCHECK(chunkCount > 0, ""); FOLLY_SAFE_DCHECK(!(chunkCount > 1 && capacityScale == 0), ""); if (chunkCount == 1) { static_assert(offsetof(Chunk, rawItems_) == 16, ""); return 16 + sizeof(Item) * computeCapacity(1, capacityScale); } else { return sizeof(Chunk) * chunkCount; } } ChunkPtr initializeChunks( BytePtr raw, std::size_t chunkCount, std::size_t capacityScale) { static_assert(std::is_trivial::value, "F14Chunk should be POD"); auto chunks = static_cast(static_cast(&*raw)); for (std::size_t i = 0; i < chunkCount; ++i) { chunks[i].clear(); } chunks[0].markEof(capacityScale); return std::pointer_traits::pointer_to(*chunks); } std::size_t itemCount() const noexcept { if (chunkMask_ == 0) { return computeCapacity(1, chunks_->capacityScale()); } else { return (chunkMask_ + 1) * Chunk::kCapacity; } } public: ItemIter begin() const noexcept { FOLLY_SAFE_DCHECK(kEnableItemIteration, ""); return ItemIter{sizeAndPackedBegin_.packedBegin()}; } ItemIter end() const noexcept { return ItemIter{}; } bool empty() const noexcept { return size() == 0; } InternalSizeType size() const noexcept { return sizeAndPackedBegin_.size_; } std::size_t max_size() const noexcept { auto& a = this->alloc(); return std::min( (std::numeric_limits::max)(), AllocTraits::max_size(a)); } std::size_t bucket_count() const noexcept { return computeCapacity(chunkMask_ + 1, chunks_->capacityScale()); } std::size_t max_bucket_count() const noexcept { return max_size(); } float load_factor() const noexcept { return empty() ? 0.0f : static_cast(size()) / static_cast(bucket_count()); } float max_load_factor() const noexcept { return 1.0f; } void max_load_factor(float) noexcept { // Probing hash tables can't run load factors >= 1 (unlike chaining // tables). In addition, we have measured that there is little or // no performance advantage to running a smaller load factor (cache // locality losses outweigh the small reduction in probe lengths, // often making it slower). Therefore, we've decided to just fix // max_load_factor at 1.0f regardless of what the user requests. // This has an additional advantage that we don't have to store it. // Taking alignment into consideration this makes every F14 table // 8 bytes smaller, and is part of the reason an empty F14NodeMap // is almost half the size of an empty std::unordered_map (32 vs // 56 bytes). // // I don't have a strong opinion on whether we should remove this // method or leave a stub, let ngbronson or xshi know if you have a // compelling argument either way. } private: // Our probe strategy is to advance through additional chunks with // a stride that is key-specific. This is called double hashing, // and is a well known and high quality probing strategy. So long as // the stride and the chunk count are relatively prime, we will visit // every chunk once and then return to the original chunk, letting us // detect and end the cycle. 
The chunk count is a power of two, so // we can satisfy the relatively prime part by choosing an odd stride. // We've already computed a high quality secondary hash value for the // tag, so we just use it for the second probe hash as well. // // At the maximum load factor of 12/14, expected probe length for a // find hit is 1.041, with 99% of keys found in the first three chunks. // Expected probe length for a find miss (or insert) is 1.275, with a // p99 probe length of 4 (fewer than 1% of failing find look at 5 or // more chunks). // // This code is structured so you can try various ways of encoding // the current probe state. For example, at the moment the probe's // state is the position in the cycle and the resulting chunk index is // computed from that inside probeCurrentIndex. We could also make the // probe state the chunk index, and then increment it by hp.second * // 2 + 1 in probeAdvance. Wrapping can be applied early or late as // well. This particular code seems to be easier for the optimizer // to understand. // // We could also implement probing strategies that resulted in the same // tour for every key initially assigned to a chunk (linear probing or // quadratic), but that results in longer probe lengths. In particular, // the cache locality wins of linear probing are not worth the increase // in probe lengths (extra work and less branch predictability) in // our experiments. std::size_t probeDelta(HashPair hp) const { return 2 * hp.second + 1; } template FOLLY_ALWAYS_INLINE ItemIter findImpl(HashPair hp, K const& key) const { std::size_t index = hp.first; std::size_t step = probeDelta(hp); for (std::size_t tries = 0; tries <= chunkMask_; ++tries) { ChunkPtr chunk = chunks_ + (index & chunkMask_); if (sizeof(Chunk) > 64) { prefetchAddr(chunk->itemAddr(8)); } auto hits = chunk->tagMatchIter(hp.second); while (hits.hasNext()) { auto i = hits.next(); if (LIKELY(this->keyMatchesItem(key, chunk->item(i)))) { // Tag match and key match were both successful. The chance // of a false tag match is 1/128 for each key in the chunk // (with a proper hash function). return ItemIter{chunk, i}; } } if (LIKELY(chunk->outboundOverflowCount() == 0)) { // No keys that wanted to be placed in this chunk were denied // entry, so our search is over. This is the common case. break; } index += step; } // Loop exit because tries is exhausted is rare, but possible. // That means that for every chunk there is currently a key present // in the map that visited that chunk on its probe search but ended // up somewhere else, and we have searched every chunk. return ItemIter{}; } public: // Prehashing splits the work of find(key) into two calls, enabling you // to manually implement loop pipelining for hot bulk lookups. prehash // computes the hash and prefetches the first computed memory location, // and the two-arg find(F14HashToken,K) performs the rest of the search. 
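  //
  // A minimal usage sketch (table, keys, and kBatch are hypothetical names,
  // not part of this API): issue the prefetches for a small batch of keys
  // before finishing any of the lookups, so the memory latencies overlap.
  //
  //   F14HashToken tokens[kBatch];
  //   for (std::size_t i = 0; i < kBatch; ++i) {
  //     tokens[i] = table.prehash(keys[i]);
  //   }
  //   for (std::size_t i = 0; i < kBatch; ++i) {
  //     auto iter = table.find(tokens[i], keys[i]);
  //     // ... use iter ...
  //   }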
template F14HashToken prehash(K const& key) const { FOLLY_SAFE_DCHECK(chunks_ != nullptr, ""); auto hp = splitHash(this->computeKeyHash(key)); ChunkPtr firstChunk = chunks_ + (hp.first & chunkMask_); prefetchAddr(firstChunk); return F14HashToken(std::move(hp)); } template FOLLY_ALWAYS_INLINE ItemIter find(K const& key) const { auto hp = splitHash(this->computeKeyHash(key)); return findImpl(hp, key); } template FOLLY_ALWAYS_INLINE ItemIter find(F14HashToken const& token, K const& key) const { FOLLY_SAFE_DCHECK( splitHash(this->computeKeyHash(key)) == static_cast(token), ""); return findImpl(static_cast(token), key); } // Searches for a key using a key predicate that is a refinement // of key equality. func(k) should return true only if k is equal // to key according to key_eq(), but is allowed to apply additional // constraints. template FOLLY_ALWAYS_INLINE ItemIter findMatching(K const& key, F&& func) const { auto hp = splitHash(this->computeKeyHash(key)); std::size_t index = hp.first; std::size_t step = probeDelta(hp); for (std::size_t tries = 0; tries <= chunkMask_; ++tries) { ChunkPtr chunk = chunks_ + (index & chunkMask_); if (sizeof(Chunk) > 64) { prefetchAddr(chunk->itemAddr(8)); } auto hits = chunk->tagMatchIter(hp.second); while (hits.hasNext()) { auto i = hits.next(); if (LIKELY( func(this->keyForValue(this->valueAtItem(chunk->item(i)))))) { return ItemIter{chunk, i}; } } if (LIKELY(chunk->outboundOverflowCount() == 0)) { break; } index += step; } return ItemIter{}; } private: void adjustSizeAndBeginAfterInsert(ItemIter iter) { if (kEnableItemIteration) { // packedBegin is the max of all valid ItemIter::pack() auto packed = iter.pack(); if (sizeAndPackedBegin_.packedBegin() < packed) { sizeAndPackedBegin_.packedBegin() = packed; } } ++sizeAndPackedBegin_.size_; } // Ignores hp if pos.chunk()->hostedOverflowCount() == 0 void eraseBlank(ItemIter iter, HashPair hp) { iter.chunk()->clearTag(iter.index()); if (iter.chunk()->hostedOverflowCount() != 0) { // clean up std::size_t index = hp.first; std::size_t delta = probeDelta(hp); uint8_t hostedOp = 0; while (true) { ChunkPtr chunk = chunks_ + (index & chunkMask_); if (chunk == iter.chunk()) { chunk->adjustHostedOverflowCount(hostedOp); break; } chunk->decrOutboundOverflowCount(); hostedOp = Chunk::kDecrHostedOverflowCount; index += delta; } } } void adjustSizeAndBeginBeforeErase(ItemIter iter) { --sizeAndPackedBegin_.size_; if (kEnableItemIteration) { if (iter.pack() == sizeAndPackedBegin_.packedBegin()) { if (size() == 0) { iter = ItemIter{}; } else { iter.precheckedAdvance(); } sizeAndPackedBegin_.packedBegin() = iter.pack(); } } } template void insertAtBlank(ItemIter pos, HashPair hp, Args&&... args) { try { auto dst = pos.itemAddr(); this->constructValueAtItem(*this, dst, std::forward(args)...); } catch (...) 
{ eraseBlank(pos, hp); throw; } adjustSizeAndBeginAfterInsert(pos); } ItemIter allocateTag(uint8_t* fullness, HashPair hp) { ChunkPtr chunk; std::size_t index = hp.first; std::size_t delta = probeDelta(hp); uint8_t hostedOp = 0; while (true) { index &= chunkMask_; chunk = chunks_ + index; if (LIKELY(fullness[index] < Chunk::kCapacity)) { break; } chunk->incrOutboundOverflowCount(); hostedOp = Chunk::kIncrHostedOverflowCount; index += delta; } unsigned itemIndex = fullness[index]++; FOLLY_SAFE_DCHECK(!chunk->occupied(itemIndex), ""); chunk->setTag(itemIndex, hp.second); chunk->adjustHostedOverflowCount(hostedOp); return ItemIter{chunk, itemIndex}; } ChunkPtr lastOccupiedChunk() const { FOLLY_SAFE_DCHECK(size() > 0, ""); if (kEnableItemIteration) { return begin().chunk(); } else { return chunks_ + chunkMask_; } } template void directBuildFrom(T&& src) { FOLLY_SAFE_DCHECK(src.size() > 0 && chunkMask_ == src.chunkMask_, ""); // We use std::forward to allow portions of src to be moved out by // either beforeBuild or afterBuild, but we are just relying on good // behavior of our Policy superclass to ensure that any particular // field of this is a donor at most once. auto undoState = this->beforeBuild(src.size(), bucket_count(), std::forward(src)); bool success = false; SCOPE_EXIT { this->afterBuild( undoState, success, src.size(), bucket_count(), std::forward(src)); }; // Copy can fail part-way through if a Value copy constructor throws. // Failing afterBuild is limited in its cleanup power in this case, // because it can't enumerate the items that were actually copied. // Fortunately we can divide the situation into cases where all of // the state is owned by the table itself (F14Node and F14Value), // for which clearImpl() can do partial cleanup, and cases where all // of the values are owned by the policy (F14Vector), in which case // partial failure should not occur. Sorry for the subtle invariants // in the Policy API. if (is_trivially_copyable::value && !this->destroyItemOnClear() && itemCount() == src.itemCount()) { FOLLY_SAFE_DCHECK(chunkMask_ == src.chunkMask_, ""); auto scale = chunks_->capacityScale(); // most happy path auto n = chunkAllocSize(chunkMask_ + 1, scale); std::memcpy(&chunks_[0], &src.chunks_[0], n); sizeAndPackedBegin_.size_ = src.size(); if (kEnableItemIteration) { auto srcBegin = src.begin(); sizeAndPackedBegin_.packedBegin() = ItemIter{chunks_ + (srcBegin.chunk() - src.chunks_), srcBegin.index()} .pack(); } if (kContinuousCapacity) { // capacityScale might not match even if itemCount matches chunks_->setCapacityScale(scale); } } else { // Happy path, no rehash but pack items toward bottom of chunk // and use copy constructor. Don't try to optimize by using // lastOccupiedChunk() because there may be higher unoccupied chunks // with the overflow bit set. 
auto srcChunk = &src.chunks_[chunkMask_]; Chunk* dstChunk = &chunks_[chunkMask_]; do { dstChunk->copyOverflowInfoFrom(*srcChunk); auto iter = srcChunk->occupiedIter(); if (prefetchBeforeCopy()) { for (auto piter = iter; piter.hasNext();) { this->prefetchValue(srcChunk->citem(piter.next())); } } std::size_t dstI = 0; for (; iter.hasNext(); ++dstI) { auto srcI = iter.next(); auto&& srcArg = std::forward(src).buildArgForItem(srcChunk->item(srcI)); auto dst = dstChunk->itemAddr(dstI); this->constructValueAtItem( 0, dst, std::forward(srcArg)); dstChunk->setTag(dstI, srcChunk->tag(srcI)); ++sizeAndPackedBegin_.size_; } --srcChunk; --dstChunk; } while (size() != src.size()); // reset doesn't care about packedBegin, so we don't fix it until the end if (kEnableItemIteration) { std::size_t maxChunkIndex = src.lastOccupiedChunk() - src.chunks_; sizeAndPackedBegin_.packedBegin() = ItemIter{chunks_ + maxChunkIndex, chunks_[maxChunkIndex].lastOccupied().index()} .pack(); } } success = true; } template void rehashBuildFrom(T&& src) { FOLLY_SAFE_DCHECK(src.chunkMask_ > chunkMask_, ""); // 1 byte per chunk means < 1 bit per value temporary overhead std::array stackBuf; uint8_t* fullness; auto cc = chunkMask_ + 1; if (cc <= stackBuf.size()) { fullness = stackBuf.data(); } else { ByteAlloc a{this->alloc()}; fullness = &*std::allocator_traits::allocate(a, cc); } SCOPE_EXIT { if (cc > stackBuf.size()) { ByteAlloc a{this->alloc()}; std::allocator_traits::deallocate( a, std::pointer_traits::pointer>::pointer_to(*fullness), cc); } }; std::memset(fullness, '\0', cc); // We use std::forward to allow portions of src to be moved out by // either beforeBuild or afterBuild, but we are just relying on good // behavior of our Policy superclass to ensure that any particular // field of this is a donor at most once. // Exception safety requires beforeBuild to happen after all of the // allocate() calls. auto undoState = this->beforeBuild(src.size(), bucket_count(), std::forward(src)); bool success = false; SCOPE_EXIT { this->afterBuild( undoState, success, src.size(), bucket_count(), std::forward(src)); }; // The current table is at a valid state at all points for policies // in which non-trivial values are owned by the main table (F14Node // and F14Value), so reset() will clean things up properly if we // fail partway through. For the case that the policy manages value // lifecycle (F14Vector) then nothing after beforeBuild can throw and // we don't have to worry about partial failure. 
std::size_t srcChunkIndex = src.lastOccupiedChunk() - src.chunks_; while (true) { auto srcChunk = &src.chunks_[srcChunkIndex]; auto iter = srcChunk->occupiedIter(); if (prefetchBeforeRehash()) { for (auto piter = iter; piter.hasNext();) { this->prefetchValue(srcChunk->item(piter.next())); } } if (srcChunk->hostedOverflowCount() == 0) { // all items are in their preferred chunk (no probing), so we // don't need to compute any hash values while (iter.hasNext()) { auto i = iter.next(); auto& srcItem = srcChunk->item(i); auto&& srcArg = std::forward(src).buildArgForItem(srcItem); HashPair hp{srcChunkIndex, srcChunk->tag(i)}; insertAtBlank( allocateTag(fullness, hp), hp, std::forward(srcArg)); } } else { // any chunk's items might be in here while (iter.hasNext()) { auto i = iter.next(); auto& srcItem = srcChunk->item(i); auto&& srcArg = std::forward(src).buildArgForItem(srcItem); auto const& srcKey = src.keyForValue(srcArg); auto hp = splitHash(this->computeKeyHash(srcKey)); FOLLY_SAFE_CHECK(hp.second == srcChunk->tag(i), ""); insertAtBlank( allocateTag(fullness, hp), hp, std::forward(srcArg)); } } if (srcChunkIndex == 0) { break; } --srcChunkIndex; } success = true; } template FOLLY_NOINLINE void buildFromF14Table(T&& src) { FOLLY_SAFE_DCHECK(bucket_count() == 0, ""); if (src.size() == 0) { return; } // Use the source's capacity, unless it is oversized. auto upperLimit = computeChunkCountAndScale(src.size(), false, false); auto ccas = std::make_pair( std::size_t{src.chunkMask_} + 1, src.chunks_->capacityScale()); FOLLY_SAFE_DCHECK( ccas.first >= upperLimit.first, "rounded chunk count can't be bigger than actual"); if (ccas.first > upperLimit.first || ccas.second > upperLimit.second) { ccas = upperLimit; } rehashImpl(0, 1, 0, ccas.first, ccas.second); try { if (chunkMask_ == src.chunkMask_) { directBuildFrom(std::forward(src)); } else { rehashBuildFrom(std::forward(src)); } } catch (...) { reset(); F14LinkCheck::check(); throw; } } void reserveImpl(std::size_t desiredCapacity) { desiredCapacity = std::max(desiredCapacity, size()); if (desiredCapacity == 0) { reset(); return; } auto origChunkCount = chunkMask_ + 1; auto origCapacityScale = chunks_->capacityScale(); auto origCapacity = computeCapacity(origChunkCount, origCapacityScale); // This came from an explicit reserve() or rehash() call, so there's // a good chance the capacity is exactly right. To avoid O(n^2) // behavior, we don't do rehashes that decrease the size by less // than 1/8, and if we have a requested increase of less than 1/8 we // instead go to the next power of two. 
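    // For example (illustrative numbers only): if origCapacity is 96 then
    // origCapacity / 8 is 12, so any desiredCapacity in [84, 96] is a
    // no-op, while a desiredCapacity in (96, 108) skips the exact-fit
    // attempt below and is rounded up to a standard power-of-two chunk
    // count instead.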
if (desiredCapacity <= origCapacity && desiredCapacity >= origCapacity - origCapacity / 8) { return; } bool attemptExact = !(desiredCapacity > origCapacity && desiredCapacity < origCapacity + origCapacity / 8); std::size_t newChunkCount; std::size_t newCapacityScale; std::tie(newChunkCount, newCapacityScale) = computeChunkCountAndScale( desiredCapacity, attemptExact, kContinuousCapacity && attemptExact); auto newCapacity = computeCapacity(newChunkCount, newCapacityScale); if (origCapacity != newCapacity) { rehashImpl( size(), origChunkCount, origCapacityScale, newChunkCount, newCapacityScale); } } FOLLY_NOINLINE void reserveForInsertImpl( std::size_t capacityMinusOne, std::size_t origChunkCount, std::size_t origCapacityScale, std::size_t origCapacity) { FOLLY_SAFE_DCHECK(capacityMinusOne >= size(), ""); std::size_t capacity = capacityMinusOne + 1; // we want to grow by between 2^0.5 and 2^1.5 ending at a "good" // size, so we grow by 2^0.5 and then round up // 1.01101_2 = 1.40625 std::size_t minGrowth = origCapacity + (origCapacity >> 2) + (origCapacity >> 3) + (origCapacity >> 5); capacity = std::max(capacity, minGrowth); std::size_t newChunkCount; std::size_t newCapacityScale; std::tie(newChunkCount, newCapacityScale) = computeChunkCountAndScale(capacity, false, false); FOLLY_SAFE_DCHECK( computeCapacity(newChunkCount, newCapacityScale) > origCapacity, ""); rehashImpl( size(), origChunkCount, origCapacityScale, newChunkCount, newCapacityScale); } void rehashImpl( std::size_t origSize, std::size_t origChunkCount, std::size_t origCapacityScale, std::size_t newChunkCount, std::size_t newCapacityScale) { auto origChunks = chunks_; auto origCapacity = computeCapacity(origChunkCount, origCapacityScale); auto origAllocSize = chunkAllocSize(origChunkCount, origCapacityScale); auto newCapacity = computeCapacity(newChunkCount, newCapacityScale); auto newAllocSize = chunkAllocSize(newChunkCount, newCapacityScale); BytePtr rawAllocation; auto undoState = this->beforeRehash( origSize, origCapacity, newCapacity, newAllocSize, rawAllocation); chunks_ = initializeChunks(rawAllocation, newChunkCount, newCapacityScale); FOLLY_SAFE_DCHECK( newChunkCount < std::numeric_limits::max(), ""); chunkMask_ = static_cast(newChunkCount - 1); bool success = false; SCOPE_EXIT { // this SCOPE_EXIT reverts chunks_ and chunkMask_ if necessary BytePtr finishedRawAllocation = nullptr; std::size_t finishedAllocSize = 0; if (LIKELY(success)) { if (origCapacity > 0) { finishedRawAllocation = std::pointer_traits::pointer_to( *static_cast(static_cast(&*origChunks))); finishedAllocSize = origAllocSize; } } else { finishedRawAllocation = rawAllocation; finishedAllocSize = newAllocSize; chunks_ = origChunks; FOLLY_SAFE_DCHECK( origChunkCount < std::numeric_limits::max(), ""); chunkMask_ = static_cast(origChunkCount - 1); F14LinkCheck::check(); } this->afterRehash( std::move(undoState), success, origSize, origCapacity, newCapacity, finishedRawAllocation, finishedAllocSize); }; if (origSize == 0) { // nothing to do } else if (origChunkCount == 1 && newChunkCount == 1) { // no mask, no chunk scan, no hash computation, no probing auto srcChunk = origChunks; auto dstChunk = chunks_; std::size_t srcI = 0; std::size_t dstI = 0; while (dstI < origSize) { if (LIKELY(srcChunk->occupied(srcI))) { dstChunk->setTag(dstI, srcChunk->tag(srcI)); this->moveItemDuringRehash( dstChunk->itemAddr(dstI), srcChunk->item(srcI)); ++dstI; } ++srcI; } if (kEnableItemIteration) { sizeAndPackedBegin_.packedBegin() = ItemIter{dstChunk, dstI - 1}.pack(); } } 
else { // 1 byte per chunk means < 1 bit per value temporary overhead std::array stackBuf; uint8_t* fullness; if (newChunkCount <= stackBuf.size()) { fullness = stackBuf.data(); } else { ByteAlloc a{this->alloc()}; // may throw fullness = &*std::allocator_traits::allocate(a, newChunkCount); } std::memset(fullness, '\0', newChunkCount); SCOPE_EXIT { if (newChunkCount > stackBuf.size()) { ByteAlloc a{this->alloc()}; std::allocator_traits::deallocate( a, std::pointer_traits::pointer>::pointer_to(*fullness), newChunkCount); } }; auto srcChunk = origChunks + origChunkCount - 1; std::size_t remaining = origSize; while (remaining > 0) { auto iter = srcChunk->occupiedIter(); if (prefetchBeforeRehash()) { for (auto piter = iter; piter.hasNext();) { this->prefetchValue(srcChunk->item(piter.next())); } } while (iter.hasNext()) { --remaining; auto srcI = iter.next(); Item& srcItem = srcChunk->item(srcI); auto hp = splitHash( this->computeItemHash(const_cast(srcItem))); FOLLY_SAFE_CHECK(hp.second == srcChunk->tag(srcI), ""); auto dstIter = allocateTag(fullness, hp); this->moveItemDuringRehash(dstIter.itemAddr(), srcItem); } --srcChunk; } if (kEnableItemIteration) { // this code replaces size invocations of adjustSizeAndBeginAfterInsert std::size_t i = chunkMask_; while (fullness[i] == 0) { --i; } sizeAndPackedBegin_.packedBegin() = ItemIter{chunks_ + i, std::size_t{fullness[i]} - 1}.pack(); } } success = true; } // Randomization to help expose bugs when running tests in debug or // sanitizer builds FOLLY_ALWAYS_INLINE void debugModeOnReserve(std::size_t capacity) { if (kIsLibrarySanitizeAddress || kIsDebug) { if (capacity > size()) { tlsPendingSafeInserts(static_cast(capacity - size())); } } } void debugModeSpuriousRehash() { auto cc = chunkMask_ + 1; auto ss = chunks_->capacityScale(); rehashImpl(size(), cc, ss, cc, ss); } FOLLY_ALWAYS_INLINE void debugModeBeforeInsert() { // When running under ASAN, we add a spurious rehash with 1/size() // probability before every insert. This means that finding reference // stability problems for F14Value and F14Vector is much more likely. // The most common pattern that causes this is // // auto& ref = map[k1]; map[k2] = foo(ref); // // One way to fix this is to call map.reserve(N) before such a // sequence, where N is the number of keys that might be inserted // within the section that retains references plus the existing size. if (kIsLibrarySanitizeAddress && !tlsPendingSafeInserts() && size() > 0 && tlsMinstdRand(size()) == 0) { debugModeSpuriousRehash(); } } FOLLY_ALWAYS_INLINE void debugModeAfterInsert() { if (kIsLibrarySanitizeAddress || kIsDebug) { tlsPendingSafeInserts(-1); } } FOLLY_ALWAYS_INLINE void debugModePerturbSlotInsertOrder( ChunkPtr chunk, std::size_t& itemIndex) { FOLLY_SAFE_DCHECK(!chunk->occupied(itemIndex), ""); constexpr bool perturbSlot = FOLLY_F14_PERTURB_INSERTION_ORDER; if (perturbSlot && !tlsPendingSafeInserts()) { std::size_t e = chunkMask_ == 0 ? 
bucket_count() : Chunk::kCapacity; std::size_t i = itemIndex + tlsMinstdRand(e - itemIndex); if (!chunk->occupied(i)) { itemIndex = i; } } } public: // user has no control over max_load_factor void rehash(std::size_t capacity) { reserve(capacity); } void reserve(std::size_t capacity) { // We want to support the pattern // map.reserve(map.size() + 2); auto& r1 = map[k1]; auto& r2 = map[k2]; debugModeOnReserve(capacity); reserveImpl(capacity); } void reserveForInsert(size_t incoming = 1) { FOLLY_SAFE_DCHECK(incoming > 0, ""); auto needed = size() + incoming; auto chunkCount = chunkMask_ + 1; auto scale = chunks_->capacityScale(); auto existing = computeCapacity(chunkCount, scale); if (needed - 1 >= existing) { reserveForInsertImpl(needed - 1, chunkCount, scale, existing); } } // Returns pos,true if construct, pos,false if found. key is only used // during the search; all constructor args for an inserted value come // from args... key won't be accessed after args are touched. template std::pair tryEmplaceValue(K const& key, Args&&... args) { const auto hp = splitHash(this->computeKeyHash(key)); if (size() > 0) { auto existing = findImpl(hp, key); if (!existing.atEnd()) { return std::make_pair(existing, false); } } debugModeBeforeInsert(); reserveForInsert(); std::size_t index = hp.first; ChunkPtr chunk = chunks_ + (index & chunkMask_); auto firstEmpty = chunk->firstEmpty(); if (!firstEmpty.hasIndex()) { std::size_t delta = probeDelta(hp); do { chunk->incrOutboundOverflowCount(); index += delta; chunk = chunks_ + (index & chunkMask_); firstEmpty = chunk->firstEmpty(); } while (!firstEmpty.hasIndex()); chunk->adjustHostedOverflowCount(Chunk::kIncrHostedOverflowCount); } std::size_t itemIndex = firstEmpty.index(); debugModePerturbSlotInsertOrder(chunk, itemIndex); chunk->setTag(itemIndex, hp.second); ItemIter iter{chunk, itemIndex}; // insertAtBlank will clear the tag if the constructor throws insertAtBlank(iter, hp, std::forward(args)...); debugModeAfterInsert(); return std::make_pair(iter, true); } private: template void clearImpl() noexcept { if (chunks_ == Chunk::emptyInstance()) { FOLLY_SAFE_DCHECK(empty() && bucket_count() == 0, ""); return; } // turn clear into reset if the table is >= 16 chunks so that // we don't get too low a load factor bool willReset = Reset || chunkMask_ + 1 >= 16; auto origSize = size(); auto origCapacity = bucket_count(); if (willReset) { this->beforeReset(origSize, origCapacity); } else { this->beforeClear(origSize, origCapacity); } if (!empty()) { if (destroyItemOnClear()) { for (std::size_t ci = 0; ci <= chunkMask_; ++ci) { ChunkPtr chunk = chunks_ + ci; auto iter = chunk->occupiedIter(); if (prefetchBeforeDestroy()) { for (auto piter = iter; piter.hasNext();) { this->prefetchValue(chunk->item(piter.next())); } } while (iter.hasNext()) { this->destroyItem(chunk->item(iter.next())); } } } if (!willReset) { // It's okay to do this in a separate loop because we only do it // when the chunk count is small. That avoids a branch when we // are promoting a clear to a reset for a large table. 
        auto scale = chunks_[0].capacityScale();
        for (std::size_t ci = 0; ci <= chunkMask_; ++ci) {
          chunks_[ci].clear();
        }
        chunks_[0].markEof(scale);
      }
      if (kEnableItemIteration) {
        sizeAndPackedBegin_.packedBegin() = ItemIter{}.pack();
      }
      sizeAndPackedBegin_.size_ = 0;
    }

    if (willReset) {
      BytePtr rawAllocation = std::pointer_traits<BytePtr>::pointer_to(
          *static_cast<uint8_t*>(static_cast<void*>(&*chunks_)));
      std::size_t rawSize =
          chunkAllocSize(chunkMask_ + 1, chunks_->capacityScale());

      chunks_ = Chunk::emptyInstance();
      chunkMask_ = 0;

      this->afterReset(origSize, origCapacity, rawAllocation, rawSize);
    } else {
      this->afterClear(origSize, origCapacity);
    }
  }

  void eraseImpl(ItemIter pos, HashPair hp) {
    this->destroyItem(pos.item());
    adjustSizeAndBeginBeforeErase(pos);
    eraseBlank(pos, hp);
  }

 public:
  // The item needs to still be hashable during this call.  If you want
  // to intercept the value before it is destroyed (to extract it, for
  // example), do so in the beforeDestroy callback.
  template <typename BeforeDestroy>
  void eraseIterInto(ItemIter pos, BeforeDestroy&& beforeDestroy) {
    HashPair hp{};
    if (pos.chunk()->hostedOverflowCount() != 0) {
      hp = splitHash(this->computeItemHash(pos.citem()));
    }
    beforeDestroy(this->valueAtItemForExtract(pos.item()));
    eraseImpl(pos, hp);
  }

  template <typename K, typename BeforeDestroy>
  std::size_t eraseKeyInto(K const& key, BeforeDestroy&& beforeDestroy) {
    if (UNLIKELY(size() == 0)) {
      return 0;
    }
    auto hp = splitHash(this->computeKeyHash(key));
    auto iter = findImpl(hp, key);
    if (!iter.atEnd()) {
      beforeDestroy(this->valueAtItemForExtract(iter.item()));
      eraseImpl(iter, hp);
      return 1;
    } else {
      return 0;
    }
  }

  void clear() noexcept {
    if (kIsLibrarySanitizeAddress) {
      // force recycling of heap memory
      auto bc = bucket_count();
      reset();
      try {
        reserveImpl(bc);
      } catch (std::bad_alloc const&) {
        // ASAN mode only, keep going
      }
    } else {
      clearImpl<false>();
    }
  }

  // Like clear(), but always frees all dynamic storage allocated
  // by the table.
  void reset() noexcept { clearImpl<true>(); }

  // Get memory footprint, not including sizeof(*this).
  std::size_t getAllocatedMemorySize() const {
    std::size_t sum = 0;
    visitAllocationClasses(
        [&sum](std::size_t bytes, std::size_t n) { sum += bytes * n; });
    return sum;
  }

  // Enumerates classes of allocated memory blocks currently owned
  // by this table, calling visitor(allocationSize, allocationCount).
  // This can be used to get a more accurate indication of memory footprint
  // than getAllocatedMemorySize() if you have some way of computing the
  // internal fragmentation of the allocator, such as JEMalloc's nallocx.
  // The visitor might be called twice with the same allocationSize.  The
  // visitor's computation should produce the same result for visitor(8,
  // 2) as for two calls to visitor(8, 1), for example.  The visitor may
  // be called with a zero allocationCount.
  template <typename V>
  void visitAllocationClasses(V&& visitor) const {
    auto scale = chunks_->capacityScale();
    this->visitPolicyAllocationClasses(
        scale == 0 ?
            0 : chunkAllocSize(chunkMask_ + 1, scale),
        size(),
        bucket_count(),
        visitor);
  }

  // visitor should take an Item const&
  template <typename V>
  void visitItems(V&& visitor) const {
    if (empty()) {
      return;
    }
    std::size_t maxChunkIndex = lastOccupiedChunk() - chunks_;
    auto chunk = &chunks_[0];
    for (std::size_t i = 0; i <= maxChunkIndex; ++i, ++chunk) {
      auto iter = chunk->occupiedIter();
      if (prefetchBeforeCopy()) {
        for (auto piter = iter; piter.hasNext();) {
          this->prefetchValue(chunk->citem(piter.next()));
        }
      }
      while (iter.hasNext()) {
        visitor(chunk->citem(iter.next()));
      }
    }
  }

  // visitor should take two Item const*
  template <typename V>
  void visitContiguousItemRanges(V&& visitor) const {
    if (empty()) {
      return;
    }
    std::size_t maxChunkIndex = lastOccupiedChunk() - chunks_;
    auto chunk = &chunks_[0];
    for (std::size_t i = 0; i <= maxChunkIndex; ++i, ++chunk) {
      for (auto iter = chunk->occupiedRangeIter(); iter.hasNext();) {
        auto be = iter.next();
        FOLLY_SAFE_DCHECK(
            chunk->occupied(be.first) && chunk->occupied(be.second - 1), "");
        Item const* b = chunk->itemAddr(be.first);
        visitor(b, b + (be.second - be.first));
      }
    }
  }

 private:
  static std::size_t& histoAt(
      std::vector<std::size_t>& histo, std::size_t index) {
    if (histo.size() <= index) {
      histo.resize(index + 1);
    }
    return histo.at(index);
  }

 public:
  // Expensive
  F14TableStats computeStats() const {
    F14TableStats stats;

    if (kIsDebug && kEnableItemIteration) {
      // validate iteration
      std::size_t n = 0;
      ItemIter prev;
      for (auto iter = begin(); iter != end(); iter.advance()) {
        FOLLY_SAFE_DCHECK(n == 0 || iter.pack() < prev.pack(), "");
        ++n;
        prev = iter;
      }
      FOLLY_SAFE_DCHECK(n == size(), "");
    }

    FOLLY_SAFE_DCHECK(
        (chunks_ == Chunk::emptyInstance()) == (bucket_count() == 0), "");

    std::size_t n1 = 0;
    std::size_t n2 = 0;
    auto cc = bucket_count() == 0 ? 0 : chunkMask_ + 1;
    for (std::size_t ci = 0; ci < cc; ++ci) {
      ChunkPtr chunk = chunks_ + ci;
      FOLLY_SAFE_DCHECK(chunk->eof() == (ci == 0), "");

      auto iter = chunk->occupiedIter();

      std::size_t chunkOccupied = 0;
      for (auto piter = iter; piter.hasNext(); piter.next()) {
        ++chunkOccupied;
      }
      n1 += chunkOccupied;

      histoAt(stats.chunkOccupancyHisto, chunkOccupied)++;
      histoAt(
          stats.chunkOutboundOverflowHisto, chunk->outboundOverflowCount())++;
      histoAt(stats.chunkHostedOverflowHisto, chunk->hostedOverflowCount())++;

      while (iter.hasNext()) {
        auto ii = iter.next();
        ++n2;

        {
          auto& item = chunk->citem(ii);
          auto hp = splitHash(this->computeItemHash(item));
          FOLLY_SAFE_DCHECK(chunk->tag(ii) == hp.second, "");

          std::size_t dist = 1;
          std::size_t index = hp.first;
          std::size_t delta = probeDelta(hp);
          while ((index & chunkMask_) != ci) {
            index += delta;
            ++dist;
          }

          histoAt(stats.keyProbeLengthHisto, dist)++;
        }

        // misses could have any tag, so we do the dumb but accurate
        // thing and just try them all
        for (std::size_t ti = 0; ti < 256; ++ti) {
          uint8_t tag = static_cast<uint8_t>(ti == 0 ?
              1 : 0);
          HashPair hp{ci, tag};
          std::size_t dist = 1;
          std::size_t index = hp.first;
          std::size_t delta = probeDelta(hp);
          for (std::size_t tries = 0; tries <= chunkMask_ &&
               chunks_[index & chunkMask_].outboundOverflowCount() != 0;
               ++tries) {
            index += delta;
            ++dist;
          }

          histoAt(stats.missProbeLengthHisto, dist)++;
        }
      }
    }

    FOLLY_SAFE_DCHECK(n1 == size(), "");
    FOLLY_SAFE_DCHECK(n2 == size(), "");

    stats.policy = pretty_name<Policy>();
    stats.size = size();
    stats.valueSize = sizeof(value_type);
    stats.bucketCount = bucket_count();
    stats.chunkCount = cc;

    stats.totalBytes = sizeof(*this) + getAllocatedMemorySize();
    stats.overheadBytes = stats.totalBytes - size() * sizeof(value_type);

    return stats;
  }
};

} // namespace detail
} // namespace f14

#endif // FOLLY_F14_VECTOR_INTRINSICS_AVAILABLE

namespace f14 {
namespace test {
inline void disableInsertOrderRandomization() {
  if (kIsLibrarySanitizeAddress || kIsDebug) {
    detail::tlsPendingSafeInserts(static_cast<std::ptrdiff_t>(
        (std::numeric_limits<std::ptrdiff_t>::max)() / 2));
  }
}
} // namespace test
} // namespace f14

} // namespace folly
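
// Illustrative usage sketch (kept as a comment; not part of the interface
// declared in this header).  It assumes the public F14FastMap wrapper from
// folly/container/F14Map.h, which is built on the F14Table machinery above,
// and it shows the reserve() pattern described in debugModeBeforeInsert():
// reserve for every key that may be inserted while a reference into the map
// is still held, so no rehash can invalidate that reference.  It also shows
// getAllocatedMemorySize(), which forwards to visitAllocationClasses().
//
//   #include <folly/container/F14Map.h>
//
//   void example() {
//     folly::F14FastMap<int, int> m;
//
//     // Reserve for both upcoming keys before taking a reference, so the
//     // second insert cannot trigger a rehash that moves r1's value.
//     m.reserve(m.size() + 2);
//     auto& r1 = m[1];
//     m[2] = r1 + 1;
//
//     // Dynamic memory currently owned by the table, excluding sizeof(m).
//     std::size_t bytes = m.getAllocatedMemorySize();
//     (void)bytes;
//   }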