/*
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <algorithm>
#include <array>
#include <atomic>
#include <cassert>
#include <functional>
#include <limits>
#include <mutex>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <vector>

#include <folly/Indestructible.h>
#include <folly/Likely.h>
#include <folly/Memory.h>
#include <folly/Portability.h>
#include <folly/hash/Hash.h>
#include <folly/lang/Align.h>
#include <folly/lang/Exception.h>
#include <folly/system/ThreadId.h>

#if !FOLLY_MOBILE && defined(FOLLY_TLS)
#define FOLLY_CL_USE_FOLLY_TLS 1
#else
#undef FOLLY_CL_USE_FOLLY_TLS
#endif

namespace folly {

// This file contains several classes that might be useful if you are
// trying to dynamically optimize cache locality: CacheLocality reads
// cache sharing information from sysfs to determine how CPUs should be
// grouped to minimize contention, Getcpu provides fast access to the
// current CPU via __vdso_getcpu, and AccessSpreader uses these two to
// optimally spread accesses among a predetermined number of stripes.
//
// AccessSpreader<>::current(n) microbenchmarks at 22 nanos, which is
// substantially less than the cost of a cache miss. This means that we
// can effectively use it to reduce cache line ping-pong on striped data
// structures such as IndexedMemPool or statistics counters.
//
// Because CacheLocality looks at all of the cache levels, it can be
// used for different levels of optimization. AccessSpreader(2) does
// per-chip spreading on a dual socket system. AccessSpreader(numCpus)
// does perfect per-cpu spreading. AccessSpreader(numCpus / 2) does
// perfect L1 spreading in a system with hyperthreading enabled.
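// A minimal sketch of the striping idea above, kept in a comment so it is
// illustrative only. StripedAdder, Slot, kStripes, add, and read are
// hypothetical names; AccessSpreader<>::current and
// hardware_destructive_interference_size are the folly facilities used.
//
//   struct StripedAdder {
//     static constexpr size_t kStripes = 64;
//     struct alignas(hardware_destructive_interference_size) Slot {
//       std::atomic<size_t> value{0};
//     };
//     std::array<Slot, kStripes> slots_{};
//
//     // Threads on nearby CPUs tend to pick the same stripe, and threads
//     // on distant CPUs pick different stripes, so the fetch_add rarely
//     // ping-pongs a cache line between caches.
//     void add(size_t n) {
//       auto& slot = slots_[AccessSpreader<>::current(kStripes)];
//       slot.value.fetch_add(n, std::memory_order_relaxed);
//     }
//
//     size_t read() const {
//       size_t sum = 0;
//       for (auto& s : slots_) {
//         sum += s.value.load(std::memory_order_relaxed);
//       }
//       return sum;
//     }
//   };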
struct CacheLocality {
  /// 1 more than the maximum value that can be returned from sched_getcpu
  /// or getcpu. This is the number of hardware thread contexts provided
  /// by the processors.
  size_t numCpus;

  /// Holds the number of caches present at each cache level (0 is
  /// the closest to the cpu). This is the number of AccessSpreader
  /// stripes needed to avoid cross-cache communication at the specified
  /// layer. numCachesByLevel.front() is the number of L1 caches and
  /// numCachesByLevel.back() is the number of last-level caches.
  std::vector<size_t> numCachesByLevel;

  /// A map from cpu (from sched_getcpu or getcpu) to an index in the
  /// range 0..numCpus-1, where neighboring locality indices are more
  /// likely to share caches than indices far away. All of the members
  /// of a particular cache level will be contiguous in their locality
  /// index. For example, if numCpus is 32 and numCachesByLevel.back()
  /// is 2, then cpus with a locality index < 16 will share one last-level
  /// cache and cpus with a locality index >= 16 will share the other.
  std::vector<size_t> localityIndexByCpu;

  /// Returns the best CacheLocality information available for the current
  /// system, cached for fast access. This will be loaded from sysfs if
  /// possible, otherwise it will be correct in the number of CPUs but
  /// not in their sharing structure.
  ///
  /// If you are into yo dawgs, this is a shared cache of the local
  /// locality of the shared caches.
  ///
  /// The template parameter here is used to allow injection of a
  /// repeatable CacheLocality structure during testing. Rather than
  /// inject the type of the CacheLocality provider into every data type
  /// that transitively uses it, all components select between the default
  /// sysfs implementation and a deterministic implementation by keying
  /// off the type of the underlying atomic. See DeterministicScheduler
  /// and the usage sketch below.
  template <template <typename> class Atom = std::atomic>
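  // A brief usage sketch for the injection mechanism described above,
  // assuming test::DeterministicAtomic from folly's DeterministicSchedule
  // test infrastructure; `cpu` is a hypothetical value from Getcpu:
  //
  //   const CacheLocality& loc = CacheLocality::system<>(); // sysfs-backed
  //   size_t numLlc = loc.numCachesByLevel.back();
  //   size_t idx = loc.localityIndexByCpu[cpu];
  //
  //   // A deterministic test selects a repeatable topology instead by
  //   // keying off its atomic type:
  //   //   CacheLocality::system<test::DeterministicAtomic>();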