/*
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <folly/concurrency/CacheLocality.h>

#ifndef _MSC_VER
#define _GNU_SOURCE 1 // for RTLD_NOLOAD
#include <dlfcn.h>
#endif

#include <fstream>

#include <folly/Conv.h>
#include <folly/Format.h>
#include <folly/Memory.h>
#include <folly/lang/Exception.h>
#include <folly/portability/Unistd.h>

namespace folly {

///////////// CacheLocality

/// Returns the CacheLocality information best for this machine
static CacheLocality getSystemLocalityInfo() {
  if (kIsLinux) {
    try {
      return CacheLocality::readFromProcCpuinfo();
    } catch (...) {
      // keep trying
    }
  }

  long numCpus = sysconf(_SC_NPROCESSORS_CONF);
  if (numCpus <= 0) {
    // This shouldn't happen, but if it does we should try to keep
    // going.  We are probably not going to be able to parse /sys on
    // this box either (although we will try), which means we are going
    // to fall back to the SequentialThreadId splitter.  On my 16 core
    // (x hyperthreading) dev box 16 stripes is enough to get pretty good
    // contention avoidance with SequentialThreadId, and there is little
    // improvement from going from 32 to 64.  This default gives us some
    // wiggle room
    numCpus = 32;
  }
  return CacheLocality::uniform(size_t(numCpus));
}

template <>
const CacheLocality& CacheLocality::system<std::atomic>() {
  static auto* cache = new CacheLocality(getSystemLocalityInfo());
  return *cache;
}

// Each level of cache has sharing sets, which are the set of cpus
// that share a common cache at that level.  These are available in a
// hex bitset form (/sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_map,
// for example).  They are also available in a human-readable list form,
// as in /sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_list.  The
// list is a comma-separated list of numbers and ranges, where the ranges
// are a pair of decimal numbers separated by a '-'.
//
// To sort the cpus for optimum locality we don't really need to parse
// the sharing sets, we just need a unique representative from the
// equivalence class.  The smallest value works fine, and happens to be
// the first decimal number in the file.  We load all of the equivalence
// class information from all of the cpu*/index* directories, order the
// cpus first by increasing last-level cache equivalence class, then by
// the smaller caches.  Finally, we break ties with the cpu number itself.

/// Returns the first decimal number in the string, or throws an exception
/// if the string does not start with a number terminated by ',', '-',
/// '\n', or eos.
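/// For example (illustrative inputs): parseLeadingNumber("0-7,16-23") and
/// parseLeadingNumber("0") both return 0, while
/// parseLeadingNumber("socket 0") throws std::runtime_error.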
static size_t parseLeadingNumber(const std::string& line) {
  auto raw = line.c_str();
  char* end;
  unsigned long val = strtoul(raw, &end, 10);
  if (end == raw ||
      (*end != ',' && *end != '-' && *end != '\n' && *end != 0)) {
    throw std::runtime_error(
        to<std::string>("error parsing list '", line, "'").c_str());
  }
  return val;
}

CacheLocality CacheLocality::readFromSysfsTree(
    const std::function<std::string(std::string)>& mapping) {
  // number of equivalence classes per level
  std::vector<size_t> numCachesByLevel;
  // the list of cache equivalence classes, where equivalence classes
  // are named by the smallest cpu in the class
  std::vector<std::vector<size_t>> equivClassesByCpu;

  std::vector<size_t> cpus;

  while (true) {
    auto cpu = cpus.size();
    std::vector<size_t> levels;
    for (size_t index = 0;; ++index) {
      auto dir =
          sformat("/sys/devices/system/cpu/cpu{}/cache/index{}/", cpu, index);
      auto cacheType = mapping(dir + "type");
      auto equivStr = mapping(dir + "shared_cpu_list");
      if (cacheType.empty() || equivStr.empty()) {
        // no more caches
        break;
      }
      if (cacheType[0] == 'I') {
        // cacheType in { "Data", "Instruction", "Unified" }.  skip icache
        continue;
      }
      auto equiv = parseLeadingNumber(equivStr);
      auto level = levels.size();
      levels.push_back(equiv);

      if (equiv == cpu) {
        // we only want to count the equiv classes once, so we do it when
        // we first encounter them
        while (numCachesByLevel.size() <= level) {
          numCachesByLevel.push_back(0);
        }
        numCachesByLevel[level]++;
      }
    }

    if (levels.empty()) {
      // no levels at all for this cpu, we must be done
      break;
    }
    equivClassesByCpu.emplace_back(std::move(levels));
    cpus.push_back(cpu);
  }

  if (cpus.empty()) {
    throw std::runtime_error("unable to load cache sharing info");
  }

  std::sort(cpus.begin(), cpus.end(), [&](size_t lhs, size_t rhs) -> bool {
    // sort first by equiv class of cache with highest index,
    // direction doesn't matter.  If different cpus have
    // different numbers of caches then this code might produce
    // a sub-optimal ordering, but it won't crash
    auto& lhsEquiv = equivClassesByCpu[lhs];
    auto& rhsEquiv = equivClassesByCpu[rhs];
    for (ssize_t i = ssize_t(std::min(lhsEquiv.size(), rhsEquiv.size())) - 1;
         i >= 0;
         --i) {
      auto idx = size_t(i);
      if (lhsEquiv[idx] != rhsEquiv[idx]) {
        return lhsEquiv[idx] < rhsEquiv[idx];
      }
    }

    // break ties deterministically by cpu
    return lhs < rhs;
  });

  // the cpus are now sorted by locality, with neighboring entries closer
  // to each other than entries that are far away.  For striping we want
  // the inverse map, since we are starting with the cpu
  std::vector<size_t> indexes(cpus.size());
  for (size_t i = 0; i < cpus.size(); ++i) {
    indexes[cpus[i]] = i;
  }

  return CacheLocality{
      cpus.size(), std::move(numCachesByLevel), std::move(indexes)};
}

CacheLocality CacheLocality::readFromSysfs() {
  return readFromSysfsTree([](std::string name) {
    std::ifstream xi(name.c_str());
    std::string rv;
    std::getline(xi, rv);
    return rv;
  });
}

static bool procCpuinfoLineRelevant(std::string const& line) {
  return line.size() > 4 && (line[0] == 'p' || line[0] == 'c');
}

CacheLocality CacheLocality::readFromProcCpuinfoLines(
    std::vector<std::string> const& lines) {
  size_t physicalId = 0;
  size_t coreId = 0;
  std::vector<std::tuple<size_t, size_t, size_t>> cpus;
  size_t maxCpu = 0;
  for (auto iter = lines.rbegin(); iter != lines.rend(); ++iter) {
    auto& line = *iter;
    if (!procCpuinfoLineRelevant(line)) {
      continue;
    }

    auto sepIndex = line.find(':');
    if (sepIndex == std::string::npos || sepIndex + 2 > line.size()) {
      continue;
    }
    auto arg = line.substr(sepIndex + 2);

    // "physical id" is socket, which is the most important locality
    // context.
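    //
    // For illustration, the lines of one /proc/cpuinfo record that matter
    // here look roughly like this (hypothetical values; each record also
    // carries many other keys, omitted below):
    //
    //   processor       : 5
    //   physical id     : 0
    //   core id         : 2
    //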
"core id" is a real core, so two "processor" entries with // the same physical id and core id are hyperthreads of each other. // "processor" is the top line of each record, so when we hit it in // the reverse order then we can emit a record. if (line.find("physical id") == 0) { physicalId = parseLeadingNumber(arg); } else if (line.find("core id") == 0) { coreId = parseLeadingNumber(arg); } else if (line.find("processor") == 0) { auto cpu = parseLeadingNumber(arg); maxCpu = std::max(cpu, maxCpu); cpus.emplace_back(physicalId, coreId, cpu); } } if (cpus.empty()) { throw std::runtime_error("no CPUs parsed from /proc/cpuinfo"); } if (maxCpu != cpus.size() - 1) { throw std::runtime_error( "offline CPUs not supported for /proc/cpuinfo cache locality source"); } std::sort(cpus.begin(), cpus.end()); size_t cpusPerCore = 1; while (cpusPerCore < cpus.size() && std::get<0>(cpus[cpusPerCore]) == std::get<0>(cpus[0]) && std::get<1>(cpus[cpusPerCore]) == std::get<1>(cpus[0])) { ++cpusPerCore; } // we can't tell the real cache hierarchy from /proc/cpuinfo, but it // works well enough to assume there are 3 levels, L1 and L2 per-core // and L3 per socket std::vector numCachesByLevel; numCachesByLevel.push_back(cpus.size() / cpusPerCore); numCachesByLevel.push_back(cpus.size() / cpusPerCore); numCachesByLevel.push_back(std::get<0>(cpus.back()) + 1); std::vector indexes(cpus.size()); for (size_t i = 0; i < cpus.size(); ++i) { indexes[std::get<2>(cpus[i])] = i; } return CacheLocality{ cpus.size(), std::move(numCachesByLevel), std::move(indexes)}; } CacheLocality CacheLocality::readFromProcCpuinfo() { std::vector lines; { std::ifstream xi("/proc/cpuinfo"); if (xi.fail()) { throw std::runtime_error("unable to open /proc/cpuinfo"); } char buf[8192]; while (xi.good() && lines.size() < 20000) { xi.getline(buf, sizeof(buf)); std::string str(buf); if (procCpuinfoLineRelevant(str)) { lines.emplace_back(std::move(str)); } } } return readFromProcCpuinfoLines(lines); } CacheLocality CacheLocality::uniform(size_t numCpus) { CacheLocality rv; rv.numCpus = numCpus; // one cache shared by all cpus rv.numCachesByLevel.push_back(numCpus); // no permutations in locality index mapping for (size_t cpu = 0; cpu < numCpus; ++cpu) { rv.localityIndexByCpu.push_back(cpu); } return rv; } ////////////// Getcpu Getcpu::Func Getcpu::resolveVdsoFunc() { #if !defined(FOLLY_HAVE_LINUX_VDSO) || defined(FOLLY_SANITIZE_MEMORY) return nullptr; #else void* h = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD); if (h == nullptr) { return nullptr; } auto func = Getcpu::Func(dlsym(h, "__vdso_getcpu")); if (func == nullptr) { // technically a null result could either be a failure or a successful // lookup of a symbol with the null value, but the second can't actually // happen for this symbol. No point holding the handle forever if // we don't need the code dlclose(h); } return func; #endif } #ifdef FOLLY_CL_USE_FOLLY_TLS /////////////// SequentialThreadId template struct SequentialThreadId; #endif /////////////// AccessSpreader template struct AccessSpreader; SimpleAllocator::SimpleAllocator(size_t allocSize, size_t sz) : allocSize_{allocSize}, sz_(sz) {} SimpleAllocator::~SimpleAllocator() { std::lock_guard g(m_); for (auto& block : blocks_) { folly::aligned_free(block); } } void* SimpleAllocator::allocateHard() { // Allocate a new slab. 
  mem_ = static_cast<uint8_t*>(folly::aligned_malloc(allocSize_, allocSize_));
  if (!mem_) {
    throw_exception<std::bad_alloc>();
  }
  end_ = mem_ + allocSize_;
  blocks_.push_back(mem_);

  // Install a pointer to ourselves as the allocator.
  *reinterpret_cast<SimpleAllocator**>(mem_) = this;
  static_assert(
      max_align_v >= sizeof(SimpleAllocator*), "alignment too small");
  mem_ += std::min(sz_, max_align_v);

  // New allocation.
  auto mem = mem_;
  mem_ += sz_;
  assert(intptr_t(mem) % 128 != 0);
  return mem;
}

} // namespace folly
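
// Illustrative usage sketch (not part of this file; fakeSysfs and the
// lambda are hypothetical).  Because readFromSysfsTree() takes the sysfs
// reader as a parameter, the locality parsing can be exercised with an
// in-memory stand-in for /sys, e.g. two cpus sharing one data cache:
//
//   std::map<std::string, std::string> fakeSysfs = {
//       {"/sys/devices/system/cpu/cpu0/cache/index0/type", "Data"},
//       {"/sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_list", "0-1"},
//       {"/sys/devices/system/cpu/cpu1/cache/index0/type", "Data"},
//       {"/sys/devices/system/cpu/cpu1/cache/index0/shared_cpu_list", "0-1"},
//   };
//   auto locality = folly::CacheLocality::readFromSysfsTree(
//       [&](std::string name) {
//         auto iter = fakeSysfs.find(name);
//         return iter == fakeSysfs.end() ? std::string() : iter->second;
//       });
//   // locality.numCpus == 2, locality.numCachesByLevel == {1}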