/*
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <folly/detail/MemoryIdler.h>

#include <folly/Logging.h>
#include <folly/Portability.h>
#include <folly/ScopeGuard.h>
#include <folly/concurrency/CacheLocality.h>
#include <folly/memory/MallctlHelper.h>
#include <folly/memory/Malloc.h>
#include <folly/portability/PThread.h>
#include <folly/portability/SysMman.h>
#include <folly/portability/Unistd.h>
#include <folly/synchronization/CallOnce.h>

#include <algorithm>
#include <climits>
#include <utility>

namespace folly {
namespace detail {

AtomicStruct<std::chrono::steady_clock::duration>
    MemoryIdler::defaultIdleTimeout(std::chrono::seconds(5));

void MemoryIdler::flushLocalMallocCaches() {
  if (!usingJEMalloc()) {
    return;
  }
  if (!mallctl || !mallctlnametomib || !mallctlbymib) {
    FB_LOG_EVERY_MS(ERROR, 10000) << "mallctl* weak link failed";
    return;
  }

  try {
    // Not using mallctlCall as this will fail if tcache is disabled.
    mallctl("thread.tcache.flush", nullptr, nullptr, nullptr, 0);

    // By default jemalloc has 4 arenas per cpu, and then assigns each
    // thread to one of those arenas.  This means that in any service
    // that doesn't perform a lot of context switching, the chances that
    // another thread will be using the current thread's arena (and hence
    // doing the appropriate dirty-page purging) are low.  Some well-tuned
    // configurations (such as that used by hhvm) use fewer arenas
    // and then pin threads to avoid contended access.  In that case,
    // purging the arenas is counter-productive.  We use the heuristic
    // that if narenas <= 2 * num_cpus then we shouldn't do anything here,
    // which detects when narenas has been reduced from the default.
    unsigned narenas;
    unsigned arenaForCurrent;
    size_t mib[3];
    size_t miblen = 3;

    mallctlRead("opt.narenas", &narenas);
    mallctlRead("thread.arena", &arenaForCurrent);
    if (narenas > 2 * CacheLocality::system().numCpus &&
        mallctlnametomib("arena.0.purge", mib, &miblen) == 0) {
      mib[1] = static_cast<size_t>(arenaForCurrent);
      mallctlbymib(mib, miblen, nullptr, nullptr, nullptr, 0);
    }
  } catch (const std::runtime_error& ex) {
    FB_LOG_EVERY_MS(WARNING, 10000) << ex.what();
  }
}
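
// Illustrative usage sketch (not part of folly): MemoryIdler is meant to be
// driven by code that notices a thread has been idle for longer than
// defaultIdleTimeout and then releases per-thread resources.  The event loop
// below, `waitForWork()`, and the 1 KiB retain value are hypothetical
// stand-ins used only to show the intended call pattern.
//
//   void workerIdleLoop() {
//     auto idleSince = std::chrono::steady_clock::now();
//     for (;;) {
//       if (waitForWork(std::chrono::seconds(1))) {
//         idleSince = std::chrono::steady_clock::now();  // busy again
//         continue;
//       }
//       auto timeout = MemoryIdler::defaultIdleTimeout.load();
//       if (std::chrono::steady_clock::now() - idleSince >= timeout) {
//         MemoryIdler::flushLocalMallocCaches();   // give tcache back to arena
//         MemoryIdler::unmapUnusedStack(1 << 10);  // keep the hot 1 KiB
//         idleSince = std::chrono::steady_clock::now();
//       }
//     }
//   }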

// Stack madvise isn't Linux or glibc specific, but the system calls
// and arithmetic (and bug compatibility) are not portable.  The set of
// platforms could be increased if it was useful.
#if (FOLLY_X64 || FOLLY_PPC64) && defined(_GNU_SOURCE) && \
    defined(__linux__) && !FOLLY_MOBILE && !FOLLY_SANITIZE_ADDRESS

static FOLLY_TLS uintptr_t tls_stackLimit;
static FOLLY_TLS size_t tls_stackSize;

static size_t pageSize() {
  static const size_t s_pageSize = sysconf(_SC_PAGESIZE);
  return s_pageSize;
}

static void fetchStackLimits() {
  int err;
  pthread_attr_t attr;
  if ((err = pthread_getattr_np(pthread_self(), &attr))) {
    // some restricted environments can't access /proc
    static folly::once_flag flag;
    folly::call_once(flag, [err]() {
      LOG(WARNING) << "pthread_getattr_np failed errno=" << err;
    });

    tls_stackSize = 1;
    return;
  }
  SCOPE_EXIT {
    pthread_attr_destroy(&attr);
  };

  void* addr;
  size_t rawSize;
  if ((err = pthread_attr_getstack(&attr, &addr, &rawSize))) {
    // unexpected, but it is better to continue in prod than do nothing
    FB_LOG_EVERY_MS(ERROR, 10000) << "pthread_attr_getstack error " << err;
    assert(false);
    tls_stackSize = 1;
    return;
  }

  if (rawSize >= (1ULL << 32)) {
    // Avoid unmapping huge swaths of memory if there is an insane
    // stack size.  The boundary of sanity is somewhat arbitrary: 4GB.
    //
    // If we went into /proc to find the actual contiguous mapped pages
    // before unmapping we wouldn't care about the stack size at all,
    // but our current strategy is to unmap the entire range that might
    // be used for the stack even if it hasn't been fully faulted-in.
    //
    // Very large stack size is a bug (hence the assert), but we can
    // carry on if we are in prod.
    FB_LOG_EVERY_MS(ERROR, 10000)
        << "pthread_attr_getstack returned insane stack size " << rawSize;
    assert(false);
    tls_stackSize = 1;
    return;
  }

  assert(addr != nullptr);
  assert(rawSize >= PTHREAD_STACK_MIN);

  // glibc subtracts guard page from stack size, even though pthread docs
  // seem to imply the opposite
  size_t guardSize;
  if (pthread_attr_getguardsize(&attr, &guardSize) != 0) {
    guardSize = 0;
  }
  assert(rawSize > guardSize);

  // stack goes down, so guard page adds to the base addr
  tls_stackLimit = reinterpret_cast<uintptr_t>(addr) + guardSize;
  tls_stackSize = rawSize - guardSize;

  assert((tls_stackLimit & (pageSize() - 1)) == 0);
}

FOLLY_NOINLINE static uintptr_t getStackPtr() {
  char marker;
  auto rv = reinterpret_cast<uintptr_t>(&marker);
  return rv;
}

void MemoryIdler::unmapUnusedStack(size_t retain) {
  if (tls_stackSize == 0) {
    fetchStackLimits();
  }
  if (tls_stackSize <= std::max(static_cast<size_t>(1), retain)) {
    // covers both missing stack info, and impossibly large retain
    return;
  }

  auto sp = getStackPtr();
  assert(sp >= tls_stackLimit);
  assert(sp - tls_stackLimit < tls_stackSize);

  auto end = (sp - retain) & ~(pageSize() - 1);
  if (end <= tls_stackLimit) {
    // no pages are eligible for unmapping
    return;
  }

  size_t len = end - tls_stackLimit;
  assert((len & (pageSize() - 1)) == 0);
  if (madvise((void*)tls_stackLimit, len, MADV_DONTNEED) != 0) {
    // It is likely that the stack vma hasn't been fully grown.  In this
    // case madvise will apply dontneed to the present vmas, then return
    // errno of ENOMEM.
    // If thread stack pages are backed by locked or huge pages, madvise will
    // fail with EINVAL.  (EINVAL may also be returned if the address or
    // length are bad.)  Warn in debug mode, since MemoryIdler may not
    // function as expected.
    // We can also get an EAGAIN, theoretically.
    PLOG_IF(WARNING, kIsDebug && errno == EINVAL) << "madvise failed";
    assert(errno == EAGAIN || errno == ENOMEM || errno == EINVAL);
  }
}

#else

void MemoryIdler::unmapUnusedStack(size_t /* retain */) {}

#endif

} // namespace detail
} // namespace folly
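
// Worked example of the address arithmetic in unmapUnusedStack above, with
// illustrative numbers (not real output):
//
//   pageSize()      = 0x1000 (4 KiB)
//   tls_stackLimit  = 0x7f0000000000   lowest usable stack address
//   sp              = 0x7f00007f8a30   current stack pointer (stack grows down)
//   retain          = 0x400  (1 KiB)
//
//   end = (sp - retain) & ~(pageSize() - 1)
//       = 0x7f00007f8630 & ~0xfff
//       = 0x7f00007f8000                rounded down to a page boundary,
//                                       safely below the live frames
//   len = end - tls_stackLimit = 0x7f8000 bytes
//
// madvise(tls_stackLimit, len, MADV_DONTNEED) then releases the physical
// pages backing [tls_stackLimit, end) while leaving the mapping itself in
// place, so later stack growth simply faults the pages back in on demand.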