
  伪共享(False Sharing)是并发编程中的一种性能问题,它发生在多个线程并发地访问并修改同一缓存行中的不同数据时。


  CPU缓存与内存之间的数据是以块(通常称为缓存行,cache line)为单位存储和传输的,而不是以单个字节或单个变量为单位。这是因为一次性从内存中获取一大块数据(例如64字节)比逐字节获取的效率更高。一旦某个线程需要访问内存中的某个地址,包含该地址的整个缓存行就会被加载到运行该线程的CPU核心的缓存中。因此,即使两个线程修改的是不同的变量,只要这些变量位于同一缓存行,一个核心的写入就会使另一个核心缓存中的该缓存行失效,这就是伪共享带来性能损失的原因。



    3.使用线程本地存储(Thread Local Storage):确保每个线程都有自己的数据副本,从而避免伪共享。



  方法1和方法2从本质上来说都是对数据进行填充,尽可能避免共享的数据在同一个缓存行中,从而减少伪共享的可能性。两者都是典型的空间换时间,只不过在实现上略有区别。 这里参考CppReference中的示例代码,实现一种数据填充的方法并进行有效性验证,代码如下:

#include <atomic>
#include <chrono>
#include <cstddef>
#include <iomanip>
#include <iostream>
#include <mutex>
#include <new>
#include <thread>
// NOTE(review): file-scope using-directives; most names below are still
// written with explicit std:: / std::chrono:: qualification.
using namespace std;
using namespace chrono;
// Serializes the worker threads' writes to std::cout.
std::mutex cout_mutex;
// Maximum cache line size in bytes
constexpr int MAX_CACHE_LINE_SIZE = 64;
// Unpadded wrapper around a 64-bit atomic counter: it carries no bytes beyond
// the atomic itself, so adjacent AtomWrapper objects can land on the same
// cache line.
struct AtomWrapper {
    std::atomic_uint64_t value;

    // Starts the counter at zero.
    AtomWrapper() : value(0) {}
    // Starts the counter at v.
    AtomWrapper(uint64_t v) : value(v) {}

    // Not copyable, like the std::atomic it wraps.
    AtomWrapper(const AtomWrapper &other) = delete;
    AtomWrapper &operator=(const AtomWrapper &other) = delete;
};
// Padded wrapper around a 64-bit atomic counter: trailing bytes grow the object
// to MAX_CACHE_LINE_SIZE, so the atomics of two adjacent wrappers are a full
// cache line apart.
struct AtomWrapperWithPadding {
    std::atomic_uint64_t value;

    // Starts the counter at zero.
    AtomWrapperWithPadding() : value(0) {}
    // Starts the counter at v.
    AtomWrapperWithPadding(uint64_t v) : value(v) {}

    // Not copyable, like the std::atomic it wraps.
    AtomWrapperWithPadding(const AtomWrapperWithPadding &other) = delete;
    AtomWrapperWithPadding &operator=(const AtomWrapperWithPadding &other) = delete;
    // padding to max cache line size, 64byte for x86
    unsigned char padding[MAX_CACHE_LINE_SIZE - sizeof(std::atomic_uint64_t)] = {};
};

// The padding only works if the wrapper fills exactly one cache line.
static_assert(sizeof(AtomWrapperWithPadding) == MAX_CACHE_LINE_SIZE,
              "AtomWrapperWithPadding must occupy exactly one cache line");

// Number of fetch_add operations each worker thread performs; change it to
// tune how long the benchmark runs.
constexpr int max_write_iterations{10'000'000}; // the benchmark time tuning

// Both counters sit back to back inside one cache-line-aligned object, so the
// two worker threads (one writing x, one writing y) hit the same cache line.
struct alignas(MAX_CACHE_LINE_SIZE)
OneCacheLiner        // occupies one cache line
{
    AtomWrapper x{};
    AtomWrapper y{};
} oneCacheLiner;     // global instance updated by oneCacheLinerThread()

// Each counter is padded to a full cache line, so x.value and y.value are
// MAX_CACHE_LINE_SIZE bytes apart and never share a line.
struct TwoCacheLiner // occupies two cache lines
{
    AtomWrapperWithPadding x{};
    AtomWrapperWithPadding y{};
} twoCacheLiner;     // global instance updated by twoCacheLinerThread()

// Returns the current time point used to time the benchmark loops.
// steady_clock is monotonic, so the difference between two calls is never
// distorted by wall-clock adjustments (high_resolution_clock may be an alias
// of system_clock, which is not steady).
inline auto now() noexcept { return std::chrono::steady_clock::now(); }

template<bool xy>
void oneCacheLinerThread()
    const auto start { now() };

    for (uint64_t count{}; count != max_write_iterations; ++count)
        if constexpr (xy)
            oneCacheLiner.x.value.fetch_add(1, std::memory_order_relaxed);
            oneCacheLiner.y.value.fetch_add(1, std::memory_order_relaxed);

    const std::chrono::duration<double, std::milli> elapsed { now() - start };
    std::lock_guard lk{cout_mutex};
    std::cout << "oneCacheLinerThread() spent " << elapsed.count() << " ms\n";
    if constexpr (xy)
        oneCacheLiner.x.value = elapsed.count();
        oneCacheLiner.y.value = elapsed.count();

template<bool xy>
void twoCacheLinerThread()
    const auto start { now() };

    for (uint64_t count{}; count != max_write_iterations; ++count)
        if constexpr (xy)
            twoCacheLiner.x.value.fetch_add(1, std::memory_order_relaxed);
            twoCacheLiner.y.value.fetch_add(1, std::memory_order_relaxed);

    const std::chrono::duration<double, std::milli> elapsed { now() - start };
    std::lock_guard lk{cout_mutex};
    std::cout << "twoCacheLinerThread() spent " << elapsed.count() << " ms\n";
    if constexpr (xy)
        twoCacheLiner.x.value = elapsed.count();
        twoCacheLiner.y.value = elapsed.count();

int main(void)
    constexpr int max_runs{4};

    int oneCacheLiner_average{0};
    for (auto i{0}; i != max_runs; ++i)
        std::thread th1{oneCacheLinerThread<0>};
        std::thread th2{oneCacheLinerThread<1>};
        th1.join(); th2.join();
        oneCacheLiner_average += oneCacheLiner.x.value + oneCacheLiner.y.value;
    std::cout << "Average T1 time: "
              << (oneCacheLiner_average / max_runs / 2) << " ms\n\n";

    int twoCacheLiner_average{0};
    for (auto i{0}; i != max_runs; ++i)
        std::thread th1{twoCacheLinerThread<0>};
        std::thread th2{twoCacheLinerThread<1>};
        th1.join(); th2.join();
        twoCacheLiner_average += twoCacheLiner.x.value + twoCacheLiner.y.value;
    std::cout << "Average T2 time: "
              << (twoCacheLiner_average / max_runs / 2) << " ms\n\n"
              << "Ratio T1/T2:~ "
              << 1.0 * oneCacheLiner_average / twoCacheLiner_average << '\n';

    return 0;


oneCacheLinerThread() spent 229.381 ms
oneCacheLinerThread() spent 229.439 ms
oneCacheLinerThread() spent 206.727 ms
oneCacheLinerThread() spent 208.405 ms
oneCacheLinerThread() spent 215.631 ms
oneCacheLinerThread() spent 217.117 ms
oneCacheLinerThread() spent 215.271 ms
oneCacheLinerThread() spent 216.422 ms
Average T1 time: 216 ms

twoCacheLinerThread() spent 18.553 ms
twoCacheLinerThread() spent 19.2109 ms
twoCacheLinerThread() spent 18.489 ms
twoCacheLinerThread() spent 18.965 ms
twoCacheLinerThread() spent 18.6716 ms
twoCacheLinerThread() spent 19.2982 ms
twoCacheLinerThread() spent 18.9782 ms
twoCacheLinerThread() spent 19.0659 ms
Average T2 time: 18 ms

Ratio T1/T2:~ 11.8027



#include <atomic>
#include <chrono>
#include <cstddef>
#include <iomanip>
#include <iostream>
#include <mutex>
#include <new>
#include <thread>

// Use the standard interference-size constants when the library provides them;
// otherwise define 64-byte fallbacks under the same names.
#ifdef __cpp_lib_hardware_interference_size
    using std::hardware_constructive_interference_size;
    using std::hardware_destructive_interference_size;
#else
    // 64 bytes on x86-64 │ L1_CACHE_BYTES │ L1_CACHE_SHIFT │ __cacheline_aligned │ ...
    constexpr std::size_t hardware_constructive_interference_size = 64;
    constexpr std::size_t hardware_destructive_interference_size = 64;
#endif

// Serializes the worker threads' writes to std::cout.
std::mutex cout_mutex;

// Number of fetch_add operations each worker thread performs; change it to
// tune how long the benchmark runs.
constexpr int max_write_iterations{10'000'000}; // the benchmark time tuning

// Both counters sit back to back in one object aligned to the constructive
// interference size, so the two worker threads (one writing x, one writing y)
// hit the same cache line.
struct alignas(hardware_constructive_interference_size)
OneCacheLiner // occupies one cache line
{
    std::atomic_uint64_t x{};
    std::atomic_uint64_t y{};
} oneCacheLiner; // global instance updated by oneCacheLinerThread()

// Each counter is aligned to the destructive interference size, which places
// x and y on separate cache lines.
struct TwoCacheLiner // occupies two cache lines
{
    alignas(hardware_destructive_interference_size) std::atomic_uint64_t x{};
    alignas(hardware_destructive_interference_size) std::atomic_uint64_t y{};
} twoCacheLiner; // global instance updated by twoCacheLinerThread()

// Returns the current time point used to time the benchmark loops.
// steady_clock is monotonic, so the difference between two calls is never
// distorted by wall-clock adjustments (high_resolution_clock may be an alias
// of system_clock, which is not steady).
inline auto now() noexcept { return std::chrono::steady_clock::now(); }

template<bool xy>
void oneCacheLinerThread()
    const auto start { now() };

    for (uint64_t count{}; count != max_write_iterations; ++count)
        if constexpr (xy)
            oneCacheLiner.x.fetch_add(1, std::memory_order_relaxed);
            oneCacheLiner.y.fetch_add(1, std::memory_order_relaxed);

    const std::chrono::duration<double, std::milli> elapsed { now() - start };
    std::lock_guard lk{cout_mutex};
    std::cout << "oneCacheLinerThread() spent " << elapsed.count() << " ms\n";
    if constexpr (xy)
        oneCacheLiner.x = elapsed.count();
        oneCacheLiner.y = elapsed.count();

template<bool xy>
void twoCacheLinerThread()
    const auto start { now() };

    for (uint64_t count{}; count != max_write_iterations; ++count)
        if constexpr (xy)
            twoCacheLiner.x.fetch_add(1, std::memory_order_relaxed);
            twoCacheLiner.y.fetch_add(1, std::memory_order_relaxed);

    const std::chrono::duration<double, std::milli> elapsed { now() - start };
    std::lock_guard lk{cout_mutex};
    std::cout << "twoCacheLinerThread() spent " << elapsed.count() << " ms\n";
    if constexpr (xy)
        twoCacheLiner.x = elapsed.count();
        twoCacheLiner.y = elapsed.count();

int main()
    std::cout << "__cpp_lib_hardware_interference_size "
#   ifdef __cpp_lib_hardware_interference_size
        "= " << __cpp_lib_hardware_interference_size << '\n';
#   else
        "is not defined, use " << hardware_destructive_interference_size
                               << " as fallback\n";
#   endif

    std::cout << "hardware_destructive_interference_size == "
              << hardware_destructive_interference_size << '\n'
              << "hardware_constructive_interference_size == "
              << hardware_constructive_interference_size << "\n\n"
              << std::fixed << std::setprecision(2)
              << "sizeof( OneCacheLiner ) == " << sizeof( OneCacheLiner ) << '\n'
              << "sizeof( TwoCacheLiner ) == " << sizeof( TwoCacheLiner ) << "\n\n";

    constexpr int max_runs{4};

    int oneCacheLiner_average{0};
    for (auto i{0}; i != max_runs; ++i)
        std::thread th1{oneCacheLinerThread<0>};
        std::thread th2{oneCacheLinerThread<1>};
        th1.join(); th2.join();
        oneCacheLiner_average += oneCacheLiner.x + oneCacheLiner.y;
    std::cout << "Average T1 time: "
              << (oneCacheLiner_average / max_runs / 2) << " ms\n\n";

    int twoCacheLiner_average{0};
    for (auto i{0}; i != max_runs; ++i)
        std::thread th1{twoCacheLinerThread<0>};
        std::thread th2{twoCacheLinerThread<1>};
        th1.join(); th2.join();
        twoCacheLiner_average += twoCacheLiner.x + twoCacheLiner.y;
    std::cout << "Average T2 time: "
              << (twoCacheLiner_average / max_runs / 2) << " ms\n\n"
              << "Ratio T1/T2:~ "
              << 1.0 * oneCacheLiner_average / twoCacheLiner_average << '\n';


__cpp_lib_hardware_interference_size is not defined, use 64 as fallback
hardware_destructive_interference_size == 64
hardware_constructive_interference_size == 64

sizeof( OneCacheLiner ) == 64
sizeof( TwoCacheLiner ) == 128

oneCacheLinerThread() spent 195.18 ms
oneCacheLinerThread() spent 198.17 ms
oneCacheLinerThread() spent 225.23 ms
oneCacheLinerThread() spent 227.18 ms
oneCacheLinerThread() spent 223.31 ms
oneCacheLinerThread() spent 223.41 ms
oneCacheLinerThread() spent 215.79 ms
oneCacheLinerThread() spent 217.10 ms
Average T1 time: 215 ms

twoCacheLinerThread() spent 19.21 ms
twoCacheLinerThread() spent 19.18 ms
twoCacheLinerThread() spent 19.41 ms
twoCacheLinerThread() spent 19.69 ms
twoCacheLinerThread() spent 19.39 ms
twoCacheLinerThread() spent 19.47 ms
twoCacheLinerThread() spent 19.44 ms
twoCacheLinerThread() spent 20.32 ms
Average T2 time: 19 ms

Ratio T1/T2:~ 11.26


