As we know from a previous answer to Does it make any sense instruction LFENCE in processors x86/x86_64? that we can not use SFENCE instead of MFENCE
In general MFENCE != SFENCE + LFENCE. For example the code below, when compiled with -DBROKEN, fails on some Westmere and Sandy Bridge systems but appears to work on Ryzen. In fact on AMD systems just an SFENCE seems to be sufficient.
#include
#include
#include
#include
using namespace std;
#define ITERATIONS (10000000)
class minircu {
public:
minircu() : rv_(0), wv_(0) {}
class lock_guard {
minircu& _r;
const std::size_t _id;
public:
lock_guard(minircu& r, std::size_t id) : _r(r), _id(id) { _r.rlock(_id); }
~lock_guard() { _r.runlock(_id); }
};
void synchronize() {
wv_.store(-1, std::memory_order_seq_cst);
while(rv_.load(std::memory_order_relaxed) & wv_.load(std::memory_order_acquire));
}
private:
void rlock(std::size_t id) {
rab_[id].store(1, std::memory_order_relaxed);
#ifndef BROKEN
__asm__ __volatile__ ("mfence;" : : : "memory");
#else
__asm__ __volatile__ ("sfence; lfence;" : : : "memory");
#endif
}
void runlock(std::size_t id) {
rab_[id].store(0, std::memory_order_release);
wab_[id].store(0, std::memory_order_release);
}
union alignas(64) {
std::atomic rv_;
std::atomic rab_[8];
};
union alignas(8) {
std::atomic wv_;
std::atomic wab_[8];
};
};
minircu r;
std::atomic shared_values[2];
std::atomic*> pvalue(shared_values);
std::atomic total(0);
void r_thread(std::size_t id) {
uint64_t subtotal = 0;
for(size_t i = 0; i < ITERATIONS; ++i) {
minircu::lock_guard l(r, id);
subtotal += (*pvalue).load(memory_order_acquire);
}
total += subtotal;
}
void wr_thread() {
for (size_t i = 1; i < (ITERATIONS/10); ++i) {
std::atomic* o = pvalue.load(memory_order_relaxed);
std::atomic* p = shared_values + i % 2;
p->store(1, memory_order_release);
pvalue.store(p, memory_order_release);
r.synchronize();
o->store(0, memory_order_relaxed); // should not be visible to readers
}
}
int main(int argc, char* argv[]) {
std::vector vec_thread;
shared_values[0] = shared_values[1] = 1;
std::size_t readers = (argc > 1) ? ::atoi(argv[1]) : 8;
if (readers > 8) {
std::cout << "maximum number of readers is " << 8 << std::endl; return 0;
} else
std::cout << readers << " readers" << std::endl;
vec_thread.emplace_back( [=]() { wr_thread(); } );
for(size_t i = 0; i < readers; ++i)
vec_thread.emplace_back( [=]() { r_thread(i); } );
for(auto &i: vec_thread) i.join();
std::cout << "total = " << total << ", expecting " << readers * ITERATIONS << std::endl;
return 0;
}