Why is (or isn't?) SFENCE + LFENCE equivalent to MFENCE?

我在风中等你 2020-11-28 11:17

As we know from a previous answer to "Does it make any sense instruction LFENCE in processors x86/x86_64?", we cannot use SFENCE instead of MFENCE. But is SFENCE followed by LFENCE equivalent to MFENCE, or is it not?

3 Answers
  •  眼角桃花
    2020-11-28 12:03

    In general MFENCE != SFENCE + LFENCE. For example, the code below fails on some Westmere and Sandy Bridge systems when compiled with -DBROKEN, but appears to work on Ryzen. In fact, on AMD systems an SFENCE alone seems to be sufficient.

    #include <atomic>
    #include <thread>
    #include <vector>
    #include <iostream>
    #include <cstdlib>
    using namespace std;
    
    #define ITERATIONS (10000000)
    class minircu {
            public:
                    minircu() : rv_(0), wv_(0) {}
                    class lock_guard {
                            minircu& _r;
                            const std::size_t _id;
                            public:
                            lock_guard(minircu& r, std::size_t id) : _r(r), _id(id) { _r.rlock(_id); }
                            ~lock_guard() { _r.runlock(_id); }
                    };
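                    // Writer-side grace period: set all writer-flag bytes (wv_ = all ones), then
                    // spin while any byte position has both its reader flag (rv_/rab_) and its
                    // writer flag (wv_/wab_) set; readers clear both flags in runlock().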
                    void synchronize() {
                            wv_.store(-1, std::memory_order_seq_cst);
                            while(rv_.load(std::memory_order_relaxed) & wv_.load(std::memory_order_acquire));
                    }
            private:
                    void rlock(std::size_t id) {
                            rab_[id].store(1, std::memory_order_relaxed);
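                            // The reader-flag store above must become globally visible before the
                            // loads inside the critical section: that requires a StoreLoad barrier,
                            // which MFENCE provides but (on Intel) SFENCE followed by LFENCE does not.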
    #ifndef BROKEN
                            __asm__ __volatile__ ("mfence;" : : : "memory");
    #else
                            __asm__ __volatile__ ("sfence; lfence;" : : : "memory");
    #endif
                    }
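                    // Leaving the critical section: clear this reader's flag byte and its
                    // writer-acknowledge byte so the spin in synchronize() can make progress.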
                    void runlock(std::size_t id) {
                            rab_[id].store(0, std::memory_order_release);
                            wab_[id].store(0, std::memory_order_release);
                    }
                    // rv_ aliases the eight per-reader flag bytes rab_[], so a single
                    // 64-bit load can check all reader flags at once (same for wv_/wab_).
                    union alignas(64) {
                            std::atomic<uint64_t>           rv_;
                            std::atomic<unsigned char>      rab_[8];
                    };
                    union alignas(8) {
                            std::atomic<uint64_t>           wv_;
                            std::atomic<unsigned char>      wab_[8];
                    };
    };
    
    minircu r;
    
    std::atomic<int> shared_values[2];
    std::atomic<std::atomic<int>*> pvalue(shared_values);
    std::atomic<uint64_t> total(0);
    
    void r_thread(std::size_t id) {
        uint64_t subtotal = 0;
        for(size_t i = 0; i < ITERATIONS; ++i) {
                    minircu::lock_guard l(r, id);
                    subtotal += (*pvalue).load(memory_order_acquire);
        }
        total += subtotal;
    }
    
    void wr_thread() {
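        // Writer: publish a new shared value, wait for a grace period, then zero
        // the old slot; if a reader can still observe the zeroed slot after
        // synchronize() returns, the final total falls short of the expected count.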
        for (size_t i = 1; i < (ITERATIONS/10); ++i) {
                    std::atomic* o = pvalue.load(memory_order_relaxed);
                    std::atomic* p = shared_values + i % 2;
                    p->store(1, memory_order_release);
                    pvalue.store(p, memory_order_release);
    
                    r.synchronize();
                    o->store(0, memory_order_relaxed); // should not be visible to readers
        }
    }
    
    int main(int argc, char* argv[]) {
        std::vector<std::thread> vec_thread;
        shared_values[0] = shared_values[1] = 1;
        std::size_t readers = (argc > 1) ? ::atoi(argv[1]) : 8;
        if (readers > 8) {
            std::cout << "maximum number of readers is " << 8 << std::endl; return 0;
        } else
            std::cout << readers << " readers" << std::endl;
    
        vec_thread.emplace_back( [=]() { wr_thread(); } );
        for(size_t i = 0; i < readers; ++i)
            vec_thread.emplace_back( [=]() { r_thread(i); } );
        for(auto &i: vec_thread) i.join();
    
        std::cout << "total = " << total << ", expecting " << readers * ITERATIONS << std::endl;
        return 0;
    }
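
    The underlying difference is StoreLoad ordering: MFENCE orders an earlier store before a later load, while, as the test above suggests, SFENCE followed by LFENCE need not force the store out of the store buffer before the load executes. Below is a minimal store-buffer litmus test of my own (a sketch, not part of the program above; the names x, y, r1, r2 and the iteration count are just for illustration, and how often the reordering shows up depends on the machine). With "mfence" the r1 == 0 && r2 == 0 outcome should never be reported; with "sfence; lfence" it can appear on Intel hardware.

    #include <atomic>
    #include <cstdio>
    #include <thread>

    std::atomic<int> x(0), y(0);
    int r1, r2;

    int main() {
        for (int i = 0; i < 1000000; ++i) {
            x.store(0, std::memory_order_relaxed);
            y.store(0, std::memory_order_relaxed);
            std::thread t1([] {
                x.store(1, std::memory_order_relaxed);
                __asm__ __volatile__ ("mfence" : : : "memory");   // swap in "sfence; lfence" to test
                r1 = y.load(std::memory_order_relaxed);
            });
            std::thread t2([] {
                y.store(1, std::memory_order_relaxed);
                __asm__ __volatile__ ("mfence" : : : "memory");   // swap in "sfence; lfence" to test
                r2 = x.load(std::memory_order_relaxed);
            });
            t1.join(); t2.join();
            if (r1 == 0 && r2 == 0)   // forbidden if the fence is a true StoreLoad barrier
                std::printf("store-load reordering at iteration %d\n", i);
        }
        return 0;
    }

    Because the window is tiny, the broken variant may need many runs (or more iterations) to show a failure, and, as noted above, it may not reproduce at all on some AMD parts.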
    
