I have written code that randomly generates two matrices from dimensions 2x2 up to 50x50. I am then recording the time it takes for each matrix multiplication from dimension 2x2 up to 50x50.
In my first tests, I started with 2×2 matrices and doubled the number of rows and columns for each test in the series, ending with 64×64 matrices.
I soon came to the same conclusion as Mike: these matrices are much too small. The overhead of setting up and joining threads consumes any speed-up which might have been gained by concurrency. So, I modified the test series to start with 64×64 matrices and end with 512×512.
I did the same with VS2013 (release mode) and got similar results.
A speed-up of 3 doesn't sound that bad (ignoring the fact that it is still far from the 8 you might expect as ideal for a hardware concurrency of 8).
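If you want a feeling for how expensive thread creation alone is on your machine, a tiny sketch like the following can help. It is my own illustration (not part of the test program below) and only times the creation and joining of threads that do no work at all:

#include <chrono>
#include <iostream>
#include <thread>

// Measures the bare cost of creating and joining threads which do nothing.
int main()
{
  const unsigned nThreads = 8, nRounds = 100;
  const auto t0 = std::chrono::high_resolution_clock::now();
  for (unsigned round = 0; round < nRounds; ++round) {
    std::thread threads[nThreads];
    for (std::thread &t : threads) t = std::thread([]{ /* no work at all */ });
    for (std::thread &t : threads) t.join();
  }
  const auto dt = std::chrono::duration_cast<std::chrono::microseconds>(
    std::chrono::high_resolution_clock::now() - t0);
  std::cout << "Average create/join cost for " << nThreads << " idle threads: "
    << dt.count() / nRounds << " us\n";
  return 0;
}

If this cost is comparable to the work handed to each thread (as it is for small matrices), the speed-up vanishes.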
While fiddling with the matrix multiplication, I got an idea for an optimization which I wanted to check as well, even beyond multi-threading: the attempt to improve cache locality.
For this, I transpose the 2nd matrix before the multiplication. The multiplication then uses a modified version of dot() (dotT()) which takes the transposition of the 2nd matrix into account. The point is that with row-major storage, walking down a column of the 2nd matrix jumps a full row length per element, while walking along a row of the transposed matrix reads consecutive elements.
I modified the above sample code accordingly and got test-single-threading-matrix-transpose.cc:
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <chrono>
#include <iomanip>
#include <iostream>
#include <vector>
template <typename VALUE>
class MatrixT {
  public:
    typedef VALUE Value;

  private:
    size_t _nRows, _nCols;
    std::vector<Value> _values;

  public:
    MatrixT(size_t nRows, size_t nCols, Value value = (Value)0):
      _nRows(nRows), _nCols(nCols), _values(_nRows * _nCols, value)
    { }
    ~MatrixT() = default;

    size_t getNumCols() const { return _nCols; }
    size_t getNumRows() const { return _nRows; }

    Value* operator[](size_t i) { return &_values[0] + i * _nCols; }
    const Value* operator[](size_t i) const { return &_values[0] + i * _nCols; }
};
template <typename VALUE>
VALUE dot(const MatrixT<VALUE> &mat1, size_t iRow, const MatrixT<VALUE> &mat2, size_t iCol)
{
  const size_t n = mat1.getNumCols();
  assert(n == mat2.getNumRows());
  VALUE sum = (VALUE)0;
  for (size_t i = 0; i < n; ++i) sum += mat1[iRow][i] * mat2[i][iCol];
  return sum;
}
template <typename VALUE>
MatrixT<VALUE> transpose(const MatrixT<VALUE> &mat)
{
  MatrixT<VALUE> matT(mat.getNumCols(), mat.getNumRows());
  for (size_t i = 0; i < mat.getNumRows(); ++i) {
    for (size_t j = 0; j < mat.getNumCols(); ++j) {
      matT[j][i] = mat[i][j];
    }
  }
  return matT;
}
template <typename VALUE>
VALUE dotT(const MatrixT<VALUE> &mat1, size_t iRow1, const MatrixT<VALUE> &matT2, size_t iRow2)
{
  const size_t n = mat1.getNumCols();
  assert(n == matT2.getNumCols());
  VALUE sum = (VALUE)0;
  for (size_t i = 0; i < n; ++i) sum += mat1[iRow1][i] * matT2[iRow2][i];
  return sum;
}
typedef std::uint16_t Value;
typedef MatrixT<Value> Matrix;

typedef std::chrono::high_resolution_clock Clock;
typedef std::chrono::microseconds MuSecs;
typedef decltype(std::chrono::duration_cast<MuSecs>(Clock::now() - Clock::now())) Time;
Time duration(const Clock::time_point &t0)
{
  return std::chrono::duration_cast<MuSecs>(Clock::now() - t0);
}
Matrix populate(size_t dim)
{
  Matrix mat(dim, dim);
  for (size_t i = 0; i < dim; ++i) {
    for (size_t j = 0; j < dim; ++j) {
      mat[i][j] = ((Matrix::Value)rand() / RAND_MAX) * 100 - 50;
    }
  }
  return mat;
}
std::vector<Time> makeTest(size_t dim)
{
  // make a test sample
  const Matrix sampleA = populate(dim);
  const Matrix sampleB = populate(dim);
  // prepare result vectors
  Matrix results2[2] = {
    Matrix(dim, dim),
    Matrix(dim, dim)
  };
  // make test
  std::vector<Time> times{
    [&]() { // single threading
      // make a copy of test sample
      const Matrix a(sampleA), b(sampleB);
      Matrix &results = results2[0];
      // remember start time
      const Clock::time_point t0 = Clock::now();
      // do experiment single-threaded
      for (size_t k = 0, n = dim * dim; k < n; ++k) {
        const size_t i = k / dim, j = k % dim;
        results[i][j] = dot(a, i, b, j);
      }
      // done
      return duration(t0);
    }(),
    [&]() { // single threading - with transposed matrix
      // make a copy of test sample
      const Matrix a(sampleA), b(sampleB);
      Matrix &results = results2[1];
      // remember start time
      const Clock::time_point t0 = Clock::now();
      const Matrix bT = transpose(b);
      // do experiment single-threaded with transposed B
      for (size_t k = 0, n = dim * dim; k < n; ++k) {
        const size_t i = k / dim, j = k % dim;
        results[i][j] = dotT(a, i, bT, j);
      }
      // done
      return duration(t0);
    }()
  };
  // check results (must be equal for any kind of computation)
  const unsigned nResults = sizeof results2 / sizeof *results2;
  for (unsigned iResult = 1; iResult < nResults; ++iResult) {
    size_t nErrors = 0;
    for (size_t i = 0; i < dim; ++i) {
      for (size_t j = 0; j < dim; ++j) {
        if (results2[0][i][j] != results2[iResult][i][j]) {
          ++nErrors;
#if 0 // def _DEBUG
          std::cerr
            << "results2[0][" << i << "][" << j << "]: "
            << results2[0][i][j]
            << " != results2[" << iResult << "][" << i << "][" << j << "]: "
            << results2[iResult][i][j]
            << "!\n";
#endif // _DEBUG
        }
      }
    }
    if (nErrors) std::cerr << nErrors << " errors in results2[" << iResult << "]!\n";
  }
  // done
  return times;
}
int main()
{
  // heat up
  std::cout << "Heat up...\n";
  for (unsigned i = 0; i < 10; ++i) makeTest(64);
  // perform tests:
  const unsigned NTrials = 10;
  for (size_t dim = 64; dim <= 512; dim *= 2) {
    std::cout << "Test for A[" << dim << "][" << dim << "] * B[" << dim << "][" << dim << "]...\n";
    // repeat NTrials times
    std::cout << "Measuring " << NTrials << " runs...\n"
      << "   "
      << " | " << std::setw(10) << "A * B"
      << " | " << std::setw(10) << "A *T B^T"
      << '\n';
    std::vector<double> sumTimes;
    for (unsigned i = 0; i < NTrials; ++i) {
      std::vector<Time> times = makeTest(dim);
      std::cout << std::setw(2) << (i + 1) << ".";
      for (const Time &time : times) {
        std::cout << " | " << std::setw(10) << time.count();
      }
      std::cout << '\n';
      sumTimes.resize(times.size(), 0.0);
      for (size_t j = 0; j < times.size(); ++j) sumTimes[j] += times[j].count();
    }
    std::cout << "Average Values:\n   ";
    for (const double &sumTime : sumTimes) {
      std::cout << " | "
        << std::setw(10) << std::fixed << std::setprecision(1)
        << sumTime / NTrials;
    }
    std::cout << '\n';
    std::cout << "Ratio:\n   ";
    for (const double &sumTime : sumTimes) {
      std::cout << " | "
        << std::setw(10) << std::fixed << std::setprecision(3)
        << sumTime / sumTimes.front();
    }
    std::cout << "\n\n";
  }
  // done
  return 0;
}
I compiled and ran it again on cygwin64 (on Windows 10):
It achieves speed-ups similar to the better of the above multi-threading attempts, but with a single core.
The additional effort of transposing the 2nd matrix (which is included in the measurement) more than pays for itself. This isn't that surprising, because the multiplication performs many more read accesses (which now touch consecutive bytes) than the one-time effort of constructing and writing the transposed matrix: roughly n³ element reads of B versus n² writes for B^T.
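To put a rough number on that ratio, here is a minimal back-of-the-envelope sketch (again my own illustration, not part of the measured program): it only counts the element reads of B in a naive n×n multiplication against the extra writes needed to build the transposed matrix once.

#include <cstddef>
#include <iostream>

// Counts memory accesses only; no actual multiplication is performed.
int main()
{
  for (std::size_t n = 64; n <= 512; n *= 2) {
    const std::size_t readsOfB = n * n * n;    // one read of B per term of each of the n*n dot products
    const std::size_t writesForBT = n * n;     // one write per element when constructing B^T
    std::cout << "n = " << n
      << ": reads of B = " << readsOfB
      << ", writes for B^T = " << writesForBT
      << " (ratio " << readsOfB / writesForBT << ":1)\n";
  }
  return 0;
}

For n = 512 that is a ratio of 512:1, so the one-time transposition is practically free compared to the multiplication it accelerates.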