Multithreading computation of Mandelbrot set

问题

I have created a program which creates a Mandelbrot set. Now I'm trying to make it multithreaded.

// mandelbrot.cpp
// compile with: g++ -std=c++11 mandelbrot.cpp -o mandelbrot
// view output with: eog mandelbrot.ppm

#include <fstream>
#include <complex> // if you make use of complex number facilities in C++
#include <iostream>
#include <cstdlib>
#include <thread>
#include <mutex>
#include <vector>


using namespace std;

template <class T> struct RGB { T r, g, b; };

template <class T>
class Matrix {
public:
Matrix(const size_t rows, const size_t cols) : _rows(rows), _cols(cols) {
    _matrix = new T*[rows];
    for (size_t i = 0; i < rows; ++i) {
        _matrix[i] = new T[cols];
    }
}
Matrix(const Matrix &m) : _rows(m._rows), _cols(m._cols) {
    _matrix = new T*[m._rows];
    for (size_t i = 0; i < m._rows; ++i) {
        _matrix[i] = new T[m._cols];
        for (size_t j = 0; j < m._cols; ++j) {
            _matrix[i][j] = m._matrix[i][j];
        }
    }
}
~Matrix() {
    for (size_t i = 0; i < _rows; ++i) {
        delete [] _matrix[i];
    }
    delete [] _matrix;
}
T *operator[] (const size_t nIndex)
{
    return _matrix[nIndex];
}
size_t width() const { return _cols; }
size_t height() const { return _rows; }
protected:
size_t _rows, _cols;
T **_matrix;
};

// Portable PixMap image
class PPMImage : public Matrix<RGB<unsigned char> >
{
public:
   unsigned int size; 

PPMImage(const size_t height, const size_t width) : Matrix(height, width) { }
void save(const std::string &filename)
{
    std::ofstream out(filename, std::ios_base::binary);
    out <<"P6" << std::endl << _cols << " " << _rows << std::endl << 255 << std::endl;
    for (size_t y=0; y<_rows; y++)
        for (size_t x=0; x<_cols; x++) 
            out << _matrix[y][x].r << _matrix[y][x].g << _matrix[y][x].b;
}    
};

/*Draw mandelbrot according to the provided parameters*/
void draw_Mandelbrot(PPMImage & image, const unsigned width, const unsigned height, double cxmin, double cxmax, double cymin, double cymax,unsigned int max_iterations)                         
{

for (std::size_t ix = 0; ix < width; ++ix)
    for (std::size_t iy = 0; iy < height; ++iy)
    {
        std::complex<double> c(cxmin + ix / (width - 1.0)*(cxmax - cxmin), cymin + iy / (height - 1.0)*(cymax - cymin));
        std::complex<double> z = 0;
        unsigned int iterations;

        for (iterations = 0; iterations < max_iterations && std::abs(z) < 2.0; ++iterations)
            z = z*z + c;

        image[iy][ix].r = image[iy][ix].g = image[iy][ix].b = iterations;

    }
}

int main()
{
const unsigned width = 1600;
const unsigned height = 1600;

PPMImage image(height, width);


int parts = 8;

std::vector<int>bnd (parts, image.size);

std::thread *tt = new std::thread[parts - 1];

time_t start, end;
time(&start);
//Lauch parts-1 threads
for (int i = 0; i < parts - 1; ++i) {
    tt[i] = std::thread(draw_Mandelbrot,ref(image), width, height, -2.0, 0.5, -1.0, 1.0, 10);
}

//Use the main thread to do part of the work !!!
for (int i = parts - 1; i < parts; ++i) {
    draw_Mandelbrot(ref(image), width, height, -2.0, 0.5, -1.0, 1.0, 10);
}

//Join parts-1 threads
for (int i = 0; i < parts - 1; ++i)
    tt[i].join();

time(&end);
std::cout << difftime(end, start) << " seconds" << std::endl;


image.save("mandelbrot.ppm");

delete[] tt;

return 0;
}

Now every thread draws the complete fractal (look in main()). How can I let the threads draw different parts of the fractal?

回答1:

You're making this (quite a lot) harder than it needs to be. This is the sort of task to which OpenMP is almost perfectly suited. For this task it gives almost perfect scaling with a bare minimum of effort.

I modified your draw_mandelbrot by inserting a pragma before the outer for loop:

#pragma omp parallel for
for (int ix = 0; ix < width; ++ix)
    for (int iy = 0; iy < height; ++iy)

Then I simplified your main down to:

int main() {
    const unsigned width = 1600;
    const unsigned height = 1600;

    PPMImage image(height, width);

    clock_t start = clock();
    draw_Mandelbrot(image, width, height, -2.0, 0.5, -1.0, 1.0, 10);
    clock_t stop = clock();

    std::cout << (double(stop - start) / CLOCKS_PER_SEC) << " seconds\n";

    image.save("mandelbrot.ppm");

    return 0;
}

On my (fairly slow) machine, your original code ran in 4.73 seconds. My modified code ran in 1.38 seconds. That's an improvement of 3.4x out of code that's nearly indistinguishable from a trivial single-threaded version.

Just for what it's worth, I did a bit more rewriting to get this:

// mandelbrot.cpp
// compile with: g++ -std=c++11 mandelbrot.cpp -o mandelbrot
// view output with: eog mandelbrot.ppm

#include <fstream>
#include <complex> // if you make use of complex number facilities in C++
#include <iostream>
#include <cstdlib>
#include <thread>
#include <mutex>
#include <vector>

using namespace std;

template <class T> struct RGB { T r, g, b; };

template <class T>
struct Matrix
{
    std::vector<T> data;
    size_t rows;
    size_t cols;

    class proxy {
        Matrix &m;
        size_t index_1;
    public:
        proxy(Matrix &m, size_t index_1) : m(m), index_1(index_1) { }

        T &operator[](size_t index) { return m.data[index * m.rows + index_1]; }
    };

    class const_proxy {
        Matrix const &m;
        size_t index_1;
    public:
        const_proxy(Matrix const &m, size_t index_1) : m(m), index_1(index_1) { }

        T const &operator[](size_t index) const { return m.data[index * m.rows + index_1]; }
    };


public:
    Matrix(size_t rows, size_t cols) : data(rows * cols), rows(rows), cols(cols) { }

    proxy operator[](size_t index) { return proxy(*this, index); }
    const_proxy operator[](size_t index) const { return const_proxy(*this, index); }

};

template <class T>
std::ostream &operator<<(std::ostream &out, Matrix<T> const &m) {
    out << "P6" << std::endl << m.cols << " " << m.rows << std::endl << 255 << std::endl;
    for (size_t y = 0; y < m.rows; y++)
        for (size_t x = 0; x < m.cols; x++) {
            T pixel = m[y][x];
            out << pixel.r << pixel.g << pixel.b;
        }
    return out;
}

/*Draw Mandelbrot according to the provided parameters*/
template <class T>
void draw_Mandelbrot(T & image, const unsigned width, const unsigned height, double cxmin, double cxmax, double cymin, double cymax, unsigned int max_iterations) {

#pragma omp parallel for
    for (int ix = 0; ix < width; ++ix)
        for (int iy = 0; iy < height; ++iy)
        {
            std::complex<double> c(cxmin + ix / (width - 1.0)*(cxmax - cxmin), cymin + iy / (height - 1.0)*(cymax - cymin));
            std::complex<double> z = 0;
            unsigned int iterations;

            for (iterations = 0; iterations < max_iterations && std::abs(z) < 2.0; ++iterations)
                z = z*z + c;

            image[iy][ix].r = image[iy][ix].g = image[iy][ix].b = iterations;

        }
}

int main() {
    const unsigned width = 1600;
    const unsigned height = 1600;

    Matrix<RGB<unsigned char>> image(height, width);

    clock_t start = clock();
    draw_Mandelbrot(image, width, height, -2.0, 0.5, -1.0, 1.0, 255);
    clock_t stop = clock();

    std::cout << (double(stop - start) / CLOCKS_PER_SEC) << " seconds\n";

    std::ofstream out("mandelbrot.ppm", std::ios::binary);
    out << image;

    return 0;
}

On my machine, this code runs in about 0.5 to 0.6 seconds.

As to why I made these changes: mostly to make it faster, cleaner, and simpler. Your Matrix class allocated a separate block of memory for each row (or perhaps column--didn't pay very close of attention). This allocates one contiguous block of the entire matrix instead. This eliminates a level of indirection to get to the data, and increases locality of reference, thus improving cache usage. It also reduces the total amount of data used.

Changing from using time to using clock to do the timing was to measure CPU time instead of wall time (and typically improve precision substantially as well).

Getting rid of the PPMImage class was done simply because (IMO) having a PPImage class that derives from a Matrix class just doesn't make much (if any) sense. I suppose it works (for a sufficiently loose definition of "work") but it doesn't strike me as good design. If you insist on doing it at all, it should at least be private derivation, because you're just using the Matrix as a way of implementing your PPMImage class, not (at least I certainly hope not) trying to make assertions about properties of PPM images.

If, for whatever, reason, you decide to handle the threading manually, the obvious way of dividing the work up between threads would still be by looking at the loops inside of draw_mandelbrot. The obvious one would be to leave your outer loop alone, but send the computation for each iteration off to a thread pool: for (int ix = 0; ix < width; ++ix) compute_thread(ix);

where the body of compute_thread is basically this chunk of code:

        for (int iy = 0; iy < height; ++iy)
        {
            std::complex<double> c(cxmin + ix / (width - 1.0)*(cxmax - cxmin), cymin + iy / (height - 1.0)*(cymax - cymin));
            std::complex<double> z = 0;
            unsigned int iterations;

            for (iterations = 0; iterations < max_iterations && std::abs(z) < 2.0; ++iterations)
                z = z*z + c;

            image[iy][ix].r = image[iy][ix].g = image[iy][ix].b = iterations;

        }

There would obviously be a little work involved in passing the correct data to the compute thread (each thread should be pass a reference to a slice of the resulting picture), but that would be an obvious and fairly clean place to divide things up. In particular it divides the job up into enough tasks that you semi-automatically get pretty good load balancing (i.e., you can keep all the cores busy) but large enough that you don't waste massive amounts of time on communication and synchronization between the threads.

As to the result, with the number of iterations set to 255, I get the following (scaled to 25%):

enter image description here

...which is pretty much as I'd expect.

回答2:

One of the big issues with this approach is that different regions take different amounts of time to calculate.

A more general approach is.

Start 1 source thread.
Start N worker threads.
Start 1 sink thread.
Create 2 thread safe queues (call them the source queue and the sink queue).
Divide the image into M (many more than N) pieces.
The source thread pushes pieces into the source queue
The workers pull piecse from the source queue, convert the pieces into result fragments, and pushes those fragments into the sink queue.
The sink thread takes fragments from the sink queue and combines them into the final image.

By dividing up the work this way, all the worker threads will be busy all the time.

回答3:

You can divide the fractal into pieces by divide the start and end of the fractal with the screen dimension:

   $this->stepsRe = (double)((($this->startRe * -1) + ($this->endeRe)) / ($this->size_x-1));
   $this->stepsIm = (double)((($this->startIm * -1) + ($this->endeIm)) / ($this->size_y-1));

来源：https://stackoverflow.com/questions/30070348/multithreading-computation-of-mandelbrot-set

标签

c++

multithreading

fractals

mandelbrot