How long does thread creation and termination take under Windows?

前端未结

关注

 3  666

I\'ve split a complex array processing task into a number of threads to take advantage of multi-core processing and am seeing great benefits. Currently, at the start of the

相关标签:

3条回答

猫巷女王i

2020-12-15 18:55
Some advices:
1. If you have lots of work items to process (or there aren't too many, but you have to repeat the whole process time to time), make sure you use some kind of thread pooling. This way you won't have to recreate the threads all the time, and your original question won't matter any more: the threads will be created only one time. I use the QueueUserWorkItem API directly (since my application doesn't use MFC), even that one is not too painful. But in MFC you may have higher level facilities to take advantage of the thread pooling. (http://support.microsoft.com/kb/197728)
2. Try to select the optimal amount of work for one work item. Of course this depends on the feature of your software: is it supposed to be real time, or it's a number crunching in the background? If it's not real-time, then too small amount of work per work item can hurt performance: by increasing the proportion of overhead of the work distribution across threads.
3. Since hardware configurations can be very different, if your end-users can have various machines you can include some calibration routines asynchronously during the start of the software, so you can estimate how much time certain operation takes. The result of the calibration can be an input for a better work size setting later for the real calculations.
0 讨论(0)
发布评论:

提交评论
- 加载中...

情深已故

2020-12-15 18:59

I wrote this quite a while ago when I had the same basic question (along with another that will be obvious). I've updated it to show a little more about not only how long it takes to create threads, but how long it takes for the threads to start executing:

#include <windows.h>
#include <iostream>
#include <time.h>
#include <vector>

const int num_threads = 32;

const int switches_per_thread = 100000;

DWORD __stdcall ThreadProc(void *start) {
    QueryPerformanceCounter((LARGE_INTEGER *) start);
    for (int i=0;i<switches_per_thread; i++)
        Sleep(0);
    return 0;
}

int main(void) {
    HANDLE threads[num_threads];
    DWORD junk;

    std::vector<LARGE_INTEGER> start_times(num_threads);

    LARGE_INTEGER l;
    QueryPerformanceCounter(&l);

    clock_t create_start = clock();
    for (int i=0;i<num_threads; i++)
        threads[i] = CreateThread(NULL, 
                            0, 
                            ThreadProc, 
                            (void *)&start_times[i], 
                            0, 
                            &junk);
    clock_t create_end = clock();

    clock_t wait_start = clock();
    WaitForMultipleObjects(num_threads, threads, TRUE, INFINITE);
    clock_t wait_end = clock();

    double create_millis = 1000.0 * (create_end - create_start) / CLOCKS_PER_SEC / num_threads;
    std::cout << "Milliseconds to create thread: " << create_millis << "\n";
    double wait_clocks = (wait_end - wait_start);
    double switches = switches_per_thread*num_threads;
    double us_per_switch = wait_clocks/CLOCKS_PER_SEC*1000000/switches;
    std::cout << "Microseconds per thread switch: " << us_per_switch;

    LARGE_INTEGER f;
    QueryPerformanceFrequency(&f);

    for (auto s : start_times) 
        std::cout << 1000.0 * (s.QuadPart - l.QuadPart) / f.QuadPart <<" ms\n";

    return 0;
}

Sample results:

Milliseconds to create thread: 0.015625
Microseconds per thread switch: 0.0479687

The first few thread start times look like this:

0.0632517 ms
0.117348 ms
0.143703 ms
0.18282 ms
0.209174 ms
0.232478 ms
0.263826 ms
0.315149 ms
0.324026 ms
0.331516 ms
0.3956 ms
0.408639 ms
0.4214 ms

Note that although these happen to be monotonically increasing, that's not guaranteed (though there is definitely a trend in that general direction).

When I first wrote this, the units I used made more sense -- on a 33 MHz 486, those results weren't tiny fractions like this. :-) I suppose someday when I'm feeling ambitious, I should rewrite this to use std::async to create the threads and std::chrono to do the timing, but...

0 讨论(0)

暗喜

2020-12-15 19:03

I was curious about the modern Windows scheduler, so I wrote another test app. I made my best attempt at measuring thread stop time by optionally spinning up a watching thread.

// Tested on Windows 10 v1903 with E5-1660 v3 @ 3.00GHz, 8 Core(s), 16 Logical Processor(s)
// Times are (min, average, max) in milliseconds.

threads: 100, iterations: 1, testStop: true
Start(0.1083, 5.3665, 13.7103) - Stop(0.0341, 1.5122, 11.0660)

threads: 32, iterations: 3, testStop: true
Start(0.1349, 1.6423, 3.5561) - Stop(0.0396, 0.2877, 3.5195)
Start(0.1093, 1.4992, 3.3982) - Stop(0.0351, 0.2734, 2.0384)
Start(0.1159, 1.5345, 3.5754) - Stop(0.0378, 0.4938, 3.2216)

threads: 4, iterations: 3, testStop: true
Start(0.2066, 0.3553, 0.4598) - Stop(0.0410, 0.1534, 0.4630)
Start(0.2769, 0.3740, 0.4994) - Stop(0.0414, 0.1028, 0.2581)
Start(0.2342, 0.3602, 0.5650) - Stop(0.0497, 0.2199, 0.3620)

threads: 4, iterations: 3, testStop: false
Start(0.1698, 0.2492, 0.3713)
Start(0.1473, 0.2477, 0.4103)
Start(0.1756, 0.2909, 0.4295)

threads: 1, iterations: 10, testStop: false
Start(0.1910, 0.1910, 0.1910)
Start(0.1685, 0.1685, 0.1685)
Start(0.1564, 0.1564, 0.1564)
Start(0.1504, 0.1504, 0.1504)
Start(0.1389, 0.1389, 0.1389)
Start(0.1234, 0.1234, 0.1234)
Start(0.1550, 0.1550, 0.1550)
Start(0.2800, 0.2800, 0.2800)
Start(0.1587, 0.1587, 0.1587)
Start(0.1877, 0.1877, 0.1877)

Source:

#include <windows.h>
#include <iostream>
#include <vector>
#include <chrono>
#include <iomanip>

using namespace std::chrono;

struct Test
{
    HANDLE Thread = { 0 };
    time_point<steady_clock> Creation;
    time_point<steady_clock> Started;
    time_point<steady_clock> Stopped;
};

DWORD __stdcall ThreadProc(void* lpParamater) {
    auto test = (Test*)lpParamater;
    test->Started = steady_clock::now();
    return 0;
}

DWORD __stdcall TestThreadsEnded(void* lpParamater) {
    auto& tests = *(std::vector<Test>*)lpParamater;

    std::size_t finished = 0;
    while (finished < tests.size())
    {
        for (auto& test : tests)
        {
            if (test.Thread != NULL && WaitForSingleObject(test.Thread, 0) == WAIT_OBJECT_0)
            {
                test.Stopped = steady_clock::now();
                test.Thread = NULL;
                finished++;
            }
        }
    }

    return 0;
}

duration<double, std::milli> diff(time_point<steady_clock> start, time_point<steady_clock> stop)
{
    return stop - start;
}

struct Stats
{
    double min;
    double average;
    double max;
};

Stats stats(const std::vector<double>& durations)
{
    Stats stats = { 1000, 0, 0 };

    for (auto& duration : durations)
    {
        stats.min = duration < stats.min ? duration : stats.min;
        stats.max = duration > stats.max ? duration : stats.max;
        stats.average += duration;
    }

    stats.average /= durations.size();

    return stats;
}

void TestScheduler(const int threadCount, const int iterations, const bool testStop)
{
    std::cout << "\nthreads: " << threadCount << ", iterations: " << iterations << ", testStop: " << (testStop ? "true" : "false") << "\n";

    for (auto i = 0; i < iterations; i++)
    {
        std::vector<Test> tests(threadCount);
        HANDLE testThreadsEnded = NULL;

        if (testStop)
        {
            testThreadsEnded = CreateThread(NULL, 0, TestThreadsEnded, (void*)& tests, 0, NULL);
        }

        for (auto& test : tests)
        {
            test.Creation = steady_clock::now();
            test.Thread = CreateThread(NULL, 0, ThreadProc, (void*)& test, 0, NULL);
        }

        if (testStop)
        {
            WaitForSingleObject(testThreadsEnded, INFINITE);
        }
        else
        {
            std::vector<HANDLE> threads;
            for (auto& test : tests) threads.push_back(test.Thread);
            WaitForMultipleObjects((DWORD)threads.size(), threads.data(), TRUE, INFINITE);
        }

        std::vector<double> startDurations;
        std::vector<double> stopDurations;
        for (auto& test : tests)
        {
            startDurations.push_back(diff(test.Creation, test.Started).count());
            stopDurations.push_back(diff(test.Started, test.Stopped).count());
        }

        auto startStats = stats(startDurations);
        auto stopStats = stats(stopDurations);

        std::cout << std::fixed << std::setprecision(4);
        std::cout << "Start(" << startStats.min << ", " << startStats.average << ", " << startStats.max << ")";
        if (testStop)
        {
            std::cout << " - ";
            std::cout << "Stop(" << stopStats.min << ", " << stopStats.average << ", " << stopStats.max << ")";
        }
        std::cout << "\n";
    }
}

int main(void)
{
    TestScheduler(100, 1, true);
    TestScheduler(32, 3, true);
    TestScheduler(4, 3, true);
    TestScheduler(4, 3, false);
    TestScheduler(1, 10, false);
    return 0;
}

0 讨论(0)