Multi-threading performance much worse on Windows 10 than Linux

问题

I ported a multi-threaded Linux application to Windows and am testing it on a server running Windows 10 Pro. The performance of the Windows version is abysmal compared to the performance of the Linux version running on the same dual-boot hardware. I simplified the code to a small multi-threaded example that exhibits the same symptoms. I am hoping that the SO community can provide some insight as to why there are such performance differences between Windows and Linux for this application, and suggestions on how to remedy the problem.

The machine I'm testing on has dual Intel Xeon Gold 6136 CPUs (24/48 physical/logical cores) @3.0 GHz (Turbo-boost to 3.6 GHz) with 128 GB of memory. The machine is set up to dual-boot CentOS or Windows 10. There is no Windows Hypervisor running (Hyper-V is disabled). NUMA is disabled. In the testing I am performing, each thread should be able to run on a separate core; there are no other processor-consuming applications running.

The application performs complex transformations to convert input data sets of ~15 MB to output data of ~50 MB. I wrote simplified multi-threaded tests (computation only, data movement only, etc) to narrow down the issue. A computation-only test showed no performance differences, but a data-copy scenario did. The repeatable scenario is simply to have each thread copy data from its 15 MB input buffer to its 50 MB output buffer. Each 'int' in the input buffer is written consecutively to the output buffer 3 times. Results from virtually identical Linux and Windows code for 100 iterations with N threads are shown below:

          Windows (or cygwin)        Linux (native)
Threads   Time (msec)                Time (msec)
1         4200                       3000
2         4020                       2300
3         4815                       2300
4         6700                       2300
5         8900                       2300
6         14000                      2300
7         16500                      2300
8         21000                      2300
12        39000                      2500
16        75000                      3000
24        155000                     4000

The times above are the processing time in the worker threads. The results do not include any time for allocating memory or starting the threads. It seems that threads are running independently under Linux but are not under Windows 10.

The full C code I used for Windows testing is here:

//
// Thread test program
//
// To compile for Windows:
//      vcvars64.bat
//      cl /Ox -o windowsThreadTest windowsThreadTest.c
//

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <windows.h>
#include <process.h>

#define __func__ __FUNCTION__

//
// Global data
//
// One HANDLE per worker thread; the array is allocated by main() with
// numThreads entries and filled with the values returned by CreateThread().
HANDLE *threadHandleArray = NULL;
// One thread id per worker thread; CreateThread() stores each id here.
DWORD *threadIdArray = NULL;

//
// Time keeping
//
// Both arrays are allocated by main() with one slot per logical processor
// and are indexed by processor number.
//
// PCFreq[n]:       performance-counter ticks per millisecond, stored by StartCounter(n).
double *PCFreq = NULL;
// CounterStart[n]: QueryPerformanceCounter() reading captured by StartCounter(n);
//                  GetCounter() reports time relative to this baseline.
__int64 *CounterStart = NULL;

//
// StartCounter
//
// Records the timing baseline for one processor slot:
//   PCFreq[whichProcessor]       <- performance-counter ticks per millisecond
//   CounterStart[whichProcessor] <- current performance-counter value
//
// whichProcessor must be a valid index into both global arrays; the arrays'
// length is not known here, so the caller is responsible for the range.
// Prints a message and leaves the slot untouched if either array is missing
// or the performance frequency cannot be queried.
//
void StartCounter(int whichProcessor)
{
    LARGE_INTEGER li;

    if ( !PCFreq )
    {
        printf("No freq array\n");
        return;
    }

    // CounterStart is written below, so it needs the same guard as PCFreq.
    if ( !CounterStart )
    {
        printf("No counter array\n");
        return;
    }

    if(!QueryPerformanceFrequency(&li))
    {
        printf("QueryPerformanceFrequency failed!\n");
        return;
    }

    // Frequency is in ticks/second; divide by 1000 so GetCounter() yields msec.
    PCFreq[whichProcessor] = ((double)(li.QuadPart))/1000.0;

    QueryPerformanceCounter(&li);
    CounterStart[whichProcessor] = li.QuadPart;
}

//
// GetCounter
//
// Returns the number of milliseconds elapsed since StartCounter() was called
// for the processor the calling thread is currently running on, or 0.0 when
// the timing arrays are missing or that processor's baseline is still zero.
//
// NOTE(review): the baseline is looked up by the *current* processor number,
// so two calls made by the same thread may use different baselines if the
// scheduler moved the thread in between -- confirm this is acceptable for
// the precision required.
//
double GetCounter(void)
{
    LARGE_INTEGER li;
    DWORD whichProcessor = GetCurrentProcessorNumber();

    // Both arrays are dereferenced below; bail out if either is absent.
    if ( !PCFreq || !CounterStart )
        return 0.0;

    if ( CounterStart[whichProcessor] == 0 )
        return 0.0;

    QueryPerformanceCounter(&li);
    return ((double)(li.QuadPart - CounterStart[whichProcessor])) / PCFreq[whichProcessor];
}


// Per-worker control block: main() fills in the inputs, the worker thread
// fills in the results, and main() reads them back after the thread exits.
typedef struct
{
    int retVal;                 // set to -1 by main() before launch; worker stores rwtest()'s return value
    int instance;               // index of this worker, assigned by main()
    long myTid;                 // value of GetCurrentThreadId() as seen by the worker
    int verbose;                // non-zero enables the worker's "done" message
    double startTime;           // GetCounter() reading (msec) taken just before the copy
    double elapsedTime;         // msec spent in rwtest() during the most recent iteration
    double totalElapsedTime;    // sum of elapsedTime over all iterations, accumulated by main()
    struct {
        unsigned intsToCopy;    // number of ints to read from inData
        int *inData;            // source buffer
        int *outData;           // destination buffer; rwtest() writes 3 * intsToCopy ints
    } rwInfo;
} info_t;

//
// rwtest
//
// Copies every element of inData into outData three times in a row, i.e.
//   outData[3*i] == outData[3*i+1] == outData[3*i+2] == inData[i]
//
// intsToCopy  number of ints to read from inData
// inData      source buffer holding at least intsToCopy ints
// outData     destination buffer holding at least 3 * intsToCopy ints
//
// Always returns 0.
//
int rwtest( unsigned intsToCopy, int *inData, int *outData)
{
    unsigned i;
    // The output index reaches 3 * intsToCopy, which does not fit in an
    // unsigned once intsToCopy exceeds UINT_MAX / 3; size_t keeps it from
    // wrapping back into the start of the buffer on 64-bit builds.
    size_t j = 0;

    for ( i = 0; i < intsToCopy; i++ )
    {
        outData[j] = inData[i];
        outData[j+1] = inData[i];
        outData[j+2] = inData[i];
        j += 3;
    }
    return 0;
}

//
// workerProc
//
// Thread entry point. workerInfoPtr points at this worker's info_t.
// The worker records its thread id, raises its own priority, runs rwtest()
// on the buffers described by rwInfo, and stores the result and the elapsed
// time (msec, per GetCounter()) back into the info_t.
//
// Always returns 0 as the thread exit code; the test result is reported
// through infoPtr->retVal.
//
DWORD WINAPI workerProc(LPVOID workerInfoPtr)
{
    info_t *infoPtr = (info_t *)workerInfoPtr;
    double endTime;

    infoPtr->myTid = (long)GetCurrentThreadId();

    // Use the pseudo-handle for the calling thread rather than the global
    // handle array: the creator stores CreateThread()'s result only after
    // the call returns, so this thread could otherwise read a slot that has
    // not been written yet.
    SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_HIGHEST);

    // record start time
    infoPtr->startTime = GetCounter();

    // Run the test
    infoPtr->retVal = rwtest( infoPtr->rwInfo.intsToCopy, infoPtr->rwInfo.inData, infoPtr->rwInfo.outData );

    // end time
    endTime = GetCounter();
    infoPtr->elapsedTime = endTime - infoPtr->startTime;

    if ( infoPtr->verbose )
        printf("(%04lx): done\n", (unsigned long)infoPtr->myTid);   // myTid is a long

    return 0;
}

//
// Main Test Program
//

int main(int argc, char **argv)
{

    int i, j, verbose=0, loopLimit;
    unsigned size;
    unsigned int numThreads;
    info_t *w_info = NULL;
    int numVirtualCores;
    SYSTEM_INFO sysinfo;
    GetSystemInfo(&sysinfo);

    if ( argc != 4 )
    {
        printf("windowsThreadTest <numLoops> <numThreads> <Input size in MB>\n");
        return -1;
    }

    numVirtualCores = sysinfo.dwNumberOfProcessors;
    printf("%s: There are %d processors\n", __func__, numVirtualCores);

    // Setup Timing
    PCFreq = (double *)malloc(numVirtualCores * sizeof(double));
    CounterStart = (__int64 *)malloc(numVirtualCores * sizeof(__int64));
    if (!PCFreq || !CounterStart)
        goto free_and_exit;

    for ( i = 0; i < numVirtualCores; i++)
        StartCounter(i);

    //
    // Process  input args
    //
    loopLimit = atoi( argv[1] );
    numThreads = atoi( argv[2] );
    size = atoi( argv[3] ) * 1024 * 1024;

    //
    // Setup data array for each thread
    //
    w_info = (info_t *)malloc( numThreads * sizeof(info_t) );
    if ( !w_info )
    {
        printf("Couldn't allocate w_info of size %zd, numThreads=%d\n", sizeof(info_t), numThreads);
        goto free_and_exit;
    }
    memset( w_info, 0, numThreads * sizeof(info_t) );

    //
    // Thread Handle Array
    //
    threadHandleArray = (HANDLE *)malloc( numThreads * sizeof(HANDLE) );
    if ( !threadHandleArray )
    {
        printf("Couldn't allocate handleArray\n");
        goto free_and_exit;
    }

    //
    // Thread ID Array
    //
    threadIdArray = (DWORD *)malloc( numThreads * sizeof(DWORD) );
    if ( !threadIdArray )
    {
        printf("Couldn't allocate IdArray\n");
        goto free_and_exit;
    }

    //
    // Run the test
    //
    printf("Read/write testing... threads %d loops %lu input size %u \n", numThreads, loopLimit, size);

    for ( j = 0; j < loopLimit; j++ )
    {
        //
        // Set up the data for the threads
        //
        for ( i = 0; i < numThreads; i++ )
        {
            int idx;
            int *inData;
            int *outData;
            unsigned inSize;
            unsigned outSize;

            inSize = size;          // in MB
            outSize = size * 3;     // in MB

            //
            // Allocate input buffer
            //
            inData = (int *) malloc( inSize );
            if ( !inData )
            {
                printf("Error allocating inData of size %zd\n", inSize * sizeof(char));
                goto free_and_exit;
            }
            else
            {
                if ( verbose )
                    printf("Allocated inData of size %zd\n", inSize * sizeof(char));
            }

            //
            // Allocate output buffer 3x the size of the input buf
            //
            outData = (int *) malloc( outSize * 3 );
            if ( !outData )
            {
                printf("Error allocating outData of size %zd\n", outSize * sizeof(char));
                goto free_and_exit;
            }
            else
            {
                if ( verbose )
                    printf("Allocated outData of size %zd\n", outSize * sizeof(char));
            }

            //
            // Put some data into input buffer
            //
            w_info[i].rwInfo.intsToCopy = inSize/sizeof(int);

            for ( idx = 0; idx < w_info[i].rwInfo.intsToCopy; idx++)
                inData[idx] = idx;

            w_info[i].rwInfo.inData = inData;
            w_info[i].rwInfo.outData = outData;

            w_info[i].verbose = verbose;
            w_info[i].instance = i;
            w_info[i].retVal = -1;
        }

        //
        // Start the threads
        //
        for ( i = 0; i < numThreads; i++ )
        {
            threadHandleArray[i] = CreateThread( NULL, 0, workerProc, &w_info[i], 0, &threadIdArray[i] );
            if ( threadHandleArray[i] == NULL )
            {
                fprintf(stderr, "Error creating thread %d\n", i);
                return 1;
            }
        }

        //
        // Wait until all threads have terminated.
        //
        WaitForMultipleObjects( numThreads, threadHandleArray, TRUE, INFINITE );

        //
        // Check the return values
        //
        for ( i = 0; i < numThreads; i++ )
        {
            if ( w_info[i].retVal < 0 )
            {
                printf("Error return from thread %d\n", i);
                goto free_and_exit;
            }
            if ( verbose )
                printf("Thread %d, tid %x %f msec\n", i, (unsigned)w_info[i].myTid, w_info[i].elapsedTime);
            w_info[i].totalElapsedTime += w_info[i].elapsedTime;
        }

        //
        // Free up the data from this iteration
        //
        for ( i = 0; i < numThreads; i++ )
        {
            free( w_info[i].rwInfo.inData );
            free( w_info[i].rwInfo.outData );
            CloseHandle( threadHandleArray[i] );
        }
    }

    //
    // All done, print out cumulative time spent in worker routine
    //
    for ( i = 0; i < numThreads; i++ )
    {
        printf("Thread %d, loops %d %f msec\n", i, j, w_info[i].totalElapsedTime);
    }

free_and_exit:

    if ( threadHandleArray )
        free( threadHandleArray );

    if ( threadIdArray )
        free( threadIdArray );

    if ( PCFreq )
        free( PCFreq );

    if ( CounterStart )
        free( CounterStart );

    if ( w_info )
        free( w_info );

    return 0;
}

The code above was easily changed to utilize pthreads, compiling with the command line 'gcc -O3 -o pthreadTestLinux pthreadTest.c' to obtain the Linux results described above (I can post if necessary). If compiled on Windows with gcc in a cygwin environment, the results mirror those using the Windows sample code.

I've experimented with various BIOS settings, raising the thread priority, pre-allocated thread pools, etc with no change in the performance. I don't think this is a case of false-sharing due to the fact that the Linux version displays radically different performance with virtually identical code. I'm wondering if there is something in how I'm compiling. I am using the 64-bit toolchain.

Any ideas?

回答1:

I've seen similar issues with Cygwin apps on multicore/multiprocessor machines. As far as I know, this is still an unsolved problem in Cygwin.

One thing I noticed, and you can try, is that pinning the process to a single CPU may dramatically improve its performance (but obviously will also limit the ability to take advantage of multicore and multithread parallelism). You can pin the process to a single CPU by using Windows task manager to set the process affinity to just one CPU/core.

If doing so improves the performance of a single thread significantly, then you're seeing the same problem I've noticed. And, I don't believe it's a problem with your code then, but a problem with Cygwin.

回答2:

I was curious to see how Windows performance compared to Linux performance for this multi-threaded memory-transformation issue in golang, so I ported the code, keeping it as close to the original as possible, and then ran a few of the same performance tests on a similar hardware platform.

Unlike the results seen in the posted question, the golang code did not blow up as the number of simultaneous operations increased. The corresponding performance chart is:

Num Threads      Time in Process
    1                 4000
    2                 4100
    4                 4200
    6                 3600
    12                3600
    16                3800
    24                3700

These results are significantly slower than what you show in the C code running on Linux.

Not sure if any of this is helpful, but it looks like there is a general issue with Windows 10 that causes multi-threaded performance problems when doing some memory operations; there also seems to be a correlation with the performance of the C code when compiled by both cl and gcc (cygwin), as you describe in your question.

The golang code is:

package main

import "fmt"
import "os"
import "time"
import "strconv"


// rwtest writes each of the first intsToCopy elements of *inData into
// *outData three times in a row, so *outData must hold at least
// 3*intsToCopy elements.
func rwtest(intsToCopy int, inData *[]int, outData *[]int) {
    // src walks the input one element at a time; dst advances three slots
    // per input element.
    for src, dst := 0, 0; src < intsToCopy; src, dst = src+1, dst+3 {
        (*outData)[dst] = (*inData)[src]
        (*outData)[dst+1] = (*inData)[src]
        (*outData)[dst+2] = (*inData)[src]
    }
}


// workerProc runs numLoops iterations of the copy test. Every iteration
// allocates a fresh input slice of dataSize ints and an output slice three
// times that long, and times only the rwtest call. When all iterations are
// done it prints the accumulated duration and sends 0 on reportChan to
// signal completion.
func workerProc(threadNum int, reportChan chan int, numLoops int, dataSize int) {
    var total time.Duration

    for iter := 0; iter < numLoops; iter++ {
        // Buffers are scoped to the iteration, so they become unreachable
        // as soon as the iteration ends.
        src := make([]int, dataSize)
        dst := make([]int, dataSize*3)

        began := time.Now()
        rwtest(dataSize, &src, &dst)
        total += time.Since(began)
    }

    // Print out the cumulative time
    fmt.Printf("Thread %d duration is %d\n", threadNum, total)

    // Tell main this worker has finished
    reportChan <- 0
}


// main parses <num threads> <num loops> <data size> from the command line,
// starts one workerProc goroutine per thread, each with its own completion
// channel, and waits for all of them before printing "Done".
//
// A wrong argument count prints the usage line and returns; an argument that
// is not a non-negative integer prints a message and exits with status 1.
func main() {
    if len(os.Args) != 4 {
        fmt.Printf("Usage: %s <num threads> <num loops> <data size>\n", os.Args[0])

        return
    }

    // Parse the three arguments, rejecting anything strconv cannot convert
    // and any negative value (a negative size would panic inside make).
    argNames := [...]string{"num threads", "num loops", "data size"}
    var argVals [len(argNames)]int

    for k, name := range argNames {
        v, err := strconv.Atoi(os.Args[k+1])
        if err != nil || v < 0 {
            fmt.Printf("Invalid <%s> %q: must be a non-negative integer\n", name, os.Args[k+1])
            os.Exit(1)
        }
        argVals[k] = v
    }

    numThreads, numLoops, dataSize := argVals[0], argVals[1], argVals[2]

    fmt.Printf("Running Program with %d threads, with %d loops\n", numThreads, numLoops)

    // Make a channel for each thread
    chans := make([]chan int, 0, numThreads)

    for i := 0; i < numThreads; i++ {
        chans = append(chans, make(chan int))
    }

    // start the threads
    for i := 0; i < numThreads; i++ {
        go workerProc(i, chans[i], numLoops, dataSize)
    }

    var x int

    // Loop through the channels, waiting for each go routine to finish
    for i := 0; i < numThreads; i++ {
        x = <-chans[i]
    }

    fmt.Printf("Done: %d\n", x)
}

回答3:

The YouTubers at Level 1 Techs were seeing this on Threadripper processors as well. Long story short, the Windows 10 kernel seems to be shuffling threads between cores FAR, FAR too much while the program is running. https://www.youtube.com/watch?v=M2LOMTpCtLA

I have no idea if this is a problem with Server 2016 or 2019 kernel also. Being a new owner of a Threadripper 2950x myself, I would really like to get this solved.

来源：https://stackoverflow.com/questions/51217320/multi-threading-performance-much-worse-on-windows-10-than-linux

标签

Linux

windows

multithreading

cygwin