Measuring Cache Latencies

前端 未结 5 1651
感情败类
感情败类 2020-11-28 20:16

So I am trying to measure the latencies of the L1, L2, and L3 caches using C. I know their sizes and I feel I understand conceptually how to do it, but I am running into problems.

5条回答
  •  -上瘾入骨i
    2020-11-28 20:55

    For those interested: I couldn't get my first set of code to work, so I tried a couple of alternative approaches that produced decent results.

    The first approach uses a linked list whose nodes are placed stride bytes apart in one contiguous block of memory. Because every load depends on the result of the previous one, the pre-fetcher is far less effective, and I made the stride large enough that pulling in several adjacent cache lines at once does not produce extra cache hits. As the allocated list grows, it spills into the next level of cache (and finally main memory), showing clear divisions in latency.

    #include 
    #include 
    #include 
    #include 
    #include 
    
    //MACROS
    // ONE performs a single dependent load: the pointer stored at `iterate`
    // becomes the next value of `iterate`. The larger macros repeat it so that
    // HUNDO performs exactly 5 * 5 * 4 = 100 loads back to back.
    // They expect a `char **iterate` variable in the enclosing scope and expand
    // to complete statements, so no trailing semicolon is needed at the use site.
    #define ONE iterate = (char**) *iterate;
    #define FIVE ONE ONE ONE ONE ONE
    #define TWOFIVE FIVE FIVE FIVE FIVE FIVE
    #define HUNDO TWOFIVE TWOFIVE TWOFIVE TWOFIVE
    
    //prototype
    // Allocates a buffer of the given size (in MB), links it into a pointer
    // chain and passes it to accessArray().
    void allocateRandomArray(long double);
    // Walks the pointer chain starting at the third argument and appends the
    // measured time per access, together with the second argument, to
    // accessData.txt. The first argument is the buffer that holds the chain.
    void accessArray(char *, long double, char**);
    
    int main(){
        // Buffer sizes in MB, in the order they are measured. The sequence
        // alternates between a power of two and 1.5 times that power of two,
        // from roughly 0.5 KB up to 192 MB.
        static const long double sizesInMB[] = {
            .00049, .00098, .00195, .00293, .00391, .00586,
            .00781, .01172, .01562, .02344, .03125, .04688,
            .0625,  .09375, .125,   .1875,  .25,    .375,
            .5,     .75,    1,      1.5,    2,      3,
            4,      6,      8,      12,     16,     24,
            32,     48,     64,     96,     128,    192
        };
        const int sizeCount = sizeof(sizesInMB) / sizeof(sizesInMB[0]);
        int idx;

        // One measurement per table entry.
        for(idx = 0; idx < sizeCount; idx++){
            allocateRandomArray(sizesInMB[idx]);
        }
    }
    
    // Builds a circular linked list of pointers inside a freshly allocated
    // buffer of `size` MB, with consecutive nodes strideSize bytes apart, then
    // passes it to accessArray() and frees the buffer.
    //
    // size: requested buffer size in MB (fractions are allowed).
    //
    // If the buffer would be too small to hold a single pointer, or the
    // allocation fails, a message is written to stderr and accessArray() is
    // not called.
    void allocateRandomArray(long double size){
        const size_t strideSize = 4096;                 //distance in bytes between consecutive nodes
        const long double requestedBytes = 1024*1024*size;
        size_t accessSize;                              //array size in bytes
        size_t nodeCount;
        size_t node;
        char *randomArray;
        char **head;

        //written as a negated >= so that a NaN size is rejected as well
        if(!(requestedBytes >= sizeof(char *))){
            fprintf(stderr, "allocateRandomArray: %Lf MB cannot hold a list node\n", size);
            return;
        }
        accessSize = (size_t) requestedBytes;

        randomArray = malloc(accessSize);
        if(randomArray == NULL){
            fprintf(stderr, "allocateRandomArray: cannot allocate %Lf MB\n", size);
            return;
        }

        //Only nodes that lie completely inside the buffer are linked. A buffer
        //shorter than one stride still gets one node, which points to itself.
        nodeCount = accessSize / strideSize;
        if(nodeCount == 0){
            nodeCount = 1;
        }

        head = (char **) randomArray;   //start of linked list in contiguous memory
        for(node = 0; node < nodeCount; node++){
            char **current = (char **) (randomArray + node*strideSize);
            size_t next = (node + 1) % nodeCount;       //the last node wraps around to the head
            *current = randomArray + next*strideSize;
        }

        accessArray(randomArray, size, head);
        free(randomArray);
    }
    
    // Times a walk along the pointer chain that starts at `head` and appends
    // one line "<ns per access>\t<size>" to accessData.txt.
    //
    // cacheArray: buffer that holds the chain; it is not read here.
    // size:       value written next to the latency (the caller in this
    //             program passes the buffer size in MB).
    // head:       first node of the chain. Every node must hold the address of
    //             another node, and the chain must never end, because it is
    //             followed for a fixed number of steps.
    //
    // If accessData.txt cannot be opened or the clock cannot be read, a message
    // is written to stderr and no result line is produced.
    void accessArray(char *cacheArray, long double size, char** head){
        const long NUM_ACCESSES = 1000000000/100;       //number of loop iterations
        const long double ACCESSES_PER_LOOP = 100;      //HUNDO is expected to perform 100 accesses
        const long double NS_PER_SECOND = 1000000000;   //const for timer
        struct timespec startAccess, endAccess;         //struct for timer
        long double accessTime;
        char ** volatile sink;          //keeps the final pointer observable
        char ** iterate = head;         //create iterator
        long counter;
        FILE *fp;

        (void) cacheArray;

        //open the output first so that a failure does not waste a measurement
        fp = fopen("accessData.txt", "a");
        if(fp == NULL){
            fprintf(stderr, "accessArray: cannot open accessData.txt\n");
            return;
        }

        //CLOCK_MONOTONIC is not affected by adjustments of the wall clock
        if(clock_gettime(CLOCK_MONOTONIC, &startAccess) != 0){
            fprintf(stderr, "accessArray: cannot read the clock\n");
            fclose(fp);
            return;
        }
        for(counter=0; counter < NUM_ACCESSES; counter++){
            HUNDO       //macro substitutes 100 accesses to mitigate loop overhead
        }
        if(clock_gettime(CLOCK_MONOTONIC, &endAccess) != 0){
            fprintf(stderr, "accessArray: cannot read the clock\n");
            fclose(fp);
            return;
        }

        //Storing the last pointer in a volatile object makes the chase a
        //visible side effect, so an optimizing compiler cannot delete the loop.
        sink = iterate;
        (void) sink;

        //calculate the time elapsed in ns per access
        accessTime = ((endAccess.tv_sec - startAccess.tv_sec) * NS_PER_SECOND
                      + (endAccess.tv_nsec - startAccess.tv_nsec))
                     / (ACCESSES_PER_LOOP * NUM_ACCESSES);
        fprintf(fp, "%Lf\t%Lf\n", accessTime, size);  //print results to file
        if(fclose(fp) != 0){
            fprintf(stderr, "accessArray: error while writing accessData.txt\n");
        }
    }
    

    This produced the most consistent results, and using a variety of array sizes and plotting the respective latencies gave a very clear distinction of the different cache sizes present.

    The next method, like the previous one, allocates arrays of increasing size. But instead of using a linked list for memory access, I fill each index with its own number and then randomly shuffle the array. I then use the stored values as indexes to hop around randomly within the array, mitigating the effects of the pre-fetcher. However, it shows an occasional strong deviation in access time when multiple adjacent cache lines are pulled in and happen to be hit.

    #include 
    #include 
    #include 
    #include 
    #include 
    
    //prototype
    // Allocates an int array covering the given number of bytes, fills it with
    // shuffled indexes and passes it to accessArray().
    void allocateRandomArray(long double);
    // Follows the indexes stored in the array (first argument) and prints the
    // measured time per access; the second argument is the number of elements.
    void accessArray(int *, long int);
    
    int main(){
        int exponent = 2;

        // Seed rand() from the current time so every run shuffles differently.
        srand(time(NULL));

        // Run the latency measurement on arrays of 2^2, 2^3, ... 2^31 bytes.
        while(exponent < 32){
            allocateRandomArray(pow(2, exponent));
            exponent++;
        }
    }
    
    // Returns a pseudo-random value in [0, bound); bound must not be zero.
    // rand() is only guaranteed to deliver 15 random bits per call, fewer than
    // the largest arrays need, so three draws are mixed together.
    static size_t randomIndexBelow(size_t bound){
        size_t bits = 0;
        int draw;

        for(draw = 0; draw < 3; draw++){
            bits = (bits << 15) ^ (size_t) rand();
        }
        return bits % bound;
    }

    // Allocates an int array covering `size` bytes, turns it into one random
    // cycle of indexes, passes it to accessArray() and frees it.
    //
    // size: array size in bytes; it must hold at least one int.
    //
    // Every element holds the index of the element to visit next. The order is
    // produced with Sattolo's algorithm, so following the indexes from any
    // start visits every element exactly once before returning to the start.
    // (Independent random swaps would allow short cycles, which let the walk
    // stay inside a handful of elements.)
    //
    // If `size` is too small or the allocation fails, a message is written to
    // stderr and accessArray() is not called.
    void allocateRandomArray(long double size){
        int accessSize;         //number of int elements
        int counter;
        int * randomArray;

        //written as a negated >= so that a NaN size is rejected as well
        if(!(size >= sizeof(int))){
            fprintf(stderr, "allocateRandomArray: %Lf bytes cannot hold an element\n", size);
            return;
        }
        accessSize = (size) / sizeof(int);

        randomArray = malloc((size_t) accessSize * sizeof *randomArray);
        if(randomArray == NULL){
            fprintf(stderr, "allocateRandomArray: cannot allocate %Lf bytes\n", size);
            return;
        }

        for(counter = 0; counter < accessSize; counter++){
            randomArray[counter] = counter;
        }
        //Sattolo's algorithm: swap each position only with a strictly lower one
        for(counter = accessSize - 1; counter > 0; counter--){
            int other = (int) randomIndexBelow((size_t) counter);
            int swap = randomArray[counter];
            randomArray[counter] = randomArray[other];
            randomArray[other] = swap;
        }

        accessArray(randomArray, accessSize);
        free(randomArray);
    }
    
    // Times a walk through `cacheArray` in which every element read supplies
    // the index of the next element, and prints the average time per access.
    //
    // cacheArray: array whose elements must all be valid indexes into it.
    // size:       number of elements in cacheArray; used for the report and to
    //             reject an empty array.
    //
    // The walk starts at index 0 and performs a fixed number of accesses. If
    // the array is missing or empty, or the clock cannot be read, a message is
    // written to stderr and nothing is printed to stdout.
    void accessArray(int *cacheArray, long int size){
        const long NUM_ACCESSES = 1000000000;
        const long double NS_PER_SECOND = 1000000000;
        struct timespec startAccess, endAccess;
        long double accessTime;
        volatile int sink;      //keeps the final index observable
        int newIndex = 0;
        long counter;

        if(cacheArray == NULL || size <= 0){
            fprintf(stderr, "accessArray: nothing to measure\n");
            return;
        }

        //CLOCK_MONOTONIC is not affected by adjustments of the wall clock
        if(clock_gettime(CLOCK_MONOTONIC, &startAccess) != 0){
            fprintf(stderr, "accessArray: cannot read the clock\n");
            return;
        }
        for(counter = 0; counter < NUM_ACCESSES; counter++){
            newIndex = cacheArray[newIndex];
        }
        if(clock_gettime(CLOCK_MONOTONIC, &endAccess) != 0){
            fprintf(stderr, "accessArray: cannot read the clock\n");
            return;
        }

        //Storing the last index in a volatile object makes the walk a visible
        //side effect, so an optimizing compiler cannot delete the loop.
        sink = newIndex;
        (void) sink;

        //calculate the time elapsed in ns per access
        accessTime = ((endAccess.tv_sec - startAccess.tv_sec) * NS_PER_SECOND
                      + (endAccess.tv_nsec - startAccess.tv_nsec))
                     / NUM_ACCESSES;
        printf("Access time: %Lf for size %ld\n", accessTime, size);
    }
    

    Averaged across many trials, this method produced relatively accurate results as well. The first choice is definitely the better of the two but this is an alternate approach that works fine as well.

提交回复
热议问题