So I am trying to measure the latencies of L1, L2, L3 cache using C. I know the size of them and I feel I understand conceptually how to do it but I am running into problems
Well for those interested, I couldn't get my first code set to work so I tried a couple alternative approaches that produced decent results.
The first used linked lists with nodes allocated stride bytes apart in a contiguous memory space. The dereferencing of the nodes mitigates the effectiveness of the pre-fetcher and in the case that multiple cache lines are pulled in I made the strides significantly large to avoid cache hits. As the size of the list allocated increases, it jumps to the cache or memory structure that will hold it showing clear divisions in latency.
#include
#include
#include
#include
#include
//MACROS
#define ONE iterate = (char**) *iterate;
#define FIVE ONE ONE ONE
#define TWOFIVE FIVE FIVE FIVE FIVE FIVE
#define HUNDO TWOFIVE TWOFIVE TWOFIVE TWOFIVE
//prototype
void allocateRandomArray(long double);
void accessArray(char *, long double, char**);
int main(){
//call the function for allocating arrays of increasing size in MB
allocateRandomArray(.00049);
allocateRandomArray(.00098);
allocateRandomArray(.00195);
allocateRandomArray(.00293);
allocateRandomArray(.00391);
allocateRandomArray(.00586);
allocateRandomArray(.00781);
allocateRandomArray(.01172);
allocateRandomArray(.01562);
allocateRandomArray(.02344);
allocateRandomArray(.03125);
allocateRandomArray(.04688);
allocateRandomArray(.0625);
allocateRandomArray(.09375);
allocateRandomArray(.125);
allocateRandomArray(.1875);
allocateRandomArray(.25);
allocateRandomArray(.375);
allocateRandomArray(.5);
allocateRandomArray(.75);
allocateRandomArray(1);
allocateRandomArray(1.5);
allocateRandomArray(2);
allocateRandomArray(3);
allocateRandomArray(4);
allocateRandomArray(6);
allocateRandomArray(8);
allocateRandomArray(12);
allocateRandomArray(16);
allocateRandomArray(24);
allocateRandomArray(32);
allocateRandomArray(48);
allocateRandomArray(64);
allocateRandomArray(96);
allocateRandomArray(128);
allocateRandomArray(192);
}
void allocateRandomArray(long double size){
int accessSize=(1024*1024*size); //array size in bytes
char * randomArray = malloc(accessSize*sizeof(char)); //allocate array of size allocate size
int counter;
int strideSize=4096; //step size
char ** head = (char **) randomArray; //start of linked list in contiguous memory
char ** iterate = head; //iterator for linked list
for(counter=0; counter < accessSize; counter+=strideSize){
(*iterate) = &randomArray[counter+strideSize]; //iterate through linked list, having each one point stride bytes forward
iterate+=(strideSize/sizeof(iterate)); //increment iterator stride bytes forward
}
*iterate = (char *) head; //set tailf to point to head
accessArray(randomArray, size, head);
free(randomArray);
}
void accessArray(char *cacheArray, long double size, char** head){
const long double NUM_ACCESSES = 1000000000/100; //number of accesses to linked list
const int SECONDS_PER_NS = 1000000000; //const for timer
FILE *fp = fopen("accessData.txt", "a"); //open file for writing data
int newIndex=0;
int counter=0;
int read=0;
struct timespec startAccess, endAccess; //struct for timer
long double accessTime = 0;
char ** iterate = head; //create iterator
clock_gettime(CLOCK_REALTIME, &startAccess); //start clock
for(counter=0; counter < NUM_ACCESSES; counter++){
HUNDO //macro subsitute 100 accesses to mitigate loop overhead
}
clock_gettime(CLOCK_REALTIME, &endAccess); //end clock
//calculate the time elapsed in ns per access
accessTime = (((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec)) / (100*NUM_ACCESSES);
fprintf(fp, "%Lf\t%Lf\n", accessTime, size); //print results to file
fclose(fp); //close file
}
This produced the most consistent results, and using a variety of array sizes and plotting the respective latencies gave a very clear distinction of the different cache sizes present.
The next method like the previous allocated increasing size arrays. But instead of using a linked list for memory access, I fill each index with its respective number and randomly shuffled the array. I then used these indexes to hop around randomly within the array for accesses, mitigating the effects of the pre-fetcher. However, it had an occasional strong deviation in access time when multiple adjacent cache lines are pulled in and happen to be hit.
#include
#include
#include
#include
#include
//prototype
void allocateRandomArray(long double);
void accessArray(int *, long int);
int main(){
srand(time(NULL)); // Seed random function
int i=0;
for(i=2; i < 32; i++){
allocateRandomArray(pow(2, i)); //call latency function on arrays of increasing size
}
}
void allocateRandomArray(long double size){
int accessSize = (size) / sizeof(int);
int * randomArray = malloc(accessSize*sizeof(int));
int counter;
for(counter=0; counter < accessSize; counter ++){
randomArray[counter] = counter;
}
for(counter=0; counter < accessSize; counter ++){
int i,j;
int swap;
i = rand() % accessSize;
j = rand() % accessSize;
swap = randomArray[i];
randomArray[i] = randomArray[j];
randomArray[j] = swap;
}
accessArray(randomArray, accessSize);
free(randomArray);
}
void accessArray(int *cacheArray, long int size){
const long double NUM_ACCESSES = 1000000000;
const int SECONDS_PER_NS = 1000000000;
int newIndex=0;
int counter=0;
int read=0;
struct timespec startAccess, endAccess;
long double accessTime = 0;
clock_gettime(CLOCK_REALTIME, &startAccess); //start clock
for(counter = 0; counter < NUM_ACCESSES; counter++){
newIndex=cacheArray[newIndex];
}
clock_gettime(CLOCK_REALTIME, &endAccess); //end clock
//calculate the time elapsed in ns per access
accessTime = (((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec)) / (NUM_ACCESSES);
printf("Access time: %Lf for size %ld\n", accessTime, size);
}
Averaged across many trials, this method produced relatively accurate results as well. The first choice is definitely the better of the two but this is an alternate approach that works fine as well.