2nd level row cache size - C ++

Question

Hello. I am writing a program to determine the size of the second-level cache line, I used the article habrahabr.ru/post/93263. But I get completely different results. Through the Coreinfo program, I learned that the size of the first and second level strings is 64 bytes each. So far I have decided to get at least the size of the first level row, but some completely inadequate results are obtained.

#include "stdafx.h" #include <time.h> #include <iostream> #include <string> using namespace std; int main() { int t; const int N = 8000; volatile int arr[N]; unsigned int A; char ask1 = 'y'; srand(time(NULL)); while (ask1 == 'y') { for (int j = 0; j < N; j++) arr[j] = rand(); for (int i = 1; i <= 64; i++) { A = clock(); for (int k = 0; k < 100000; k++) for (int j = 0; j < N; j += i) { t = arr[j]; } //cout << i << "\tstep, time\t" << clock() - A << '\t' << t << endl; arr[i] = (clock() - A); //Вместо печати, чтобы не рушить кэш, я решил занести время выполнения в тот же самый массив, а потом его вывести } for (int i = 1; i <= 64; i++) cout << i << "\tstep, time\t" << arr[i] << endl; cout << "Repeat?(y/n): "; cin >> ask1; cout << endl; } /**/ const int n = 1600000000; int l; unsigned int a; char ask = 'y'; srand(time(NULL)); //это просто для рандома, чтобы он был разный, хотя тут мне это и не особо нужно while (ask == 'y') { volatile int byte8[2]; for (int j = 0; j < 2; j++) byte8[j] = rand(); //рандомим массив a = clock();//записываем время до циклов считывания массива for (int k = 0; k < n / 2; k++) /*делим n на количество повторений внутреннего цикла, чтобы везде в сумме получилось одинаковое число повторений */ for (int i = 0; i < 2; i++) l = byte8[i]; cout << size(byte8) * 4 << "\tbytes\t" << clock() - a << endl; //выводим количество ms понадобившихся для считывания //далее повторяем то же самое для массивов большей длинны volatile int byte16[4]; for (int j = 0; j < 4; j++) byte16[j] = rand(); a = clock(); for (int k = 0; k < n / 4; k++) for (int i = 0; i < 4; i++) l = byte16[i]; cout << size(byte16) * 4 << "\tbytes\t" << clock() - a << endl; volatile int byte32[8]; for (int j = 0; j < 8; j++) byte32[j] = rand(); a = clock(); for (int k = 0; k < n / 8; k++) for (int i = 0; i < 8; i++) l = byte32[i]; cout << size(byte32) * 4 << "\tbytes\t" << clock() - a << endl; /* int byte60[15]; for (int j = 0; j < 15; j++) byte60[j] = rand(); a = clock(); for (int k = 0; k < n / 15; k++) for (int i = 0; i < 15; i++) l = byte60[i]; cout << size(byte60) * 4 << "\tbytes\t" << clock() - a << endl; */ volatile int byte64[16]; for (int j = 0; j < 16; j++) byte64[j] = rand(); a = clock(); for (int k = 0; k < n / 16; k++) for (int i = 0; i < 16; i++) l = byte64[i]; cout << size(byte64) * 4 << "\tbytes\t" << clock() - a << endl; /* int byte68[17]; for (int j = 0; j < 17; j++) byte68[j] = rand(); a = clock(); for (int k = 0; k < n / 17; k++) for (int i = 0; i < 17; i++) l = byte68[i]; cout << size(byte68) * 4 << "\tbytes\t" << clock() - a << endl; */ volatile int byte96[24]; for (int j = 0; j < 24; j++) byte96[j] = rand(); a = clock(); for (int k = 0; k < n / 24; k++) for (int i = 0; i < 24; i++) l = byte96[i]; cout << size(byte96) * 4 << "\tbytes\t" << clock() - a << endl; volatile int byte128[32]; for (int j = 0; j < 32; j++) byte128[j] = rand(); a = clock(); for (int k = 0; k < n / 32; k++) for (int i = 0; i < 32; i++) l = byte128[i]; cout << size(byte128) * 4 << "\tbytes\t" << clock() - a << endl; volatile int byte192[48]; for (int j = 0; j < 48; j++) byte192[j] = rand(); a = clock(); for (int k = 0; k < n / 48; k++) for (int i = 0; i < 48; i++) l = byte192[i]; cout << size(byte192) * 4 << "\tbytes\t" << clock() - a << endl; volatile int byte256[64]; for (int j = 0; j < 64; j++) byte256[j] = rand(); a = clock(); for (int k = 0; k < n / 64; k++) for (int i = 0; i < 64; i++) l = byte256[i]; cout << size(byte256) * 4 << "\tbytes\t" << clock() - a << endl; volatile int byte512[128]; for (int j = 0; j < 128; j++) byte512[j] = rand(); a = clock(); for (int k = 0; k < n / 128; k++) for (int i = 0; i < 128; i++) l = byte512[i]; cout << size(byte512) * 4 << "\tbytes\t" << clock() - a << endl; cout << "Repeat?(y/n): "; cin >> ask; cout << endl; } system("pause"); return 0; }

The fact is that I cannot use functions of the GetLogicalProcessorInformation type; I need to define a test. I launch in Visual Studio 2017 with a configuration on Release

The article is old, can the compiler become smarter or does some auto-overclocking work in the processor?
what is there in the second cycle is considered, I did not understand ...

Fat zer fat zer 11.1k five 21 · Answer 1 · 2017-12-25T03:03:47

In the first cycle, the array is too small (8000 * 4 <32K): it fits completely in the L1 cache, and then the CPU is threshed again and again without accessing the memory. Therefore, time, as expected, is inversely proportional to i.

An example of the correct option:

(C; amd64; requires 1G + memory)

 #include <stdio.h> #include <stdlib.h> #include <stdint.h> #include <string.h> #include <time.h> #include <assert.h> typedef uint64_t my_int_t; void mesureArrayMult (my_int_t *arr, size_t sz, size_t step) { clock_t start_time = clock(); for (size_t i = 0; i < sz; i += step) { arr[i]*=i; } double work_time_us = (clock() - start_time) * (1000. / CLOCKS_PER_SEC); printf ("%4zd (%4zd): %7.3lfus\n", step, sizeof(my_int_t) * step, work_time_us); } int main() { const size_t sz = 128*1024*1024; my_int_t *arr = (my_int_t *) malloc (sizeof(arr[0]) * sz); assert(arr); memset (arr, 1, sizeof(arr[0])*sz); size_t i=1; for ( ; i< 8; i+= 1) { mesureArrayMult( arr, sz, i ); } // for ( ; i<16; i+= 2) { mesureArrayMult( arr, sz, i ); } for ( ; i<=1024; i*=2) { mesureArrayMult( arr, sz, i ); } free (arr); return 0; }

Generally speaking, over-optimization will not spoil the result, but in order to be calm about this, it is advisable to disable embedding (-fno-inline on gcc and the like).

The output, as expected, for the step <= 64 bytes is approximately the same, for larger ones it is inversely proportional to the step (with considerable error):

  1 ( 8): 122.007us 2 ( 16): 122.161us 3 ( 24): 120.396us 4 ( 32): 119.503us 5 ( 40): 120.545us 6 ( 48): 121.657us 7 ( 56): 121.623us 8 ( 64): 122.529us 16 ( 128): 82.049us 32 ( 256): 46.452us 64 ( 512): 20.682us 128 (1024): 9.905us 256 (2048): 5.207us 512 (4096): 2.986us 1024 (8192): 1.570us

The only thing I would like to ask you is, for all the caches for which cache did we get the result?
@Larteezy; According to the logic of things, in order for which the array does not fit entirely, that is, in my case for L3.
But on x86 * all the lines are the same ... I don’t know if there are caches with different lines ...

2nd level row cache size - C ++

1 answer 1

An example of the correct option:

More articles: