Since the question is about the fastest implementation, it makes sense to try the simplest approach as well: using a whole int for each bit. Recall that the question was about speed, not about a memory-efficient implementation.
I used the @avp code to compare the speed. MSVC 2015, Release mode, x64.
Code:
// Benchmark: three ways to represent a set of 1,000,000 boolean flags.
//   1) std::bitset
//   2) a hand-rolled bit array packed into unsigned longs
//   3) a plain "naive" int-per-flag array
// Each variant seeds rand() identically, sets ~1M random bits, then counts the
// set bits; the whole cycle is repeated 100 times and timed with clock().
// (The original used an MSVC precompiled header "stdafx.h"; removed so the
// program is portable standard C++.)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <limits.h>
#include <time.h>
#include <iostream>
#include <bitset>

using namespace std;

int main(int ac, char *av[]) {
    const int n = 1000000;

    // --- Variant 1: std::bitset ---
    bitset<1000000> bcp;
    printf("bcp %d\n", (int)sizeof(bcp));

    // --- Variant 2: manual bit array ---
    const unsigned long ebits = (sizeof(unsigned long) * CHAR_BIT); // bits per word
    const unsigned long sz = (n + sizeof(unsigned long) * CHAR_BIT - 1) / ebits; // words, rounded up
    unsigned long bc[sz], bysz = sizeof(bc);
    printf("bc sz = %d ebits = %d bysz = %d\n", (int)(sz), (int)ebits, (int)bysz);

    // --- Variant 3: one int per flag ---
    int* crude_array = new int[n]; // contents are uninitialized until cleared below
    printf("crude array size %d\n", n);

    clock_t s, e;
    int nn = 0;

    // Timed loop 1: std::bitset.
    s = clock();
    for (int r = 0; r < 100; r++) {
        srand(9);
        bcp.reset();
        for (int i = 0; i < n; i++) bcp.set(rand() % n);
        for (int i = 0; i < n; i++) if (bcp.test(i)) nn++;
    }
    e = clock();
    cout << "nn = " << nn << " t = " << e - s << '\n';

    // Timed loop 2: hand-rolled bit array.
    nn = 0;
    s = clock();
    for (int r = 0; r < 100; r++) {
        srand(9);
        memset(bc, 0, bysz);
        for (int i = 0; i < n; i++) {
            int bi = rand() % n;
            bc[bi / ebits] |= (1UL << (bi % ebits));
        }
        for (int i = 0; i < n; i++) {
            if (bc[i / ebits] & (1UL << (i % ebits))) nn++;
        }
    }
    e = clock();
    cout << "nn = " << nn << " t = " << e - s << '\n';

    // Timed loop 3: naive int-per-flag array.
    nn = 0;
    s = clock();
    for (int r = 0; r < 100; r++) {
        srand(9);
        // BUG FIX: the original wrote memset(crude_array, 0, sizeof(crude_array)).
        // crude_array is a POINTER, so sizeof yields 8 bytes on x64 — only the
        // first two ints were ever cleared, and the rest of the new[]'d array was
        // read uninitialized on the first pass. Clear the full n*sizeof(int) bytes.
        memset(crude_array, 0, n * sizeof(int));
        for (int i = 0; i < n; i++) {
            int bi = rand() % n;
            crude_array[bi] = 1;
        }
        for (int i = 0; i < n; i++) {
            if (crude_array[i]) nn++;
        }
    }
    e = clock();
    cout << "nn = " << nn << " t = " << e - s << '\n';
    delete[] crude_array;
}
Result:
bcp 125000 bc sz = 31250 ebits = 32 bysz = 125000 crude array size 1000000 nn = 3276800 t = 2158 nn = 3276800 t = 2067 nn = 3276800 t = 1979
Repeated test runs show consistent results.
Note that it is not at all self-evident that the "naive" implementation would turn out to be the fastest. After all, it has poor cache locality, so it could easily have been slower than the others. The advice, therefore, is in any case to profile, profile, and profile your application code again on real data. Guessing the outcome "by pure reasoning" is rather unrealistic (except in very trivial cases).