I ve implemented the sieve of erastosthenes in C++20 using with the punching of the non-prime multiples done multithreaded:
#include <iostream>
#include <cstdlib>
#include <charconv>
#include <cstring>
#include <vector>
#include <algorithm>
#include <cmath>
#include <bit>
#include <fstream>
#include <string_view>
#include <thread>
#include <utility>
#include <new>
#include <span>
#include <array>
#include <cassert>
#if defined(_MSC_VER)
#include <intrin.h>
#elif defined(__GNUC__) || defined(__clang__)
#include <cpuid.h>
#endif
#if defined(_MSC_VER)
#pragma warning(disable: 26495)
#endif
using namespace std;
#if defined(__cpp_lib_hardware_interference_size)
constexpr ptrdiff_t CL_SIZE = hardware_destructive_interference_size;
#else
constexpr ptrdiff_t CL_SIZE = 64;
#endif
size_t getL2Size();
bool smt();
int main( int argc, char **argv )
{
if( argc < 2 )
return EXIT_FAILURE;
auto parse = []( char const *str, auto &value )
{
bool hex = str[0] == 0 && (str[1] == x || str[1] == X );
str += 2 * hex;
auto [ptr, ec] = from_chars( str, str + strlen( str ), value, !hex ? 10 : 16 );
return ec == errc() && !*ptr;
};
size_t end;
if( !parse( argv[1], end ) )
return EXIT_FAILURE;
if( end < 2 || (ptrdiff_t)end++ < 0 )
throw bad_alloc();
unsigned
hc = jthread::hardware_concurrency(),
nThreads;
if( argc < 4 || !parse( argv[3], nThreads ) )
nThreads = hc;
if( !nThreads )
return EXIT_FAILURE;
using word_t = size_t;
constexpr size_t
BITS_PER_CL = CL_SIZE * 8,
BITS = sizeof(word_t) * 8;
auto bitEnd = []( size_t end ) { return end / 2 + (end & 1 ^ 1); };
size_t nBits = bitEnd( end );
union alignas(CL_SIZE) ndi_words_cl { word_t words[CL_SIZE / sizeof(word_t)]; ndi_words_cl() {} };
vector<ndi_words_cl> sieveCachelines( (nBits + BITS_PER_CL - 1) / BITS_PER_CL );
span<word_t> sieve( &sieveCachelines[0].words[0], (nBits + BITS - 1) / BITS );
fill( sieve.begin(), sieve.end(), (word_t)-1 );
size_t sqrt = (ptrdiff_t)ceil( ::sqrt( (ptrdiff_t)end ) );
auto punch = [&]( size_t start, size_t end, size_t prime, auto )
{
size_t bit = start / 2;
end = bitEnd( end );
if( bit >= end )
return;
if( prime >= BITS ) [[likely]]
do [[likely]]
sieve[bit / BITS] &= rotl( (word_t)-2, bit % BITS );
while( (bit += prime) < end );
else
{
auto pSieve = sieve.begin() + bit / BITS;
do [[likely]]
{
word_t
word = *pSieve,
mask = rotl( (word_t)-2, bit % BITS ),
oldMask;
do
word &= mask,
oldMask = mask,
mask = rotl( mask, prime % BITS ),
bit += prime;
while( mask < oldMask );
*pSieve++ = word;
} while( bit < end );
}
};
for( size_t bSqrt = bitEnd( sqrt ), bit = 1; bit < bSqrt; ++bit ) [[likely]]
{
auto pSieve = sieve.begin() + bit / BITS;
do [[likely]]
if( word_t w = *pSieve >> bit % BITS; w ) [[likely]]
{
bit += countr_zero( w );
break;
}
while( (bit = bit + BITS & -(ptrdiff_t)BITS) < bSqrt );
if( bit >= bSqrt ) [[unlikely]]
break;
size_t prime = 2 * bit + 1;
punch( prime * prime, sqrt, prime, false_type() );
}
auto scan = [&]( size_t start, size_t end, auto fn )
{
start /= 2;
end = bitEnd( end );
if( start >= end )
return;
auto pSieve = sieve.begin() + start / BITS;
for( size_t bit = start; ; )
{
word_t word = *pSieve++ >> bit % BITS;
bool hasBits = word;
for( unsigned gap; word; word >>= gap, word >>= 1 ) [[likely]]
{
gap = countr_zero( word );
if( (bit += gap) >= end ) [[unlikely]]
return;
fn( bit * 2 + 1, true_type() );
if( ++bit >= end ) [[unlikely]]
return;
}
if( bit >= end ) [[unlikely]]
break;
bit = (bit + BITS - hasBits) & -(ptrdiff_t)BITS;
}
};
static auto dbl = []( ptrdiff_t x ) { return (double)x; };
using range_t = pair<size_t, size_t>;
vector<pair<size_t, size_t>> ranges;
vector<jthread> threads;
ranges.reserve( nThreads );
auto initiate = [&]( size_t start, auto fn )
{
double threadRange = dbl( end - start ) / nThreads;
for( size_t t = nThreads, rEnd = end, trStart; t--; rEnd = trStart ) [[likely]]
{
trStart = start + (ptrdiff_t)((ptrdiff_t)t * threadRange + 0.5);
size_t trClStart = trStart & -(2 * CL_SIZE * 8);
trStart = trClStart >= sqrt ? trClStart : sqrt;
if( trStart < rEnd )
ranges.emplace_back( trStart, rEnd );
}
threads.reserve( ranges.size() - 1 );
for( range_t const &range : ranges )
if( &range != &ranges.back() )
threads.emplace_back( fn, range.first, range.second, true_type() );
else
fn( range.first, range.second, false_type() );
};
double maxCacheRange = dbl( getL2Size() * 8 * 2 ) / 3 * (smt() && nThreads > hc / 2 ? 1 : 2);
initiate( sqrt,
[&]( size_t rStart, size_t rEnd, auto )
{
double thisThreadRange = dbl( rEnd - rStart );
ptrdiff_t nCachePartitions = (ptrdiff_t)ceil( thisThreadRange / maxCacheRange );
double cacheRange = thisThreadRange / dbl( nCachePartitions );
for( size_t p = nCachePartitions, cacheEnd = rEnd, cacheStart; p--; cacheEnd = cacheStart ) [[likely]]
{
cacheStart = rStart + (ptrdiff_t)((double)(ptrdiff_t)p * cacheRange + 0.5);
cacheStart &= -(2 * CL_SIZE * 8);
cacheStart = cacheStart >= sqrt ? cacheStart : sqrt;
scan( 3, sqrt,
[&]( size_t prime, auto )
{
size_t start = (cacheStart + prime - 1) / prime * prime;
start = start & 1 ? start : start + prime;
punch( start, cacheEnd, prime, true_type() );
} );
}
} );
threads.resize( 0 );
if( argc >= 3 && !*argv[2] )
{
atomic<size_t> aPrimes( 1 );
initiate( 3,
[&]( size_t rStart, size_t rEnd, auto )
{
size_t nPrimes = 0;
scan( rStart, rEnd, [&]( size_t, auto ) { ++nPrimes; } );
aPrimes.fetch_add( nPrimes, memory_order::relaxed );
} );
cout << aPrimes.load( memory_order_relaxed ) << endl;
return EXIT_SUCCESS;
}
ofstream ofs;
ofs.exceptions( ofstream::failbit | ofstream::badbit );
ofs.open( argc >= 3 ? argv[2] : "primes.txt", ofstream::binary | ofstream::trunc );
constexpr size_t
BUF_SIZE = 0x100000,
AHEAD = 32;
union ndi_char { char c; ndi_char() {} };
vector<ndi_char> rawPrintBuf( BUF_SIZE + AHEAD - 1 );
span printBuf( &rawPrintBuf.front().c, &rawPrintBuf.back().c + 1 );
auto
bufBegin = printBuf.begin(),
bufEnd = bufBegin,
bufFlush = bufBegin + BUF_SIZE;
auto print = [&]() { ofs << string_view( bufBegin, bufEnd ); };
auto printPrime = [&]( size_t prime, auto )
{
auto [ptr, ec] = to_chars( &*bufEnd, &bufEnd[AHEAD - 1], prime );
if( ec != errc() ) [[unlikely]]
throw system_error( (int)ec, generic_category(), "converson failed" );
bufEnd = ptr - &*bufBegin + bufBegin; // pointer to iterator - NOP
*bufEnd++ =
;
if( bufEnd >= bufFlush ) [[unlikely]]
print(), bufEnd = bufBegin;
};
printPrime( 2, false_type() );
scan( 3, end, printPrime );
print();
}
array<unsigned, 4> cpuId( unsigned code )
{
array<unsigned, 4> regs;
#if defined(_MSC_VER)
__cpuid( (int *)®s[0], code );
#elif defined(__GNUC__) || defined(__clang__)
__cpuid(code, regs[0], regs[1], regs[2], regs[3]);
#endif
return regs;
}
bool smt()
{
if( cpuId( 0 )[0] < 1 )
return false;
return cpuId( 1 )[3] >> 28 & 1;
}
size_t getL2Size()
{
if( cpuId( 0x80000000u )[0] < 0x80000006u )
return 512ull * 1024;
return (cpuId( 0x80000006u )[2] >> 16) * 1024;
}
Almost all of the CPU time is spent in the multi-threaded code punching the non-prime multiples. Within the punching threads the bit range is partitioned to fit into the L2-cache. If the number of threads is beyond the half number of logical cores on a SMT-system the thread sub-partitions are half the size.
Ive just added storing only the odd primes, i.e no multiples of 2, thereby having about +87% more performance.
On my Ryzen 9 7950X Zen4 16-core CPU producing all primes up to 2 ^ 32, that s almost 210E6 primes, takes about 0.15 seconds on all cores with suppressed file output; on one core the algorithm takes about 1.73 seconds. The program s third parameter is the number of threads. With 16 threads on my 16 core Zen4-machine the code I get a scaling of 70% over one thread. Occupying further SMT siblings brings almost no further speedup as the code depends on the throughput of the L2-cache.
The file output is done with to_chars and my own buffering to speed up the I/O. Producing a 2.2GB file from the mentioned range takes about an additional two seconds on my computer (64GB, PCIe 4.0 SSD).