Question

I m试图在2**32下打印每本。现在,我用一个ool子,在做饭后,用一个 b子来建造一个ie子,然后用 prime子印刷。这只需要4分钟时间,才能印出10亿顶的 prime。这样做是否有更快的途径? 我的法典

#include <iostream>
#include <cstdlib>
#include <vector>
#include <math.h>

using namespace std;

int main(int argc, char **argv){
  long long limit = atoll(argv[1]);
  //cin >> limit;
  long long sqrtlimit = sqrt(limit);

  vector<bool> sieve(limit+1, false);

  for(long long n = 4; n <= limit; n += 2)
    sieve[n] = true;

  for(long long n=3; n <= sqrtlimit; n = n+2){
    if(!sieve[n]){
      for(long long m = n*n; m<=limit; m=m+(2*n))
        sieve[m] = true;
    }
  }

  long long last;
  for(long long i=limit; i >= 0; i--){
    if(sieve[i] == false){
      last = i;
      break;
    }
  }
  cout << last << endl;

  for(long long i=2;i<=limit;i++)
  {
    if(!sieve[i])
      if(i != last)
        cout<<i<<",";
      else
        cout<<i;
  }
  cout<<endl;

Answer 1

我在blog上讨论了生成大量素材的问题,我发现首批10亿素的总和为11138479445180497。我描述了四种不同的方法:

Brute force, testing each number starting from 2 using trial division.
采用2,3,5,7-wheel的候选人,然后在2,7和61个基地进行初步测试;这种方法只使用2,3,5,7-wheel, 而我仅够用2,832英亩,但不足以给你。
An algorithm due to Melissa O Neill that uses sieve embedded in a priority queue, which is quite slow.
伊托斯特内斯的分层座,它非常快,但需要空间来储存si子和ie子。

Answer 2

I ve implemented the sieve of erastosthenes in C++20 using with the punching of the non-prime multiples done multithreaded:

#include <iostream>
#include <cstdlib>
#include <charconv>
#include <cstring>
#include <vector>
#include <algorithm>
#include <cmath>
#include <bit>
#include <fstream>
#include <string_view>
#include <thread>
#include <utility>
#include <new>
#include <span>
#include <array>
#include <cassert>
#if defined(_MSC_VER)
    #include <intrin.h>
#elif defined(__GNUC__) || defined(__clang__)
    #include <cpuid.h>
#endif

#if defined(_MSC_VER)
    #pragma warning(disable: 26495)
#endif

using namespace std;

#if defined(__cpp_lib_hardware_interference_size)
constexpr ptrdiff_t CL_SIZE = hardware_destructive_interference_size;
#else
constexpr ptrdiff_t CL_SIZE = 64;
#endif

size_t getL2Size();
bool smt();

int main( int argc, char **argv )
{
    if( argc < 2 )
        return EXIT_FAILURE;
    auto parse = []( char const *str, auto &value )
    {
        bool hex = str[0] ==  0  && (str[1] ==  x  || str[1] ==  X );
        str += 2 * hex;
        auto [ptr, ec] = from_chars( str, str + strlen( str ), value, !hex ? 10 : 16 );
        return ec == errc() && !*ptr;
    };
    size_t end;
    if( !parse( argv[1], end ) )
        return EXIT_FAILURE;
    if( end < 2 || (ptrdiff_t)end++ < 0 )
        throw bad_alloc();
    unsigned
        hc = jthread::hardware_concurrency(),
        nThreads;
    if( argc < 4 || !parse( argv[3], nThreads ) )
        nThreads = hc;
    if( !nThreads )
        return EXIT_FAILURE;
    using word_t = size_t;
    constexpr size_t
        BITS_PER_CL = CL_SIZE * 8,
        BITS = sizeof(word_t) * 8;
    auto bitEnd = []( size_t end ) { return end / 2 + (end & 1 ^ 1); };
    size_t nBits = bitEnd( end );
    union alignas(CL_SIZE) ndi_words_cl { word_t words[CL_SIZE / sizeof(word_t)]; ndi_words_cl() {} };
    vector<ndi_words_cl> sieveCachelines( (nBits + BITS_PER_CL - 1) / BITS_PER_CL );
    span<word_t> sieve( &sieveCachelines[0].words[0], (nBits + BITS - 1) / BITS );
    fill( sieve.begin(), sieve.end(), (word_t)-1 );
    size_t sqrt = (ptrdiff_t)ceil( ::sqrt( (ptrdiff_t)end ) );
    auto punch = [&]( size_t start, size_t end, size_t prime, auto )
    {
        size_t bit = start / 2;
        end = bitEnd( end );
        if( bit >= end )
            return;
        if( prime >= BITS ) [[likely]]
            do [[likely]]
                sieve[bit / BITS] &= rotl( (word_t)-2, bit % BITS );
            while( (bit += prime) < end );
        else
        {
            auto pSieve = sieve.begin() + bit / BITS;
            do [[likely]]
            {
                word_t
                    word = *pSieve,
                    mask = rotl( (word_t)-2, bit % BITS ),
                    oldMask;
                do
                    word &= mask,
                    oldMask = mask,
                    mask = rotl( mask, prime % BITS ),
                    bit += prime;
                while( mask < oldMask );
                *pSieve++ = word;
            } while( bit < end );
        }
    };
    for( size_t bSqrt = bitEnd( sqrt ), bit = 1; bit < bSqrt; ++bit ) [[likely]]
    {
        auto pSieve = sieve.begin() + bit / BITS;
        do [[likely]]
            if( word_t w = *pSieve >> bit % BITS; w ) [[likely]]
            {
                bit += countr_zero( w );
                break;
            }
        while( (bit = bit + BITS & -(ptrdiff_t)BITS) < bSqrt );
        if( bit >= bSqrt ) [[unlikely]]
            break;
        size_t prime = 2 * bit + 1;
        punch( prime * prime, sqrt, prime, false_type() );
    }
    auto scan = [&]( size_t start, size_t end, auto fn )
    {
        start /= 2;
        end = bitEnd( end );
        if( start >= end )
            return;
        auto pSieve = sieve.begin() + start / BITS;
        for( size_t bit = start; ; )
        {
            word_t word = *pSieve++ >> bit % BITS;
            bool hasBits = word;
            for( unsigned gap; word; word >>= gap, word >>= 1 ) [[likely]]
            {
                gap = countr_zero( word );
                if( (bit += gap) >= end ) [[unlikely]]
                    return;
                fn( bit * 2 + 1, true_type() );
                if( ++bit >= end ) [[unlikely]]
                    return;
            }
            if( bit >= end ) [[unlikely]]
                break;
            bit = (bit + BITS - hasBits) & -(ptrdiff_t)BITS;
        }
    };
    static auto dbl = []( ptrdiff_t x ) { return (double)x; };
    using range_t = pair<size_t, size_t>;
    vector<pair<size_t, size_t>> ranges;
    vector<jthread> threads;
    ranges.reserve( nThreads );
    auto initiate = [&]( size_t start, auto fn )
    {
        double threadRange = dbl( end - start ) / nThreads;
        for( size_t t = nThreads, rEnd = end, trStart; t--; rEnd = trStart ) [[likely]]
        {
            trStart = start + (ptrdiff_t)((ptrdiff_t)t * threadRange + 0.5);
            size_t trClStart = trStart & -(2 * CL_SIZE * 8);
            trStart = trClStart >= sqrt ? trClStart : sqrt;
            if( trStart < rEnd )
                ranges.emplace_back( trStart, rEnd );
        }
        threads.reserve( ranges.size() - 1 );
        for( range_t const &range : ranges )
            if( &range != &ranges.back() )
                threads.emplace_back( fn, range.first, range.second, true_type() );
            else
                fn( range.first, range.second, false_type() );
    };
    double maxCacheRange = dbl( getL2Size() * 8 * 2 ) / 3 * (smt() && nThreads > hc / 2 ? 1 : 2);
    initiate( sqrt,
        [&]( size_t rStart, size_t rEnd, auto )
        {
            double thisThreadRange = dbl( rEnd - rStart );
            ptrdiff_t nCachePartitions = (ptrdiff_t)ceil( thisThreadRange / maxCacheRange );
            double cacheRange = thisThreadRange / dbl( nCachePartitions );
            for( size_t p = nCachePartitions, cacheEnd = rEnd, cacheStart; p--; cacheEnd = cacheStart ) [[likely]]
            {
                cacheStart = rStart + (ptrdiff_t)((double)(ptrdiff_t)p * cacheRange + 0.5);
                cacheStart &= -(2 * CL_SIZE * 8);
                cacheStart = cacheStart >= sqrt ? cacheStart : sqrt;
                scan( 3, sqrt,
                    [&]( size_t prime, auto )
                    {
                        size_t start = (cacheStart + prime - 1) / prime * prime;
                        start = start & 1 ? start : start + prime;
                        punch( start, cacheEnd, prime, true_type() );
                    } );
            }
        } );
    threads.resize( 0 );
    if( argc >= 3 && !*argv[2] )
    {
        atomic<size_t> aPrimes( 1 );
        initiate( 3,
            [&]( size_t rStart, size_t rEnd, auto )
            {
                size_t nPrimes = 0;
                scan( rStart, rEnd, [&]( size_t, auto ) { ++nPrimes; } );
                aPrimes.fetch_add( nPrimes, memory_order::relaxed );
            } );
        cout << aPrimes.load( memory_order_relaxed ) << endl;
        return EXIT_SUCCESS;
    }
    ofstream ofs;
    ofs.exceptions( ofstream::failbit | ofstream::badbit );
    ofs.open( argc >= 3 ? argv[2] : "primes.txt", ofstream::binary | ofstream::trunc );
    constexpr size_t
        BUF_SIZE = 0x100000,
        AHEAD = 32;
    union ndi_char { char c; ndi_char() {} };
    vector<ndi_char> rawPrintBuf( BUF_SIZE + AHEAD - 1 );
    span printBuf( &rawPrintBuf.front().c, &rawPrintBuf.back().c + 1 );
    auto
        bufBegin = printBuf.begin(),
        bufEnd = bufBegin,
        bufFlush = bufBegin + BUF_SIZE;
    auto print = [&]() { ofs << string_view( bufBegin, bufEnd ); };
    auto printPrime = [&]( size_t prime, auto )
    {
        auto [ptr, ec] = to_chars( &*bufEnd, &bufEnd[AHEAD - 1], prime );
        if( ec != errc() ) [[unlikely]]
            throw system_error( (int)ec, generic_category(), "converson failed" );
        bufEnd = ptr - &*bufBegin + bufBegin; // pointer to iterator - NOP
        *bufEnd++ =  
 ;
        if( bufEnd >= bufFlush ) [[unlikely]]
            print(), bufEnd = bufBegin;
    };
    printPrime( 2, false_type() );
    scan( 3, end, printPrime );
    print();
}

array<unsigned, 4> cpuId( unsigned code )
{
    array<unsigned, 4> regs;
#if defined(_MSC_VER)
    __cpuid( (int *)&regs[0], code );
#elif defined(__GNUC__) || defined(__clang__)
    __cpuid(code, regs[0], regs[1], regs[2], regs[3]);
#endif
    return regs;
}

bool smt()
{
    if( cpuId( 0 )[0] < 1 )
        return false;
    return cpuId( 1 )[3] >> 28 & 1;
}

size_t getL2Size()
{
    if( cpuId( 0x80000000u )[0] < 0x80000006u )
        return 512ull * 1024;
    return (cpuId( 0x80000006u )[2] >> 16) * 1024;
}

Almost all of the CPU time is spent in the multi-threaded code punching the non-prime multiples. Within the punching threads the bit range is partitioned to fit into the L2-cache. If the number of threads is beyond the half number of logical cores on a SMT-system the thread sub-partitions are half the size.
Ive just added storing only the odd primes, i.e no multiples of 2, thereby having about +87% more performance.
On my Ryzen 9 7950X Zen4 16-core CPU producing all primes up to 2 ^ 32, that s almost 210E6 primes, takes about 0.15 seconds on all cores with suppressed file output; on one core the algorithm takes about 1.73 seconds. The program s third parameter is the number of threads. With 16 threads on my 16 core Zen4-machine the code I get a scaling of 70% over one thread. Occupying further SMT siblings brings almost no further speedup as the code depends on the throughput of the L2-cache.
The file output is done with to_chars and my own buffering to speed up the I/O. Producing a 2.2GB file from the mentioned range takes about an additional two seconds on my computer (64GB, PCIe 4.0 SSD).

Answer 3

这或许会加快步伐:

#include <algorithm>
#include <iostream>
#include <iterator>
#include <vector>

int main()
{
    std::vector<unsigned long long> numbers;
    unsigned long long maximum = 4294967296;
    for (unsigned long long i = 2; i <= maximum; ++i)
    {
        if (numbers.empty())
        {
            numbers.push_back(i);
            continue;
        }

        if (std::none_of(numbers.begin(), numbers.end(), [&](unsigned long long p)
        {
            return i % p == 0;
        }))
        {
            numbers.push_back(i);
        }

    }

    std::cout << "Primes:  " << std::endl;
    std::copy(numbers.begin(), numbers.end(), std::ostream_iterator<int>(std::cout, " "));

    return 0;
}

这是伊里斯托斯群岛史上的反常现象(而不是从限制下的每个数目开始,消除多种因素,从2起,忽视到极限的多重因素)。

Answer 4

最快的方式可能是要有一个预先拟定的名单。

http://www.bigprimes.net/“rel=“nofollow”http://www.bigprimes.net/。首批14亿本金可供下载,其中应包括每台3 000亿以下金。

我假定装上双筒可能太长了,因为它在规模上是少数兆字节。

Answer 5

你们是否衡量了什么时候? 它是自发的,还是产出的撰写?

加速冲动的一个快速办法是停止对所有数字的担忧。哪怕只有一只大事,你很难加以编码。这把你阵列的面积削减一半,如果你在身体记忆的限度下重温,就会大有助益。

vector<bool> sieve((limit+1)/2, false);
...
  for(long long m = n*n/2; m<=limit/2; m=m+n)
    sieve[m] = true;

就产出本身而言,<编码>cout效率不高。采用<代码>itoa或一些相当的决定因素>,然后将<代码>cout.write用于产出。甚至可以进入旧学校,使用<条码>frite>,<条码>。

Answer 6

I wrote a fast way in C to output primes up to 4 billion in under three minutes using a single thread on my Ryzen 9 3900x. If you output it to a file, it ends up being 2.298GB and I think it uses about 20GB of memory to complete.

#include <stdlib.h>
#include <stdio.h>

#define ARRSIZE 4000000000
#define MAXCALC ARRSIZE/2

int main() {
    long int z;
    long int *arr = (long int*) malloc((ARRSIZE) * sizeof(long int));
    for (long int x=3;x <= MAXCALC; x++) {
        if (x % 10 == 3 || x % 10 == 7 || x % 10 == 9) {
            for (long int y=3; y < MAXCALC; y++){
                    z = x * y;
                    if (z < ARRSIZE)
                        arr[z] = 1;
                    else
                        break;
            }
        }
    }
    printf("2 3 5 ");
    for (long int x=7; x < ARRSIZE; x++) {
        if (x % 2 != 0 && x % 10 != 5)
            if (arr[x] != 1)
                printf("%ld ", x);
    }
    printf("
");

    free(arr);
    return EXIT_SUCCESS;
}

Answer 7

我撰写了《规则一》一文,即8.7秒,产出低于10亿。但是,如果你把首批40亿大豆或比这少的宝石ment掉的话,我是无法肯定的。我的解决办法是:

import numpy as np
from math import isqrt

def all_primes_less_than(n):
    is_prime = np.full(n,True)
    is_prime[:2] = False
    is_prime[4::2] = False
    for p in range(3,isqrt(n),2):
        if is_prime[p]: is_prime[p*p::p] = False
    return is_prime.nonzero()[0]

友情链接