Simplest Snippet
最简单的实现如下。`onlyNBitsSetRNG64` 接收一个 64 位随机数 `rng`,并生成一个恰好有 `n` 个位被置 1 的随机 64 位数。(`n` 最大为 10:每放置一位要消耗 `rng` 的 6 位,64 位只够用 10 次。)如果需要超过这一上限,请使用下文的流式(stream)版本。
/**
 * Build a 64-bit value with exactly `n` bits set, placed using bits taken from `rng`.
 *
 * Every round rotates the accumulator right by the low 6 bits of `rng`, discards those
 * 6 bits, and (unless finished) sets the lowest clear bit with `res |= res + 1`.
 *
 * @param rng  random input; 6 bits are used per round, so only the first 10 rounds see
 *             fresh bits (later rounds rotate by 0).
 * @param n    number of bits to set.
 * @return     0 when n == 0, all ones when n >= 64, otherwise a value with n bits set.
 */
uint64_t onlyNBitsSetRNG64(uint64_t rng, unsigned n) {
    if (n == 0) {
        return 0;               /* the loop below would wrap the counter and spin ~2^32 times */
    }
    if (n >= 64) {
        return UINT64_MAX;      /* every bit ends up set; no need to iterate */
    }

    uint64_t res = 1;
    for (;;) {
        const unsigned rot = (unsigned)(rng & 63);
        /* "& 63" keeps the left shift in [0, 63]; shifting a 64-bit value by 64 is undefined */
        res = (res >> rot) | (res << ((64 - rot) & 63));
        rng >>= 6;
        if (--n == 0) {
            return res;
        }
        res |= res + 1;         /* set the lowest clear bit */
    }
}
如果你只想生成 32 位的随机数,请使用下面的 `onlyNBitsSetRNG32`,其上限为 12 位(每放置一位消耗 `rng` 的 5 位)。
/**
 * Build a 32-bit value with exactly `n` bits set, placed using bits taken from `rng`.
 *
 * Every round rotates the accumulator right by the low 5 bits of `rng`, discards those
 * 5 bits, and (unless finished) sets the lowest clear bit with `res |= res + 1`.
 *
 * @param rng  random input; 5 bits are used per round, so only the first 12 rounds see
 *             a full 5 fresh bits.
 * @param n    number of bits to set.
 * @return     0 when n == 0, all ones when n >= 32, otherwise a value with n bits set.
 */
uint32_t onlyNBitsSetRNG32(uint64_t rng, unsigned n) {
    if (n == 0) {
        return 0;               /* the loop below would wrap the counter and spin ~2^32 times */
    }
    if (n >= 32) {
        return UINT32_MAX;      /* every bit ends up set; no need to iterate */
    }

    uint32_t res = 1;
    for (;;) {
        const unsigned rot = (unsigned)(rng & 31);
        /* "& 31" keeps the left shift in [0, 31]; shifting a 32-bit value by 32 is undefined */
        res = (res >> rot) | (res << ((32 - rot) & 31));
        rng >>= 5;
        if (--n == 0) {
            return res;
        }
        res |= res + 1;         /* set the lowest clear bit */
    }
}
GB/s of speed
如果你追求 C 语言下的极致吞吐量,请在 GCC 下使用以下代码:它利用 GCC 的自动向量化,并自动检测 CPU 支持的特性(AVX2 和 AVX512),从而选择最快的代码路径。
#include <assert.h>
#include <limits.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
/*
 * RAND_INTN_NBIT(T, NAME, N) -- define a function that returns a T with N bits set.
 *
 *   T     unsigned integer result type
 *   NAME  the function declarator written out in full; it must declare an unsigned
 *         integer parameter called `rng`, e.g. int32With4SetBits(uint32_t rng)
 *   N     number of bits to set; must be at least 1 (the round counter is decremented
 *         before it is tested) and no larger than the bit width of `rng`
 *
 * Each round rotates the accumulator right by the low bits of `rng` (3 bits for 1-byte T,
 * 4 for 2-byte, 5 for 4-byte, 6 for 8-byte), discards those bits, and then sets the
 * lowest clear bit with `res |= res + 1`.
 *
 * Every line of the body ends in a backslash so that it belongs to the #define.
 */
#define RAND_INTN_NBIT(T, NAME, N) \
T NAME { assert(((void)"N bits too big",(N)<=sizeof(rng)*8)); \
    unsigned i = (N); \
    for (T res = 1; 1; res|=res+1) { \
        const int SZT = sizeof(T), MSK = SZT*8 - 1; \
        /* "& MSK" keeps the left shift below the width of T when the rotation is 0 */ \
        res = (res>>(rng&MSK)) | (res<<((MSK+1 - (rng&MSK))&MSK)); \
        rng >>= 3+(SZT>1)+(SZT>2)+(SZT>4)+(SZT>8)+(SZT>16); \
        if ( ! (i -= 1)) return res; \
    } return ((void)"Note: unreachable", 0); }
/*
 * Lane-wise helpers used by the stream kernel. Each one expands to a complete `for`
 * statement (trailing semicolon included), so invocations are written without a `;`.
 * They rely on names supplied by the expansion site:
 *   j     int loop index
 *   SZT   size of one element in bytes      (_RINB_VECRD)
 *   M, M1 element width in bits, and M - 1  (_RINB_VECROL)
 *   shft  random bits consumed per round    (_RINB_VECSHR)
 *   rand  cursor into the random input      (_RINB_VECRD, advanced by one element per lane)
 *
 *   _RINB_VECSET1(N,V)    V[0..N) = 1
 *   _RINB_VECRD(N,V)      copy N elements from `rand` into V
 *   _RINB_VECROL(N,V,S)   rotate each V[j] right by S[j] mod M; the left-shift count is
 *                         reduced mod M so a rotation of 0 never shifts by the full width
 *   _RINB_VECSHR(N,V)     V[j] >>= shft
 *   _RINB_VECAOR(N,V)     V[j] |= V[j] + 1   (sets the lowest clear bit)
 *   _RINB_VECWR(N,T,O,F)  T[O*N + j] = F[j]
 */
#define _RINB_VECSET1(N,V) for(j=0;j<(N);j++) V[j] = 1;
#define _RINB_VECRD(N,V) for(j=0;j<(N);j++)memcpy(V+j,rand++,SZT);
#define _RINB_VECROL(N,V,S) for(j=0;j<(N);j++) V[j]=(V[j]>>(S[j]&M1))|(V[j]<<((M-(S[j]&M1))&M1));
#define _RINB_VECSHR(N,V) for(j=0;j<(N);j++) V[j] >>= shft;
#define _RINB_VECAOR(N,V) for(j=0;j<(N);j++) V[j] |= V[j] + 1;
#define _RINB_VECWR(N,T,O,F) for(j=0;j<(N);j++)T[(O)*(N)+j] = F[j];
/*
 * _RINB_SMB(N, V, T) -- statement sequence that forms the body of a stream generator.
 *
 *   N  bits to set in every output integer
 *   V  lanes per working array; it sizes initialized arrays, so it must be an integer
 *      constant expression
 *   T  unsigned element type of the output
 *
 * The expansion expects `out`, `len` and `rng` to be in scope and uses the _RINB_VEC*
 * helper macros. It asserts that `len` is a multiple of 512, then walks `out` in steps
 * of 4*V elements. For every step, four V-lane accumulators (a, b, c, d) start at 1 and
 * go through N rounds of: rotate right by the low bits of the matching random lane,
 * shift the random lane down by `shft`, and (except after the last round) set the lowest
 * clear bit. The random lanes are refilled from `rng` on the first round and again
 * whenever the bit budget `l` drops below zero.
 *
 * NOTE(review): because the refill only triggers once `l` is negative, the last round
 * before a refill can see fewer than `shft` fresh bits (e.g. 2 of 5 for 32-bit lanes);
 * confirm whether that is intended for large N.
 *
 * Every line of the body ends in a backslash so that it belongs to the #define.
 */
#define _RINB_SMB(N,V,T) \
    assert(((void)"length must be multiple 512", (len&511)==0)); \
    uint8_t SZT=(uint8_t)sizeof(T), M=(uint8_t)SZT*8, M1=M-1; \
    T * rand=(T*)rng, * p=(T*)&out[0], *e=(T*)&out[len]; \
    for(int j,shft=3+(SZT>1)+(SZT>2)+(SZT>4)+(SZT>8)+(SZT>16); p!=e; p+=4*V){ \
        T a[V]={0},b[V]={0},c[V]={0},d[V]={0},aR[V]={0},bR[V]={0},cR[V]={0},dR[V]={0}; \
        _RINB_VECSET1(V,a) _RINB_VECSET1(V,b) _RINB_VECSET1(V,c) _RINB_VECSET1(V,d) \
        int x=N, l=0; while(1) { \
            /* refill all four random arrays when the bit budget runs out */ \
            if((l-=shft)<0){l=M;_RINB_VECRD(V,aR)_RINB_VECRD(V,bR)_RINB_VECRD(V,cR)_RINB_VECRD(V,dR)} \
            _RINB_VECROL(V,a,aR) _RINB_VECROL(V,b,bR) _RINB_VECROL(V,c,cR) _RINB_VECROL(V,d,dR) \
            _RINB_VECSHR(V,aR) _RINB_VECSHR(V,bR) _RINB_VECSHR(V,cR) _RINB_VECSHR(V,dR) \
            if (!(x -= 1)) break; \
            _RINB_VECAOR(V,a) _RINB_VECAOR(V,b) _RINB_VECAOR(V,c) _RINB_VECAOR(V,d) \
        } \
        _RINB_VECWR(V,p,0,a)_RINB_VECWR(V,p,1,b)_RINB_VECWR(V,p,2,c)_RINB_VECWR(V,p,3,d) \
    }
/*
 * _RINTNBIT_SMIMPL(O, N, V) -- define `static void O { ... }` whose body is _RINB_SMB
 * instantiated for the element type of `out`.
 *
 *   O  function declarator (name plus parameter list); the parameters must include `out`
 *   N  bits to set per output integer
 *   V  lanes per working array
 *
 * The element type is obtained with __typeof__ on GNU-compatible compilers, with C23
 * `typeof` elsewhere when available, with `decltype` in C++, and otherwise by selecting
 * a fixed-width unsigned type from sizeof(*out).
 */
#if defined(__GNUC__)
/* __typeof__ is also accepted in strict ISO modes (e.g. -std=c11), where plain `typeof` is not a keyword */
#define _RINTNBIT_SMIMPL(O,N,V) static void O{_RINB_SMB(N,V,__typeof__(*out))}
#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L
#define _RINTNBIT_SMIMPL(O,N,V) static void O{_RINB_SMB(N,V,typeof(*out))}
#elif defined(__cplusplus)
/* NOTE(review): decltype(*out) names a reference type in C++; confirm this branch builds on a non-GNU C++ compiler */
#define _RINTNBIT_SMIMPL(O,N,V) static void O{_RINB_SMB(N,V,decltype(*out))}
#else
#define _RINTNBIT_SMIMPL(O,N,V) static void O { \
    if(sizeof(*out)==8){ _RINB_SMB(N,V,uint64_t) } \
    else if(sizeof(*out)==4){ _RINB_SMB(N,V,uint32_t) } \
    else if(sizeof(*out)==2){ _RINB_SMB(N,V,uint16_t) } \
    else if(sizeof(*out)==1){ _RINB_SMB(N,V,uint8_t) } \
    else assert(((void)"needed type size not found", 0));}
#endif
/*
 * RAND_INTN_NBIT_STREAM(NAME, ARGS, N) -- define a function pointer `NAME` with parameter
 * list ARGS that fills an output array with integers having N bits set.
 *
 * x86-64 with a GNU-compatible compiler: three static implementations are generated
 * (DEF<NAME> with 1 lane, AVX2<NAME> with 8 lanes, A512<NAME> with 16 lanes), each
 * compiled with its own target attribute. NAME initially points at SEL<NAME>, which
 * queries the CPU on the first call, stores the chosen implementation in NAME, and
 * forwards the call, so later calls go straight to the chosen implementation.
 * The A512 variant is only selected when every AVX-512 subset it is compiled with
 * (bw, dq, vl) is reported by the CPU.
 *
 * AArch64 with a GNU-compatible compiler: a single REAL<NAME> with 2 lanes.
 * Anything else: a single REAL<NAME> with 1 lane.
 *
 * Every continued definition ends its lines in a backslash so the whole body belongs
 * to the #define.
 */
#if defined(__x86_64__) && defined(__GNUC__)
#define _RINB_PRAGMA(str) _Pragma(str)
#ifdef __clang__
#define _RINTNBCA(x,...) __attribute__((target(x)))
#else
#define _RINTNBCA(...) __attribute__((target(__VA_ARGS__), \
    optimize("O3,tree-vectorize")))
#endif
#define _RAND_INTNBIT_VECIMPL(O, N) \
    _RINTNBCA("default") _RINTNBIT_SMIMPL(DEF ## O ,N,1) \
    _RINTNBCA("avx2,bmi2") _RINTNBIT_SMIMPL(AVX2 ## O ,N,8) \
    _RINTNBCA("avx512bw,avx512dq,avx512vl","prefer-vector-width=512") \
    _RINTNBIT_SMIMPL(A512 ## O ,N,16)
#define _RAND_INTNBIT_SEL(V,S) if(!S("cmov"))__builtin_cpu_init(); \
    if(S("avx512bw")&&S("avx512dq")&&S("avx512vl")) V = A512 ## V; \
    else if(S("avx2")&&S("bmi2")) V = AVX2 ## V; \
    else V = DEF ## V; V(out,len,rng);
#define RAND_INTN_NBIT_STREAM(NAME, ARGS, N) \
    _RAND_INTNBIT_VECIMPL(NAME ARGS,N) static void SEL ## NAME ARGS; \
    void (*NAME) ARGS = SEL ## NAME ; static void SEL ## NAME ARGS { \
    _RAND_INTNBIT_SEL(NAME, __builtin_cpu_supports) }
#elif defined(__aarch64__) && defined(__GNUC__)
#define _RINB_PRAGMA(str) _Pragma(str)
#define RAND_INTN_NBIT_STREAM(O,A,N) _RINTNBIT_SMIMPL(REAL##O A,N,2) \
    void (*O) A = REAL ## O ;
#else
#define _RINB_PRAGMA(str)
#define RAND_INTN_NBIT_STREAM(O,A,N) _RINTNBIT_SMIMPL(REAL##O A,N,1) \
    void (*O) A = REAL ## O ;
#endif
// INTN_OF_NBIT_RNG_SIZE(SZ, LEN, N) => number of RNG bytes the stream generators read to
// produce LEN integers of SZ bytes each with N bits set apiece.
//
// The stream kernel loads one SZ-byte random word per output integer, draws
// log2(SZ*8) bits from it per round, and reloads after (SZ*8 / bits-per-round) + 1
// rounds. The byte count is therefore LEN * SZ * ceil(N / rounds-per-word).
// Arguments are evaluated more than once; pass side-effect-free expressions.
#define _RINB_RNG_SHIFT(SZ) (3+((SZ)>1)+((SZ)>2)+((SZ)>4)+((SZ)>8)+((SZ)>16))
#define _RINB_RNG_ROUNDS(SZ) ((SZ)*8/_RINB_RNG_SHIFT(SZ) + 1)
#define INTN_OF_NBIT_RNG_SIZE(SZ, LEN, N) \
    ((LEN) * (SZ) * (((N) + _RINB_RNG_ROUNDS(SZ) - 1) / _RINB_RNG_ROUNDS(SZ)))
The above code is written with portability fallbacks but is untested on MSVC; probably a tweak or two at most and it should work.
Example usage:
// Usage example. Feel free to rename these as desired:
//  - `uint32_t int32With4SetBits(uint32_t rng)` => returns a uint32_t with exactly 4 bits
//    set, positioned using bits taken from `rng`
//  - `void (*int32With4SetBitsStream)(uint32_t* out, size_t len, void* rng)` => function
//    pointer that writes `len` uint32_t values to the `out` array, reading its randomness
//    from the buffer at `rng` (size that buffer with `INTN_OF_NBIT_RNG_SIZE()`); the
//    generated body asserts that `len` is a multiple of 512
RAND_INTN_NBIT(uint32_t, int32With4SetBits(uint32_t rng), 4)
RAND_INTN_NBIT_STREAM(int32With4SetBitsStream, (uint32_t* out, size_t len, void* rng), 4)
如果你的机器支持 AVX512,AVX512 版本会同时处理 4 个 512 位向量,每个向量包含 16 个 32 位整数,速度可达每个整数 2.7 个周期,在我的机器上约为 1200 MB/s。例如,生成每个只有 4 位被置 1 的 32 位整数时,得到如下汇编:
# Compiler output (Intel syntax) for the AVX-512 variant of int32With4SetBitsStream.
# The branch targets .L47 and .L48 are not part of this excerpt.
A512int32With4SetBitsStream:
# Branch to .L47 (not shown) unless the low 9 bits of esi are zero,
# which matches the "length must be multiple 512" assertion in the C source.
test esi, 511
jne .L47
# rcx = write cursor, rdi = end pointer (start + rsi*4); return immediately if they are equal.
mov rcx, rdi
lea rdi, [rdi+rsi*4]
cmp rcx, rdi
je .L42
# Loop-invariant constants: zmm8 = 1 in every lane, zmm7 = 31 in every lane, zmm9 = 0.
# rsi takes over the value passed in rdx (presumably the rng pointer, used by the
# refill code at .L48 which is not shown).
mov eax, 1
mov rsi, rdx
vpxor xmm9, xmm9, xmm9
vpbroadcastd zmm8, eax
mov eax, 31
vpbroadcastd zmm7, eax
# Outer loop: each pass produces four ZMM vectors (256 bytes) of output.
.L39:
# Accumulators zmm4/zmm5/zmm6/zmm14 start at 1, random-bit vectors zmm10..zmm13 start at 0,
# edx = 4 rounds (one per bit to set), eax = remaining-random-bits counter.
vpxor xmm13, xmm13, xmm13
vmovdqa32 zmm14, zmm8
vmovdqa32 zmm6, zmm8
mov edx, 4
vmovdqa32 zmm5, zmm8
vmovdqa32 zmm4, zmm8
vmovdqa32 zmm12, zmm13
xor eax, eax
vmovdqa32 zmm11, zmm13
vmovdqa32 zmm10, zmm13
# Inner loop: subtract 5 from the bit counter and branch to .L48 (not shown) when it goes negative.
.L38:
sub eax, 5
js .L48
# One round for all four vectors, interleaved by the compiler:
#   amount = random & 31; random >>= 5;
#   rotated = (acc >> amount) | (acc << ((0 - amount) & 31));   -> zmm3, zmm2, zmm1, zmm0
#   acc = (rotated + 1) | rotated;                              -> zmm4, zmm5, zmm6, zmm14
.L37:
vpandd zmm2, zmm10, zmm7
vpandd zmm1, zmm11, zmm7
vpandd zmm0, zmm12, zmm7
vpsubd zmm3, zmm9, zmm2
vpsrld zmm10, zmm10, 5
vpandd zmm15, zmm13, zmm7
vpsrld zmm11, zmm11, 5
vpsrld zmm12, zmm12, 5
vpandd zmm3, zmm3, zmm7
vpsllvd zmm3, zmm4, zmm3
vpsrlvd zmm4, zmm4, zmm2
vpsubd zmm2, zmm9, zmm1
vpandd zmm2, zmm2, zmm7
vpsrld zmm13, zmm13, 5
vpsllvd zmm2, zmm5, zmm2
vpsrlvd zmm5, zmm5, zmm1
vpsubd zmm1, zmm9, zmm0
vpandd zmm1, zmm1, zmm7
vpord zmm3, zmm3, zmm4
vpsllvd zmm1, zmm6, zmm1
vpsrlvd zmm6, zmm6, zmm0
vpsubd zmm0, zmm9, zmm15
vpandd zmm0, zmm0, zmm7
vpsrlvd zmm15, zmm14, zmm15
vpord zmm2, zmm2, zmm5
vpsllvd zmm0, zmm14, zmm0
vpord zmm1, zmm1, zmm6
vpaddd zmm4, zmm3, zmm8
vpaddd zmm5, zmm2, zmm8
vpaddd zmm6, zmm1, zmm8
vpord zmm4, zmm4, zmm3
vpord zmm0, zmm0, zmm15
vpord zmm5, zmm5, zmm2
vpord zmm6, zmm6, zmm1
vpaddd zmm14, zmm0, zmm8
vpord zmm14, zmm14, zmm0
dec edx
jne .L38
# After the last round the rotated values (before the final add-or) are stored,
# and the write cursor advances by 256 bytes.
vmovdqu32 ZMMWORD PTR [rcx], zmm3
add rcx, 256
vmovdqu32 ZMMWORD PTR [rcx-192], zmm2
vmovdqu32 ZMMWORD PTR [rcx-128], zmm1
vmovdqu32 ZMMWORD PTR [rcx-64], zmm0
cmp rdi, rcx
jne .L39
vzeroupper
.L42:
ret
在 AARCH64 / ARM 64 位平台上,会生成以下经过优化的 NEON 代码:
// Compiler output for the AArch64 variant of int32With4SetBitsStream.
REALint32With4SetBitsStream:
// Continue at .L2 when the low 9 bits of x1 are zero; otherwise set up the
// arguments and call __assert_fail (the "length must be multiple 512" check).
tst x1, 511
beq .L2
stp x29, x30, [sp, -16]!
adrp x3, .LANCHOR0
adrp x1, .LC0
mov x29, sp
adrp x0, .LC1
add x3, x3, :lo12:.LANCHOR0
add x1, x1, :lo12:.LC0
add x0, x0, :lo12:.LC1
mov w2, 642
bl __assert_fail
.L2:
// Loop-invariant constants: v16 = 1 in every lane, v5 = 0x1f in every lane.
// x1 becomes the end pointer (x0 + x1*4).
movi v16.4s, 0x1
add x1, x0, x1, lsl 2
movi v5.4s, 0x1f
// Outer loop: each pass produces two Q registers (32 bytes) of output.
.L3:
cmp x0, x1
beq .L12
// Accumulators v0/v3 start at 1, random-bit vectors v4/v6 start at 0,
// w4 = 4 rounds (one per bit to set), w3 = remaining-random-bits counter.
movi v0.4s, 0x1
mov w4, 4
movi v4.4s, 0
mov w3, 0
mov v3.16b, v0.16b
mov v6.16b, v4.16b
// Inner loop: subtract 5 from the bit counter; when it goes negative, load 32 fresh
// bytes from [x2] into v6/v4, advance x2 by 32 and reset the counter to 32.
.L5:
subs w3, w3, #5
bpl .L4
ldp q6, q4, [x2]
add x2, x2, 32
mov w3, 32
// One round for both vectors: take the low 5 bits of each random lane as the rotate
// amount, shift the random lanes right by 5, combine the two shifted copies of each
// accumulator with orr (v2, v1), then compute (rotated + 1) | rotated as the next
// accumulator (v3, v0).
.L4:
and v7.16b, v6.16b, v5.16b
subs w4, w4, #1
and v1.16b, v4.16b, v5.16b
ushr v6.4s, v6.4s, 5
ushr v4.4s, v4.4s, 5
neg v7.4s, v7.4s
and v2.16b, v7.16b, v5.16b
sshl v2.4s, v3.4s, v2.4s
ushl v3.4s, v3.4s, v7.4s
orr v2.16b, v2.16b, v3.16b
neg v3.4s, v1.4s
and v1.16b, v3.16b, v5.16b
sshl v1.4s, v0.4s, v1.4s
ushl v0.4s, v0.4s, v3.4s
add v3.4s, v2.4s, v16.4s
orr v1.16b, v1.16b, v0.16b
orr v3.16b, v3.16b, v2.16b
add v0.4s, v1.4s, v16.4s
orr v0.16b, v0.16b, v1.16b
bne .L5
// After the last round the rotated values (before the final add-or) are stored,
// and the write cursor advances by 32 bytes.
stp q2, q1, [x0]
add x0, x0, 32
b .L3
.L12:
ret
Explanation: add-or
我们算法的基本构件是很少用到的 add-or 运算:`(a + b) | a | b`。
add-or 的关键在于:当其中一个操作数是 2 的幂时(下例中的 `b`),置 1 的位数总是恰好增加 1,除非发生溢出(即进位越过了最高位,例如 32 位整数的第 32 位):
a := 0b101101
b := 0b000100
a + b := 0b110001
addOr := (a + b) | a | b
addOr := 0b111101
基本上,可以把 `addOr` 操作想象成:从给定位置开始向高位寻找第一个空闲位来容纳进位,同时保留此前已经置 1 的所有位(来自 `a` 或 `b`)。
Explanation: handling the carry
因此,add-or 正好适用于我们的场景。最简单的思路如下:
/**
 * Set one more bit in `to`, starting the search at bit `pos` (taken mod 32).
 *
 * Adding the single bit lets the carry ripple up to the first clear position; if the
 * addition wraps past bit 31, the lost carry is fed back in at bit 0. OR-ing with the
 * input keeps every bit that was already set.
 *
 * @param to   current bit set
 * @param pos  position at which to add the new bit; only the low 5 bits are used
 * @return     `to` with the carry-propagated sum OR-ed in
 */
uint32_t addOneBitTo(uint32_t to, unsigned pos) {
    const uint32_t bit = UINT32_C(1) << (pos & 31);
    uint32_t total = to + bit;

    if (total < to) {
        /* the addition overflowed into the (non-existent) 33rd bit: wrap the carry around */
        total += 1;
    }
    return total | to;
}
注:当然,所有位都已置 1(即 `-1`)的情况除外。
Explanation: rotations
把进位加回到和中会引入一条数据依赖,把乱序执行限制在一条没有收益的关键路径上,因此还可以做得更好:使用循环移位(rotation)。
我们可以不检测溢出,而是先对源值做循环移位,使要加入的位落在第 0 位(值为 1),然后只需计算 `x | (x + 1)`。
/*
 * Rotate the 32-bit unsigned value x right / left by n (taken mod 32).
 * The complementary shift count is reduced mod 32 so that a rotation by 0 never shifts
 * by the full width, and the whole expansion is parenthesized so it can be used inside
 * larger expressions. Both arguments are evaluated more than once.
 */
#define ROTATE32R(x,n) (((x)>>((n)&31)) | ((x)<<((32-((n)&31))&31)))
#define ROTATE32L(x,n) (((x)<<((n)&31)) | ((x)>>((32-((n)&31))&31)))

/**
 * Set one more bit in `to`, starting the search at bit `pos` (taken mod 32).
 * Gives THE SAME output as the carry-based `addOneBitTo`.
 *
 * @param to   current bit set
 * @param pos  position at which to add the new bit; only the low 5 bits are used
 * @return     `to` with one additional bit set, or `to` unchanged if all bits were set
 */
uint32_t addOneBitTo(uint32_t to, unsigned pos) {
    to = ROTATE32R(to, pos);                 /* bring bit `pos` down to bit 0 */
    uint32_t sum = to + (UINT32_C(1) << 0);  /* now the new pos is 0 */
    /* no carry handling needed: the carry stops at the first clear bit, wherever it is */
    uint32_t addOr = sum | to;
    addOr = ROTATE32L(addOr, pos);           /* restore the rotation */
    return addOr;
}
由于这是一个随机数生成器(RNG),我们并不关心输出中各位的具体位置,只要求输出是随机且均匀分布的,因此可以去掉第二次循环移位,得到下面这个单行函数。用随机位置反复调用它,直到达到所需的置位数为止。
/*
 * Rotate the 32-bit unsigned value x right by n (taken mod 32).
 * The complementary shift count is reduced mod 32 so that a rotation by 0 never shifts
 * by the full width, and the whole expansion is parenthesized. Both arguments are
 * evaluated more than once. The #undef keeps this definition valid even if an earlier
 * ROTATE32R with different replacement text is visible in the same translation unit.
 */
#undef ROTATE32R
#define ROTATE32R(x,n) (((x)>>((n)&31)) | ((x)<<((32-((n)&31))&31)))

/**
 * Set the lowest clear bit of `to`, then rotate the result right by `pos` (mod 32).
 * Calling this repeatedly with random positions grows a bit set one bit per call.
 *
 * @param to   current bit set
 * @param pos  rotation amount; only the low 5 bits are used
 * @return     the rotated value with one additional bit set (unchanged count if `to`
 *             already had all 32 bits set)
 */
uint32_t randomAddBitTo(uint32_t to, unsigned pos) {
    to |= to + 1;
    return ROTATE32R(to, pos);
}
That's all the explanation I have. I found nothing further of interest than this.