原始代码看起来像
for(i=0;i<20;i++){
if(){
do();
}
else{
num2 = _mm_set_pd(Phasor.imaginary, Phasor.real);
for(int k=0; k<SamplesIneachPeriodCeil[iterationIndex]; k++)
{
/*SamplesIneachPeriodCeil[iterationIndex] is in range of 175000*/
num1 = _mm_loaddup_pd(&OutSymbol[k].real);
num3 = _mm_mul_pd(num2, num1);
num1 = _mm_loaddup_pd(&OutSymbol[k].imaginary);
num2 = _mm_shuffle_pd(num2, num2, 1);
num4 = _mm_mul_pd(num2, num1);
num3 = _mm_addsub_pd(num3, num4);
num2 = _mm_shuffle_pd(num2, num2, 1);
num5 = _mm_set_pd(InSymbolInt8[k],InSymbolInt8[k] );
num6 = _mm_mul_pd(num3, num5);
num7 = _mm_set_pd(Out[k].imaginary,Out[k].real);
num8 = _mm_add_pd(num7,num6);
_mm_storeu_pd((double *)&Out[k], num8);
}
Out = Out + SamplesIneachPeriodCeil[iterationIndex];
}
}
这个代码给了我15毫微秒的速度
当我修改代码以包括 Openmp 时
注意:这里我只包括其他部分
else{
int size = SamplesIneachPeriodCeil[iterationIndex];
#pragma omp parallel num_threads(2) shared(size)
{
int start,end,tindex,tno,no_of_iteration;
tindex = omp_get_thread_num();
tno = omp_get_num_threads();
start = tindex * size / tno;
end = (1+ tindex)* size / tno ;
num2 = _mm_set_pd(Phasor.imaginary, Phasor.real);
int k;
for(k = start ; k < end; k++){
num1 = _mm_loaddup_pd(&OutSymbol[k].real);
num3 = _mm_mul_pd(num2, num1);
num1 = _mm_loaddup_pd(&OutSymbol[k].imaginary);
num2 = _mm_shuffle_pd(num2, num2, 1);
num4 = _mm_mul_pd(num2, num1);
num3 = _mm_addsub_pd(num3, num4);
//_mm_storeu_pd((double *)&newSymbol, num3);
num2 = _mm_shuffle_pd(num2, num2, 1);
num5 = _mm_set_pd(InSymbolInt8[k],InSymbolInt8[k] );
num6 = _mm_mul_pd(num3, num5);
num7 = _mm_set_pd(Out[k].imaginary,Out[k].real);
num8 = _mm_add_pd(num7,num6);
_mm_storeu_pd((double *)&Out[k], num8);
}
}
Out = Out + size;
}
这个代码显示的速度是30毫秒
所以我在想我是不是做错了什么